Skip to content

Commit

Permalink
add chromedp proxy and update spider/core.go
Browse files Browse the repository at this point in the history
  • Loading branch information
Miachol committed Nov 19, 2019
1 parent 2d937e4 commit fe22aa4
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 65 deletions.
16 changes: 11 additions & 5 deletions chrome/doi.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,20 @@ import (
"github.com/openbiox/butils/log"
stringo "github.com/openbiox/butils/stringo"

"github.com/chromedp/chromedp"
cdp "github.com/chromedp/chromedp"
)

func DoiSupplURLs(url string, timeout time.Duration) []string {
func DoiSupplURLs(url string, timeout time.Duration, proxy string) []string {
// create context
ctx, _ := cdp.NewContext(context.Background())
ctx, _ = context.WithTimeout(ctx, timeout)
//defer cancel()
o := append(cdp.DefaultExecAllocatorOptions[:],
//... any options here
chromedp.ProxyServer(proxy),
)
cx, cancel := chromedp.NewExecAllocator(context.Background(), o...)
ctx, cancel := cdp.NewContext(cx)
ctx, cancel = context.WithTimeout(ctx, timeout)
defer cancel()
var err error
var attbs []map[string]string
urls := []string{}
Expand Down Expand Up @@ -48,7 +54,7 @@ func visibleScienceDirect(host string, attbs *[]map[string]string) cdp.Tasks {
cdp.Navigate(host),
cdp.WaitVisible(`.show-toc-button`, cdp.ByQuery),
cdp.Click(`.show-toc-button`, cdp.ByQuery),
//cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery),
//cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery),
//cdp.Click(`a[href="#app2"]`, cdp.ByQuery),
//cdp.WaitVisible(`#app2`, cdp.ByQuery),
cdp.AttributesAll(".Appendices a.icon-link[href]", attbs, cdp.ByQueryAll),
Expand Down
6 changes: 3 additions & 3 deletions doc/doi.list.journal.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte
80 ANNALS OF INTERNAL MEDICINE bug 10.7326/M19-3111
81 GASTROENTEROLOGY done yes 10.1186/s12876-019-1087-9
82 INTENSIVE CARE MEDICINE done 10.1007/s00134-019-05829-0
83 JOURNAL OF HEPATOLOGY done no 10.1016/j.jhep.2019.10.023
83 JOURNAL OF HEPATOLOGY done yes 10.1016/j.jhep.2019.10.023
84 Annual Review of Plant Biology done 10.1146/annurev-arplant-050718-100016
85 PHARMACOLOGICAL REVIEWS done 10.1124/jpet.119.260968
86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done no 10.1016/j.jacc.2019.09.001
86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done yes 10.1016/j.jacc.2019.09.067
87 Nature Reviews Rheumatology done yes 10.1038/s41584-019-0335-2
88 Lancet Psychiatry todo 10.1016/S2215-0366(19)30394-3
90 Chem todo no 10.1016/j.chempr.2019.06.020
Expand Down Expand Up @@ -2781,4 +2781,4 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte
2903 PERMAFROST AND PERIGLACIAL PROCESSES NA
2903 SURGICAL ONCOLOGY-OXFORD NA
2903 Topics in Organometallic Chemistry NA
2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA
2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA
6 changes: 3 additions & 3 deletions doc/doi.md
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ Supported DOI by bget (Journal)
<td style="text-align: right;">83</td>
<td style="text-align: left;">JOURNAL OF HEPATOLOGY</td>
<td style="text-align: left;">done</td>
<td style="text-align: left;">no</td>
<td style="text-align: left;">yes</td>
<td style="text-align: left;">10.1016/j.jhep.2019.10.023</td>
<td style="text-align: left;">NA</td>
<td style="text-align: left;">NA</td>
Expand All @@ -787,8 +787,8 @@ Supported DOI by bget (Journal)
<td style="text-align: right;">86</td>
<td style="text-align: left;">JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY</td>
<td style="text-align: left;">done</td>
<td style="text-align: left;">no</td>
<td style="text-align: left;">10.1016/j.jacc.2019.09.001</td>
<td style="text-align: left;">yes</td>
<td style="text-align: left;">10.1016/j.jacc.2019.09.067</td>
<td style="text-align: left;">NA</td>
<td style="text-align: left;">NA</td>
</tr>
Expand Down
19 changes: 10 additions & 9 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,24 @@ go 1.12
require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/antchfx/htmlquery v1.1.0 // indirect
github.com/antchfx/xmlquery v1.1.0 // indirect
github.com/antchfx/xpath v1.1.0 // indirect
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 // indirect
github.com/chromedp/chromedp v0.4.0
github.com/antchfx/htmlquery v1.2.0 // indirect
github.com/antchfx/xmlquery v1.2.0 // indirect
github.com/antchfx/xpath v1.1.1 // indirect
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 // indirect
github.com/chromedp/chromedp v0.5.1
github.com/gocolly/colly v1.2.0
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 // indirect
github.com/google/go-github/v27 v27.0.6
github.com/mattn/go-runewidth v0.0.6 // indirect
github.com/olekukonko/tablewriter v0.0.2
github.com/openbiox/butils v0.0.0-20191109071326-0a3e37c394b2
github.com/pierrec/lz4 v2.3.0+incompatible // indirect
github.com/spf13/cobra v0.0.5
github.com/spf13/pflag v1.0.5 // indirect
github.com/vbauerster/mpb/v4 v4.11.0
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a // indirect
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 // indirect
github.com/vbauerster/mpb/v4 v4.11.1
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f // indirect
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae // indirect
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd // indirect
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2 // indirect
google.golang.org/appengine v1.6.5 // indirect
)
57 changes: 28 additions & 29 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,24 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA=
github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/htmlquery v1.1.0 h1:KMS88sLl5KP9GUVU2MQIDcQXNQ0M5MGlkC9WlYgAQqY=
github.com/antchfx/htmlquery v1.1.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/htmlquery v1.2.0 h1:oKShnsGlnOHX6t4uj5OHgLKkABcJoqnXpqnscoi9Lpw=
github.com/antchfx/htmlquery v1.2.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/xmlquery v1.0.0 h1:YuEPqexGG2opZKNc9JU3Zw6zFXwC47wNcy6/F8oKsrM=
github.com/antchfx/xmlquery v1.0.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xmlquery v1.1.0 h1:vj0kZ1y3Q6my4AV+a9xbWrMYzubw+84zuiKgvfV8vb8=
github.com/antchfx/xmlquery v1.1.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xmlquery v1.2.0 h1:1nrzsSN5mFrlqFWSK9byiq/qXKE7O2vivYzhv1Ksnfw=
github.com/antchfx/xmlquery v1.2.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk=
github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.0 h1:mJTvYpiHvxNQRD4Lbfin/FodHVCHh2a5KrOFr4ZxMOI=
github.com/antchfx/xpath v1.1.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.1 h1:mqGYmd5pioPu06+REIf8j3y6O3S1UpVNVoCameZHotg=
github.com/antchfx/xpath v1.1.1/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antonfisher/nested-logrus-formatter v1.0.2 h1:t65eOqj0fWbOkZR2+OgmxPa0KYIwbPhKdYmseaCMIyI=
github.com/antonfisher/nested-logrus-formatter v1.0.2/go.mod h1:6WTfyWFkBc9+zyBaKIqRrg/KwMqBbodBjgbHjDz7zjA=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d h1:00kLGv5nKzpFchNhGDXDRbKtYx/WoT983Ka2t8/pzRE=
github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0=
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 h1:CgIuU+BmhL7FOXl4nTH3L1pwPbAz1VlzexJNEfrS7Kw=
github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0=
github.com/chromedp/chromedp v0.4.0 h1:0AJC5ejETuh/6n7Tcsw4u4G0eKZkI9aVRwckWaImLUE=
github.com/chromedp/chromedp v0.4.0/go.mod h1:DC3QUn4mJ24dwjcaGQLoZrhm4X/uPHZ6spDbS2uFhm4=
github.com/chromedp/cdproto v0.0.0-20191009033829-c22f49c9ff0a/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 h1:QD3KxSJ59L2lxG6MXBjNHxiQO2RmxTQ3XcK+wO44WOg=
github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
github.com/chromedp/chromedp v0.5.1 h1:PAqhoCWCHzRphYnmmxLSiYk7EEwDplCm4woTCCaV2cQ=
github.com/chromedp/chromedp v0.5.1/go.mod h1:3NMfuKTrKNr8PWEvHzdzZ57PK4jm9zW1C5nKiaWdxcM=
github.com/coreos/etcd v3.3.10+incompatible h1:jFneRYjIvLMLhDLCzuTuU4rSJUjRplcJQ7pD7MnhC04=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
Expand All @@ -55,6 +54,8 @@ github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 h1:uHTyIjqVhYRhLbJ8nIiOJHkEZZ+5YoOsAbD3sk82NiE=
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
Expand All @@ -77,8 +78,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls=
github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
Expand All @@ -89,8 +90,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e h1:hB2xlXdHp/pmPZq0y3QnmWAArdw9PqbmotexnWx/FU8=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM=
github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mattn/go-runewidth v0.0.6 h1:V2iyH+aX9C5fsYCpK60U8BYIvmhqxuOL3JZcqc1NB7k=
Expand Down Expand Up @@ -145,8 +146,8 @@ github.com/ulikunitz/xz v0.5.6 h1:jGHAfXawEGZQ3blwU5wnWKQJvAraT7Ftq9EXjnXYgt8=
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
github.com/vbauerster/mpb/v4 v4.9.3 h1:fZv72LoQvz8Pz6OeqUSJr62kMCQDHyOtuY0nl93CcJM=
github.com/vbauerster/mpb/v4 v4.9.3/go.mod h1:xMKSr3w3dixpCH9v7svY4wF3mmhuyWYuYtkpy8T5FOk=
github.com/vbauerster/mpb/v4 v4.11.0 h1:QdSmlc4dUap9XugHWx84yi7ABstYHW1rC5slzDwfXnw=
github.com/vbauerster/mpb/v4 v4.11.0/go.mod h1:2d50DYyCBW+8eE9ZgdMCDEB+7S+ELz4YenPtQ+nKOts=
github.com/vbauerster/mpb/v4 v4.11.1 h1:ZOYQSVHgmeanXsbyC44aDg76tBGCS/54Rk8VkL8dJGA=
github.com/vbauerster/mpb/v4 v4.11.1/go.mod h1:vMLa1J/ZKC83G2lB/52XpqT+ZZtFG4aZOdKhmpRL1uM=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
Expand All @@ -156,10 +157,9 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472 h1:Gv7RPwsi3eZ2Fgewe3CBsuOebPwO27PoXzRpJPsvSSM=
golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a h1:R/qVym5WAxsZWQqZCwDY/8sdVKV1m1WgU4/S5IRQAzc=
golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f h1:kz4KIr+xcPUsI3VMoqWfPMvtnJ6MGfiVwsWSVzphMO4=
golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
Expand All @@ -171,8 +171,8 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0O
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 h1:k7pJ2yAPLPgbskkFdhRCsA77k2fySZ1zf2zCjvQCiIM=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 h1:bHNaocaoJxYBo5cw41UyTMLjYlb8wPY7+WFrnklbHOM=
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae h1:AzDIJnLFoW3GaQvpbMRKk+SptYRYtnhYdyuX+S/dTbc=
golang.org/x/net v0.0.0-20191118183410-d06c31c94cae/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
Expand All @@ -185,13 +185,12 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a h1:aYOabOQFp6Vj6W1F80affTUvO9UxmJRx8K0gsfABByQ=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFWLFaHyYZWD4tOT+cjn0=
golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191025090151-53bf42e6b339/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd h1:3x5uuvBgE6oaXJjCOvpCC1IpgJogqQ+PqGGU3ZxAgII=
golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191113165036-4c7a9d0fe056/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2 h1:wAW1U21MfVN0sUipAD8952TBjGXMRHFKQugDlQ9RwwE=
golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
Expand Down
35 changes: 19 additions & 16 deletions spider/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,28 @@ func NatureComSpider(opt *DoiSpiderOpt) (urls []string) {
if opt.FullText {
c.OnHTML("a.c-pdf-download__link[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
urls = append(urls, "https://nature.com"+link)
urls = append(urls, linkFilter(link, opt.URL))
})
}
if opt.Supplementary {
c.OnHTML("a.print-link[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")
if !strings.HasPrefix(link, "http") {
urls = append(urls, "https://nature.com"+link)
} else {
u, _ := url.Parse(link)
linkTmp := strings.Split(u.Path, "/")
if len(linkTmp) < 4 {
return
if !strings.Contains(link, "/figures/") {
if !strings.HasPrefix(link, "http") {
urls = append(urls, linkFilter(link, opt.URL))
} else {
u, _ := url.Parse(link)
linkTmp := strings.Split(u.Path, "/")
if len(linkTmp) < 4 {
return
}
linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A")
newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F"))
newLink = append(newLink, linkTmp[4:len(linkTmp)]...)
link = strings.Join(newLink, "/")
link = u.Scheme + "://" + u.Host + link
urls = append(urls, link)
}
linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A")
newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F"))
newLink = append(newLink, linkTmp[4:len(linkTmp)]...)
link = strings.Join(newLink, "/")
link = u.Scheme + "://" + u.Host + link
urls = append(urls, link)
}
})
}
Expand Down Expand Up @@ -108,7 +110,8 @@ func CellComSpider(opt *DoiSpiderOpt) []string {
c := colly.NewCollector(
colly.AllowedDomains("doi.org", "www.cell.com", "cell.com", "linkinghub.elsevier.com", "secure.jbs.elsevierhealth.com",
"id.elsevier.com", "www.cancercell.org", "www.sciencedirect.com",
"pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org"),
"pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org",
"www.clinicalkey.com"),
colly.MaxDepth(1),
)
bspider.SetSpiderProxy(c, opt.Proxy, opt.Timeout)
Expand Down Expand Up @@ -175,7 +178,7 @@ func CellComSpider(opt *DoiSpiderOpt) []string {
c.Visit(fmt.Sprintf("https://doi.org/%s", opt.Doi))
if opt.Supplementary {
urls = append(urls, chrome.DoiSupplURLs(fmt.Sprintf("https://doi.org/%s", opt.Doi),
time.Duration(opt.Timeout)*time.Second)...)
time.Duration(opt.Timeout)*time.Second, opt.Proxy)...)
c.OnHTML("#appsec1 a[target=new]", func(e *colly.HTMLElement) {
link := e.Attr("href")
urls = append(urls, link)
Expand Down
4 changes: 4 additions & 0 deletions spider/universal.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ func UniVersalDoiSpider(opt *DoiSpiderOpt) (urls []string) {
link = stringo.StrReplaceAll(link, "pdf[?].*", "pdf")
urls = append(urls, linkFilter(link, opt.URL))
})
c.OnHTML("a.article-pdfLink[data-article-url]", func(e *colly.HTMLElement) {
link := e.Attr("data-article-url")
urls = append(urls, linkFilter(link, opt.URL))
})
staticUrl := static2pdf(opt)
if staticUrl != "" {
urls = append(urls, linkFilter(staticUrl, opt.URL))
Expand Down

0 comments on commit fe22aa4

Please sign in to comment.