From fe22aa4f4b17def5e72b9df65c6d48fc7c07e797 Mon Sep 17 00:00:00 2001 From: Jianfeng Li Date: Tue, 19 Nov 2019 13:53:00 +0800 Subject: [PATCH] add chromedp proxy and update spider/core.go --- chrome/doi.go | 16 +++++++---- doc/doi.list.journal.txt | 6 ++--- doc/doi.md | 6 ++--- go.mod | 19 +++++++------- go.sum | 57 ++++++++++++++++++++-------------------- spider/core.go | 35 +++++++++++++----------- spider/universal.go | 4 +++ 7 files changed, 78 insertions(+), 65 deletions(-) diff --git a/chrome/doi.go b/chrome/doi.go index 42cd922..0f1336f 100644 --- a/chrome/doi.go +++ b/chrome/doi.go @@ -9,14 +9,20 @@ import ( "github.com/openbiox/butils/log" stringo "github.com/openbiox/butils/stringo" + "github.com/chromedp/chromedp" cdp "github.com/chromedp/chromedp" ) -func DoiSupplURLs(url string, timeout time.Duration) []string { +func DoiSupplURLs(url string, timeout time.Duration, proxy string) []string { // create context - ctx, _ := cdp.NewContext(context.Background()) - ctx, _ = context.WithTimeout(ctx, timeout) - //defer cancel() + o := append(cdp.DefaultExecAllocatorOptions[:], + //... any options here + chromedp.ProxyServer(proxy), + ) + cx, cancel := chromedp.NewExecAllocator(context.Background(), o...) 
+ ctx, cancel := cdp.NewContext(cx) + ctx, cancel = context.WithTimeout(ctx, timeout) + defer cancel() var err error var attbs []map[string]string urls := []string{} @@ -48,7 +54,7 @@ func visibleScienceDirect(host string, attbs *[]map[string]string) cdp.Tasks { cdp.Navigate(host), cdp.WaitVisible(`.show-toc-button`, cdp.ByQuery), cdp.Click(`.show-toc-button`, cdp.ByQuery), - //cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery), + //cdp.WaitVisible(`a[href="#app2"]`, cdp.ByQuery), //cdp.Click(`a[href="#app2"]`, cdp.ByQuery), //cdp.WaitVisible(`#app2`, cdp.ByQuery), cdp.AttributesAll(".Appendices a.icon-link[href]", attbs, cdp.ByQueryAll), diff --git a/doc/doi.list.journal.txt b/doc/doi.list.journal.txt index ecbee0a..6e23564 100644 --- a/doc/doi.list.journal.txt +++ b/doc/doi.list.journal.txt @@ -78,10 +78,10 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte 80 ANNALS OF INTERNAL MEDICINE bug 10.7326/M19-3111 81 GASTROENTEROLOGY done yes 10.1186/s12876-019-1087-9 82 INTENSIVE CARE MEDICINE done 10.1007/s00134-019-05829-0 -83 JOURNAL OF HEPATOLOGY done no 10.1016/j.jhep.2019.10.023 +83 JOURNAL OF HEPATOLOGY done yes 10.1016/j.jhep.2019.10.023 84 Annual Review of Plant Biology done 10.1146/annurev-arplant-050718-100016 85 PHARMACOLOGICAL REVIEWS done 10.1124/jpet.119.260968 -86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done no 10.1016/j.jacc.2019.09.001 +86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done yes 10.1016/j.jacc.2019.09.067 87 Nature Reviews Rheumatology done yes 10.1038/s41584-019-0335-2 88 Lancet Psychiatry todo 10.1016/S2215-0366(19)30394-3 90 Chem todo no 10.1016/j.chempr.2019.06.020 @@ -2781,4 +2781,4 @@ Rank Journal.Name Fulltext-1pass-check Supplemental-1pass-check DOI-1pass Fullte 2903 PERMAFROST AND PERIGLACIAL PROCESSES NA 2903 SURGICAL ONCOLOGY-OXFORD NA 2903 Topics in Organometallic Chemistry NA -2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA +2903 TRANSFUSION MEDICINE AND HEMOTHERAPY NA \ No newline at end 
of file diff --git a/doc/doi.md b/doc/doi.md index 731c4a7..270b2ca 100644 --- a/doc/doi.md +++ b/doc/doi.md @@ -760,7 +760,7 @@ Supported DOI by bget (Journal) 83 JOURNAL OF HEPATOLOGY done -no +yes 10.1016/j.jhep.2019.10.023 NA NA @@ -787,8 +787,8 @@ Supported DOI by bget (Journal) 86 JOURNAL OF THE AMERICAN COLLEGE OF CARDIOLOGY done -no -10.1016/j.jacc.2019.09.001 +yes +10.1016/j.jacc.2019.09.067 NA NA diff --git a/go.mod b/go.mod index 5e25183..217fb7a 100644 --- a/go.mod +++ b/go.mod @@ -5,12 +5,13 @@ go 1.12 require ( github.com/PuerkitoBio/goquery v1.5.0 github.com/andybalholm/cascadia v1.1.0 // indirect - github.com/antchfx/htmlquery v1.1.0 // indirect - github.com/antchfx/xmlquery v1.1.0 // indirect - github.com/antchfx/xpath v1.1.0 // indirect - github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 // indirect - github.com/chromedp/chromedp v0.4.0 + github.com/antchfx/htmlquery v1.2.0 // indirect + github.com/antchfx/xmlquery v1.2.0 // indirect + github.com/antchfx/xpath v1.1.1 // indirect + github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 // indirect + github.com/chromedp/chromedp v0.5.1 github.com/gocolly/colly v1.2.0 + github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 // indirect github.com/google/go-github/v27 v27.0.6 github.com/mattn/go-runewidth v0.0.6 // indirect github.com/olekukonko/tablewriter v0.0.2 @@ -18,10 +19,10 @@ require ( github.com/pierrec/lz4 v2.3.0+incompatible // indirect github.com/spf13/cobra v0.0.5 github.com/spf13/pflag v1.0.5 // indirect - github.com/vbauerster/mpb/v4 v4.11.0 - golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a // indirect - golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 // indirect + github.com/vbauerster/mpb/v4 v4.11.1 + golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f // indirect + golang.org/x/net v0.0.0-20191118183410-d06c31c94cae // indirect golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 - golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd // 
indirect + golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2 // indirect google.golang.org/appengine v1.6.5 // indirect ) diff --git a/go.sum b/go.sum index 390c228..eddcc40 100644 --- a/go.sum +++ b/go.sum @@ -11,25 +11,24 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/antchfx/htmlquery v1.0.0 h1:O5IXz8fZF3B3MW+B33MZWbTHBlYmcfw0BAxgErHuaMA= github.com/antchfx/htmlquery v1.0.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8= -github.com/antchfx/htmlquery v1.1.0 h1:KMS88sLl5KP9GUVU2MQIDcQXNQ0M5MGlkC9WlYgAQqY= -github.com/antchfx/htmlquery v1.1.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8= +github.com/antchfx/htmlquery v1.2.0 h1:oKShnsGlnOHX6t4uj5OHgLKkABcJoqnXpqnscoi9Lpw= +github.com/antchfx/htmlquery v1.2.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8= github.com/antchfx/xmlquery v1.0.0 h1:YuEPqexGG2opZKNc9JU3Zw6zFXwC47wNcy6/F8oKsrM= github.com/antchfx/xmlquery v1.0.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk= -github.com/antchfx/xmlquery v1.1.0 h1:vj0kZ1y3Q6my4AV+a9xbWrMYzubw+84zuiKgvfV8vb8= -github.com/antchfx/xmlquery v1.1.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk= +github.com/antchfx/xmlquery v1.2.0 h1:1nrzsSN5mFrlqFWSK9byiq/qXKE7O2vivYzhv1Ksnfw= +github.com/antchfx/xmlquery v1.2.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk= github.com/antchfx/xpath v1.0.0 h1:Q5gFgh2O40VTSwMOVbFE7nFNRBu3tS21Tn0KAWeEjtk= github.com/antchfx/xpath v1.0.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= -github.com/antchfx/xpath v1.1.0 h1:mJTvYpiHvxNQRD4Lbfin/FodHVCHh2a5KrOFr4ZxMOI= -github.com/antchfx/xpath v1.1.0/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.1 h1:mqGYmd5pioPu06+REIf8j3y6O3S1UpVNVoCameZHotg= +github.com/antchfx/xpath v1.1.1/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= 
github.com/antonfisher/nested-logrus-formatter v1.0.2 h1:t65eOqj0fWbOkZR2+OgmxPa0KYIwbPhKdYmseaCMIyI= github.com/antonfisher/nested-logrus-formatter v1.0.2/go.mod h1:6WTfyWFkBc9+zyBaKIqRrg/KwMqBbodBjgbHjDz7zjA= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= -github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d h1:00kLGv5nKzpFchNhGDXDRbKtYx/WoT983Ka2t8/pzRE= -github.com/chromedp/cdproto v0.0.0-20190812224334-39ef923dcb8d/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0= -github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90 h1:CgIuU+BmhL7FOXl4nTH3L1pwPbAz1VlzexJNEfrS7Kw= -github.com/chromedp/cdproto v0.0.0-20190827000638-b5ac1e37ce90/go.mod h1:0YChpVzuLJC5CPr+x3xkHN6Z8KOSXjNbL7qV8Wc4GW0= -github.com/chromedp/chromedp v0.4.0 h1:0AJC5ejETuh/6n7Tcsw4u4G0eKZkI9aVRwckWaImLUE= -github.com/chromedp/chromedp v0.4.0/go.mod h1:DC3QUn4mJ24dwjcaGQLoZrhm4X/uPHZ6spDbS2uFhm4= +github.com/chromedp/cdproto v0.0.0-20191009033829-c22f49c9ff0a/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g= +github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 h1:QD3KxSJ59L2lxG6MXBjNHxiQO2RmxTQ3XcK+wO44WOg= +github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g= +github.com/chromedp/chromedp v0.5.1 h1:PAqhoCWCHzRphYnmmxLSiYk7EEwDplCm4woTCCaV2cQ= +github.com/chromedp/chromedp v0.5.1/go.mod h1:3NMfuKTrKNr8PWEvHzdzZ57PK4jm9zW1C5nKiaWdxcM= github.com/coreos/etcd v3.3.10+incompatible h1:jFneRYjIvLMLhDLCzuTuU4rSJUjRplcJQ7pD7MnhC04= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= @@ -55,6 +54,8 @@ github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= github.com/gocolly/colly v1.2.0 
h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 h1:uHTyIjqVhYRhLbJ8nIiOJHkEZZ+5YoOsAbD3sk82NiE= +github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= @@ -77,8 +78,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= -github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307 h1:vl4eIlySbjertFaNwiMjXsGrFVK25aOWLq7n+3gh2ls= -github.com/knq/sysutil v0.0.0-20181215143952-f05b59f0f307/go.mod h1:BjPj+aVjl9FW/cCGiF3nGh5v+9Gd3VCgBQbod/GlMaQ= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs= +github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0= github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s= @@ -89,8 +90,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= -github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e h1:hB2xlXdHp/pmPZq0y3QnmWAArdw9PqbmotexnWx/FU8= -github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM= +github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y= github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-runewidth v0.0.6 h1:V2iyH+aX9C5fsYCpK60U8BYIvmhqxuOL3JZcqc1NB7k= @@ -145,8 +146,8 @@ github.com/ulikunitz/xz v0.5.6 h1:jGHAfXawEGZQ3blwU5wnWKQJvAraT7Ftq9EXjnXYgt8= github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= github.com/vbauerster/mpb/v4 v4.9.3 h1:fZv72LoQvz8Pz6OeqUSJr62kMCQDHyOtuY0nl93CcJM= github.com/vbauerster/mpb/v4 v4.9.3/go.mod h1:xMKSr3w3dixpCH9v7svY4wF3mmhuyWYuYtkpy8T5FOk= -github.com/vbauerster/mpb/v4 v4.11.0 h1:QdSmlc4dUap9XugHWx84yi7ABstYHW1rC5slzDwfXnw= -github.com/vbauerster/mpb/v4 v4.11.0/go.mod h1:2d50DYyCBW+8eE9ZgdMCDEB+7S+ELz4YenPtQ+nKOts= +github.com/vbauerster/mpb/v4 v4.11.1 h1:ZOYQSVHgmeanXsbyC44aDg76tBGCS/54Rk8VkL8dJGA= +github.com/vbauerster/mpb/v4 v4.11.1/go.mod h1:vMLa1J/ZKC83G2lB/52XpqT+ZZtFG4aZOdKhmpRL1uM= github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo= github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos= github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= @@ -156,10 +157,9 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto 
v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472 h1:Gv7RPwsi3eZ2Fgewe3CBsuOebPwO27PoXzRpJPsvSSM= golang.org/x/crypto v0.0.0-20190829043050-9756ffdc2472/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8= -golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a h1:R/qVym5WAxsZWQqZCwDY/8sdVKV1m1WgU4/S5IRQAzc= -golang.org/x/crypto v0.0.0-20191108234033-bd318be0434a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f h1:kz4KIr+xcPUsI3VMoqWfPMvtnJ6MGfiVwsWSVzphMO4= +golang.org/x/crypto v0.0.0-20191117063200-497ca9f6d64f/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -171,8 +171,8 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0O golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 h1:k7pJ2yAPLPgbskkFdhRCsA77k2fySZ1zf2zCjvQCiIM= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 h1:bHNaocaoJxYBo5cw41UyTMLjYlb8wPY7+WFrnklbHOM= -golang.org/x/net v0.0.0-20191109021931-daa7c04131f5/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191118183410-d06c31c94cae h1:AzDIJnLFoW3GaQvpbMRKk+SptYRYtnhYdyuX+S/dTbc= +golang.org/x/net v0.0.0-20191118183410-d06c31c94cae/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -185,13 +185,12 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a h1:aYOabOQFp6Vj6W1F80affTUvO9UxmJRx8K0gsfABByQ= -golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd h1:DBH9mDw0zluJT/R+nGuV3jWFWLFaHyYZWD4tOT+cjn0= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191025090151-53bf42e6b339/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd h1:3x5uuvBgE6oaXJjCOvpCC1IpgJogqQ+PqGGU3ZxAgII= -golang.org/x/sys v0.0.0-20191105231009-c1f44814a5cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191113165036-4c7a9d0fe056/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys 
v0.0.0-20191119060738-e882bf8e40c2 h1:wAW1U21MfVN0sUipAD8952TBjGXMRHFKQugDlQ9RwwE= +golang.org/x/sys v0.0.0-20191119060738-e882bf8e40c2/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= diff --git a/spider/core.go b/spider/core.go index 5adcb45..a7187cd 100644 --- a/spider/core.go +++ b/spider/core.go @@ -28,26 +28,28 @@ func NatureComSpider(opt *DoiSpiderOpt) (urls []string) { if opt.FullText { c.OnHTML("a.c-pdf-download__link[href]", func(e *colly.HTMLElement) { link := e.Attr("href") - urls = append(urls, "https://nature.com"+link) + urls = append(urls, linkFilter(link, opt.URL)) }) } if opt.Supplementary { c.OnHTML("a.print-link[href]", func(e *colly.HTMLElement) { link := e.Attr("href") - if !strings.HasPrefix(link, "http") { - urls = append(urls, "https://nature.com"+link) - } else { - u, _ := url.Parse(link) - linkTmp := strings.Split(u.Path, "/") - if len(linkTmp) < 4 { - return + if !strings.Contains(link, "/figures/") { + if !strings.HasPrefix(link, "http") { + urls = append(urls, linkFilter(link, opt.URL)) + } else { + u, _ := url.Parse(link) + linkTmp := strings.Split(u.Path, "/") + if len(linkTmp) < 4 { + return + } + linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A") + newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F")) + newLink = append(newLink, linkTmp[4:len(linkTmp)]...) + link = strings.Join(newLink, "/") + link = u.Scheme + "://" + u.Host + link + urls = append(urls, link) } - linkTmp[2] = stringo.StrReplaceAll(linkTmp[2], "art:", "art%3A") - newLink := append(linkTmp[0:2], strings.Join(linkTmp[2:4], "%2F")) - newLink = append(newLink, linkTmp[4:len(linkTmp)]...) 
- link = strings.Join(newLink, "/") - link = u.Scheme + "://" + u.Host + link - urls = append(urls, link) } }) } @@ -108,7 +110,8 @@ func CellComSpider(opt *DoiSpiderOpt) []string { c := colly.NewCollector( colly.AllowedDomains("doi.org", "www.cell.com", "cell.com", "linkinghub.elsevier.com", "secure.jbs.elsevierhealth.com", "id.elsevier.com", "www.cancercell.org", "www.sciencedirect.com", - "pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org"), + "pdf.sciencedirectassets.com", "www.thelancet.com", "www.gastrojournal.org", + "www.clinicalkey.com"), colly.MaxDepth(1), ) bspider.SetSpiderProxy(c, opt.Proxy, opt.Timeout) @@ -175,7 +178,7 @@ func CellComSpider(opt *DoiSpiderOpt) []string { c.Visit(fmt.Sprintf("https://doi.org/%s", opt.Doi)) if opt.Supplementary { urls = append(urls, chrome.DoiSupplURLs(fmt.Sprintf("https://doi.org/%s", opt.Doi), - time.Duration(opt.Timeout)*time.Second)...) + time.Duration(opt.Timeout)*time.Second, opt.Proxy)...) c.OnHTML("#appsec1 a[target=new]", func(e *colly.HTMLElement) { link := e.Attr("href") urls = append(urls, link) diff --git a/spider/universal.go b/spider/universal.go index a81bd92..1da0ceb 100644 --- a/spider/universal.go +++ b/spider/universal.go @@ -35,6 +35,10 @@ func UniVersalDoiSpider(opt *DoiSpiderOpt) (urls []string) { link = stringo.StrReplaceAll(link, "pdf[?].*", "pdf") urls = append(urls, linkFilter(link, opt.URL)) }) + c.OnHTML("a.article-pdfLink[data-article-url]", func(e *colly.HTMLElement) { + link := e.Attr("data-article-url") + urls = append(urls, linkFilter(link, opt.URL)) + }) staticUrl := static2pdf(opt) if staticUrl != "" { urls = append(urls, linkFilter(staticUrl, opt.URL))