Skip to content

Commit 06cb0b4

Browse files
committed
Merge branch 'master' of github.com:PuerkitoBio/gocrawl
2 parents fa35d57 + dada5e9 commit 06cb0b4

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

worker.go

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import (
1313
"path"
1414

1515
"github.com/PuerkitoBio/goquery"
16+
"github.com/andybalholm/cascadia"
1617
"github.com/temoto/robotstxt"
1718
"golang.org/x/net/html"
1819
)
@@ -381,10 +382,15 @@ func handleBaseTag(root *url.URL, baseHref string, aHref string) string {
381382
return resolvedURL.String()
382383
}
383384

385+
// Selector matchers used by processLinks. They are package-level variables, so
// each selector string is compiled once when the package initializes rather
// than on every call.
var (
386+
aHrefMatcher = cascadia.MustCompile("a[href]") // selects <a> elements that have an href attribute
387+
baseHrefMatcher = cascadia.MustCompile("base[href]") // selects <base> elements that have an href attribute
388+
)
389+
384390
// Scrape the document's content to gather all links
385391
func (w *worker) processLinks(doc *goquery.Document) (result []*url.URL) {
386-
baseURL, _ := doc.Find("base[href]").Attr("href")
387-
urls := doc.Find("a[href]").Map(func(_ int, s *goquery.Selection) string {
392+
baseURL, _ := doc.FindMatcher(baseHrefMatcher).Attr("href")
393+
urls := doc.FindMatcher(aHrefMatcher).Map(func(_ int, s *goquery.Selection) string {
388394
val, _ := s.Attr("href")
389395
if baseURL != "" {
390396
val = handleBaseTag(doc.Url, baseURL, val)

0 commit comments

Comments
 (0)