diff --git a/options.go b/options.go index 21abc8c..b34b20e 100644 --- a/options.go +++ b/options.go @@ -75,6 +75,9 @@ type Options struct { // Extender is the implementation of hooks to use by the crawler. Extender Extender + + // ParseImageTags specifies whether to parse img tags and include their src URLs in the crawling + ParseImageTags bool } // NewOptions creates a new set of Options with default values @@ -96,5 +99,6 @@ func NewOptions(ext Extender) *Options { DefaultNormalizationFlags, LogError, ext, + false, } } diff --git a/worker.go b/worker.go index 1089df2..e60a53d 100644 --- a/worker.go +++ b/worker.go @@ -385,6 +385,7 @@ func handleBaseTag(root *url.URL, baseHref string, aHref string) string { var ( aHrefMatcher = cascadia.MustCompile("a[href]") baseHrefMatcher = cascadia.MustCompile("base[href]") + imgSrcMatcher = cascadia.MustCompile("img[src]") ) // Scrape the document's content to gather all links @@ -397,6 +398,16 @@ func (w *worker) processLinks(doc *goquery.Document) (result []*url.URL) { } return val }) + if w.opts.ParseImageTags { + imgURLs := doc.FindMatcher(imgSrcMatcher).Map(func(_ int, s *goquery.Selection) string { + val, _ := s.Attr("src") + if baseURL != "" { + val = handleBaseTag(doc.Url, baseURL, val) + } + return val + }) + urls = append(urls, imgURLs...) + } for _, s := range urls { // If href starts with "#", then it points to this same exact URL, ignore (will fail to parse anyway) if len(s) > 0 && !strings.HasPrefix(s, "#") {