Skip to content

Commit

Permalink
Merge branch 'release/v0.4.0-alpha'
Browse files Browse the repository at this point in the history
  • Loading branch information
andreaskoch committed Nov 5, 2020
2 parents 2c56094 + 183bf6b commit a3e9041
Show file tree
Hide file tree
Showing 155 changed files with 818 additions and 30,565 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# GitHub Actions workflow: build and test the Go module on every push.
name: Go

on: [push]

jobs:

  build:
    name: Build
    runs-on: ubuntu-latest
    steps:

      # Install the latest Go 1.x toolchain.
      - name: Set up Go 1.x
        uses: actions/setup-go@v2
        with:
          go-version: ^1

      - name: Check out code into the Go module directory
        uses: actions/checkout@v2

      # Tidy the module graph first so the build sees a consistent go.mod/go.sum.
      - name: Build
        run: |
          go mod tidy
          go build -v .

      - name: Test
        run: go test -v .
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
.DS_Store
/.idea
/vendor
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased]
## [v0.4.0-alpha] - 2020-11-05

Logging

### Added
- Add "Save downloaded data to disk" to the roadmap (feature request #1)
- Log results to a log file
- Add Github actions

### Changed
- Add support for non-sitemap URLs
- Capture the parent URL
- Switch from go 1.14 to 1.15

## [v0.2.0-alpha] - 2017-02-07

Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,30 @@ You can specify a customized user agent using the `--user-agent` argument:
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --user-agent "gargantua bot / iPhone"
```

### Log all requests

You can specify a log file with the `--log` argument:

```bash
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --log "gargantua.log"
```

```
Date and time #worker Status Code Bytes Response Time URL Parent URL
2020/11/05 09:23:14 #001: 200 4403 148.759000ms https://www.sitemaps.org https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #002: 200 4403 290.536000ms http://www.sitemaps.org/ https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #003: 200 45077 283.243000ms https://www.sitemaps.org/protocol.html https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #004: 404 1245 155.376000ms https://www.sitemaps.org/protocol.htm https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #005: 200 4403 155.577000ms https://www.sitemaps.org/index.html https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #001: 200 2591 286.451000ms http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #003: 200 10839 143.738000ms https://www.sitemaps.org/terms.html https://www.sitemaps.org/ko/faq.html
2020/11/05 09:23:14 #005: 200 15681 141.580000ms https://www.sitemaps.org/faq.html https://www.sitemaps.org/ko/protocol.html
2020/11/05 09:23:14 #002: 404 1245 286.175000ms http://www.sitemaps.org/protocol.htm https://www.sitemaps.org/ko/faq.html
```

[gargantua.log](files/gargantua.log)


## Download

You can download binaries for Linux, macOS and Windows from [github.com » andreaskoch » gargantua » releases](https://github.com/andreaskoch/gargantua/releases):
Expand Down
23 changes: 21 additions & 2 deletions crawler.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
package main

import (
"github.com/pkg/errors"
"log"
"net/url"
"os"
"time"
)

type CrawlOptions struct {
NumberOfConcurrentRequests int
Timeout time.Duration
UserAgent string
LogFile string
}

func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
Expand All @@ -20,7 +24,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
}

// the URL queue
urls := make(chan url.URL, len(urlsFromXMLSitemap))
urls := make(chan crawlerUrl, len(urlsFromXMLSitemap))

// fill the URL queue with the URLs from the XML sitemap
for _, xmlSitemapURLEntry := range urlsFromXMLSitemap {
Expand All @@ -37,7 +41,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {

allURLsHaveBeenVisited := make(chan bool)
go func() {
var visitedURLs = make(map[string]url.URL)
var visitedURLs = make(map[string]crawlerUrl)
for {
select {
case <-stop:
Expand Down Expand Up @@ -76,6 +80,17 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
}
}()

var logger *log.Logger
if options.LogFile != "" {
file, err := os.OpenFile(options.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
if err != nil {
return errors.Wrapf(err, "failed to open log file %q for writing", options.LogFile)
}

defer file.Close()
logger = log.New(file, "", log.Ldate|log.Ltime)
}

// update the statistics with the results
allStatisticsHaveBeenUpdated := make(chan bool)
go func() {
Expand All @@ -89,6 +104,10 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
receivedUrl := result.URL()
debugf("Received results for URL %q", receivedUrl.String())
updateStatistics(result)

if logger != nil {
logResult(logger, result)
}
}
}
}()
Expand Down
Loading

0 comments on commit a3e9041

Please sign in to comment.