Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
Developers Italia provides [a catalog of Free and Open Source](https://developers.italia.it/en/search)
software aimed to Public Administrations.

This **crawler** retrieves the `publiccode.yml` files from the
`publiccode-crawler` retrieves the `publiccode.yml` files from the
repositories of publishers found in the [Developers Italia API](https://github.com/italia/developers-italia-api).

## Setup and deployment processes

The crawler can either run manually on the target machine or it can be deployed
`publiccode-crawler` can either run manually on the target machine or it can be deployed
from a Docker container.

### Manually configure and build the crawler
### Manually configure and build

1. Rename `config.toml.example` to `config.toml` and set the variables

Expand All @@ -43,16 +43,23 @@ docker run -it italia/publiccode-crawler

## Commands

### `crawler crawl`
### `publiccode-crawler crawl`

Gets the list of publishers from `https://api.developers.italia.it/v1/publishers`
and starts to crawl their repositories.

### `crawler crawl publishers*.yml`
### `publiccode-crawler crawl publishers*.yml`

Gets the list of publishers in `publishers*.yml` and starts to crawl
their repositories.

### `publiccode-crawler crawl-software <software> <publisher>`

Crawls a single software entry.
It takes the software API URL (or id) and its publisher id as parameters.

Ex. `publiccode-crawler crawl-software https://api.developers.italia.it/v1/software/a2ea59b0-87cd-4419-b93f-00bed8a7b859 edb66b3d-3e36-4b69-aba9-b7c4661b3fdd`

### Other commands

* `crawler download-publishers` downloads organizations and repositories from
Expand Down
19 changes: 19 additions & 0 deletions apiclient/apiclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,25 @@ page:
return publishers, nil
}

// GetSoftware returns the software with the given id or any error encountered.
func (clt APIClient) GetSoftware(id string) (*Software, error) {
	var softwareResponse Software

	res, err := clt.retryableClient.Get(joinPath(clt.baseURL, "/software") + "/" + id)
	if err != nil {
		return nil, fmt.Errorf("can't GET /software/%s: %w", id, err)
	}

	defer res.Body.Close()

	// Without this check a non-2xx response (eg. 404 for an unknown id)
	// would be silently decoded into an empty Software struct and
	// returned as a success.
	if res.StatusCode < 200 || res.StatusCode > 299 {
		return nil, fmt.Errorf("can't GET /software/%s: HTTP status %d", id, res.StatusCode)
	}

	if err := json.NewDecoder(res.Body).Decode(&softwareResponse); err != nil {
		return nil, fmt.Errorf("can't parse GET /software/%s response: %w", id, err)
	}

	return &softwareResponse, nil
}

// GetSoftwareByURL returns the software matching the given repo URL and
// any error encountered.
// In case no software is found and no error occours, (nil, nil) is returned.
Expand Down
43 changes: 43 additions & 0 deletions cmd/crawl-software.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package cmd

import (
"github.com/italia/publiccode-crawler/v4/common"
"github.com/italia/publiccode-crawler/v4/crawler"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra"
"github.com/spf13/viper"
)

// init wires the crawl-software command into the CLI: it declares the
// shared --dry-run/-n flag and registers the command on the root command.
func init() {
	// dryRun is a package-level flag variable shared with the other commands.
	crawlSoftwareCmd.Flags().BoolVarP(&dryRun, "dry-run", "n", false, "perform a dry run with no changes made")

	rootCmd.AddCommand(crawlSoftwareCmd)
}

// crawlSoftwareCmd crawls a single software, identified either by its API id
// or by its full API URL, on behalf of the given publisher id.
var crawlSoftwareCmd = &cobra.Command{
	Use:   "crawl-software [SOFTWARE_ID | SOFTWARE_URL] PUBLISHER_ID",
	Short: "Crawl a single software by its id.",
	Long: `Crawl a single software by its id.

Crawl a single software given its API id and its publisher.`,
	Example: "# Crawl just the specified software\n" +
		"publiccode-crawler crawl-software" +
		" https://api.developers.italia.it/v1/software/af6056fc-b2b2-4d31-9961-c9bd94e32bd4 PCM",

	Args: cobra.ExactArgs(2),
	Run: func(_ *cobra.Command, args []string) {
		// The GitHub scanner requires an authenticated API client.
		// (Fixed: the original message had a stray trailing apostrophe.)
		if token := viper.GetString("GITHUB_TOKEN"); token == "" {
			log.Fatal("Please set GITHUB_TOKEN, it's needed to use the GitHub API")
		}

		c := crawler.NewCrawler(dryRun)

		// Only the publisher id is known here; no other publisher
		// metadata is needed to crawl a single software.
		publisher := common.Publisher{
			ID: args[1],
		}

		if err := c.CrawlSoftwareByID(args[0], publisher); err != nil {
			log.Fatal(err)
		}
	},
}
11 changes: 10 additions & 1 deletion cmd/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,21 @@ func init() {
}

var crawlCmd = &cobra.Command{
Use: "crawl publishers.yml [directory/*.yml ...]",
Use: "crawl [publishers.yml] [directory/*.yml ...]",
Short: "Crawl publiccode.yml files in publishers' repos.",
Long: `Crawl publiccode.yml files in publishers' repos.

When run with no arguments, the publishers are fetched from the API,
otherwise the passed YAML files are used.`,
Example: `
# Crawl publishers fetched from the API
crawl

# Crawl using a specific publishers.yml file
crawl publishers.yml

# Crawl all YAML files in a specific directory
crawl directory/*.yml`,

Args: cobra.MinimumNArgs(0),
Run: func(_ *cobra.Command, args []string) {
Expand Down
78 changes: 0 additions & 78 deletions cmd/one.go

This file was deleted.

2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
var (
dryRun bool
rootCmd = &cobra.Command{
Use: "crawler",
Use: "publiccode-crawler",
Short: "A crawler for publiccode.yml files.",
Long: `A fast and robust publiccode.yml file crawler.
Complete documentation is available at https://github.com/italia/publiccode-crawler`,
Expand Down
41 changes: 31 additions & 10 deletions crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"net/http"
"net/url"
"os"
"path"
"regexp"
"runtime"
"strings"
Expand Down Expand Up @@ -83,18 +84,38 @@ func NewCrawler(dryRun bool) *Crawler {
return &c
}

// CrawlRepo crawls a single repository (only used by the 'one' command).
func (c *Crawler) CrawlRepo(repoURL url.URL, publisher common.Publisher) error {
log.Infof("Processing repository: %s", repoURL.String())
// CrawlSoftwareByAPIURL crawls a single software.
func (c *Crawler) CrawlSoftwareByID(software string, publisher common.Publisher) error {
var id string

softwareURL, err := url.Parse(software)
if err != nil {
id = software
} else {
id = path.Base(softwareURL.Path)
}

s, err := c.apiClient.GetSoftware(id)
if err != nil {
return err
}

s.URL = strings.TrimSuffix(s.URL, ".git")

repoURL, err := url.Parse(s.URL)
if err != nil {
return err
}

log.Infof("Processing repository: %s", softwareURL.String())

var err error
switch {
case vcsurl.IsGitHub(&repoURL):
err = c.gitHubScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsBitBucket(&repoURL):
err = c.bitBucketScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsGitLab(&repoURL):
err = c.gitLabScanner.ScanRepo(repoURL, publisher, c.repositories)
case vcsurl.IsGitHub(repoURL):
err = c.gitHubScanner.ScanRepo(*repoURL, publisher, c.repositories)
case vcsurl.IsBitBucket(repoURL):
err = c.bitBucketScanner.ScanRepo(*repoURL, publisher, c.repositories)
case vcsurl.IsGitLab(repoURL):
err = c.gitLabScanner.ScanRepo(*repoURL, publisher, c.repositories)
default:
err = fmt.Errorf(
"publisher %s: unsupported code hosting platform for %s",
Expand Down