Skip to content

Commit 1bd90fb

Browse files
authored
feat!: add crawl-software command and remove one (#398)
Add a new `crawl-software` command that replaces the non-functional `one` command. Fix #122.
1 parent 4feb1ae commit 1bd90fb

File tree

7 files changed

+116
-95
lines changed

7 files changed

+116
-95
lines changed

README.md

+12-5
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@
99
Developers Italia provides [a catalog of Free and Open Source](https://developers.italia.it/en/search)
1010
software aimed to Public Administrations.
1111

12-
This **crawler** retrieves the `publiccode.yml` files from the
12+
`publiccode-crawler` retrieves the `publiccode.yml` files from the
1313
repositories of publishers found in the [Developers Italia API](https://github.com/italia/developers-italia-api).
1414

1515
## Setup and deployment processes
1616

17-
The crawler can either run manually on the target machine or it can be deployed
17+
`publiccode-crawler` can either run manually on the target machine or it can be deployed
1818
from a Docker container.
1919

20-
### Manually configure and build the crawler
20+
### Manually configure and build
2121

2222
1. Rename `config.toml.example` to `config.toml` and set the variables
2323

@@ -43,16 +43,23 @@ docker run -it italia/publiccode-crawler
4343

4444
## Commands
4545

46-
### `crawler crawl`
46+
### `publiccode-crawler crawl`
4747

4848
Gets the list of publishers from `https://api.developers.italia.it/v1/publishers`
4949
and starts to crawl their repositories.
5050

51-
### `crawler crawl publishers*.yml`
51+
### `publiccode-crawler crawl publishers*.yml`
5252

5353
Gets the list of publishers in `publishers*.yml` and starts to crawl
5454
their repositories.
5555

56+
### `publiccode-crawler crawl-software <software> <publisher>`
57+
58+
Crawl just the software specified as a parameter.
59+
It takes the software URL and its publisher id as parameters.
60+
61+
Ex. `publiccode-crawler crawl-software https://api.developers.italia.it/v1/software/a2ea59b0-87cd-4419-b93f-00bed8a7b859 edb66b3d-3e36-4b69-aba9-b7c4661b3fdd`
62+
5663
### Other commands
5764

5865
* `crawler download-publishers` downloads organizations and repositories from

apiclient/apiclient.go

+19
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,25 @@ page:
192192
return publishers, nil
193193
}
194194

195+
// GetSoftware returns the software with the given id or any error encountered.
196+
func (clt APIClient) GetSoftware(id string) (*Software, error) {
197+
var softwareResponse Software
198+
199+
res, err := clt.retryableClient.Get(joinPath(clt.baseURL, "/software") + "/" + id)
200+
if err != nil {
201+
return nil, fmt.Errorf("can't GET /software/%s: %w", id, err)
202+
}
203+
204+
defer res.Body.Close()
205+
206+
err = json.NewDecoder(res.Body).Decode(&softwareResponse)
207+
if err != nil {
208+
return nil, fmt.Errorf("can't parse GET /software/%s response: %w", id, err)
209+
}
210+
211+
return &softwareResponse, nil
212+
}
213+
195214
// GetSoftwareByURL returns the software matching the given repo URL and
196215
// any error encountered.
197216
// In case no software is found and no error occours, (nil, nil) is returned.

cmd/crawl-software.go

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package cmd
2+
3+
import (
4+
"github.com/italia/publiccode-crawler/v4/common"
5+
"github.com/italia/publiccode-crawler/v4/crawler"
6+
log "github.com/sirupsen/logrus"
7+
"github.com/spf13/cobra"
8+
"github.com/spf13/viper"
9+
)
10+
11+
func init() {
12+
crawlSoftwareCmd.Flags().BoolVarP(&dryRun, "dry-run", "n", false, "perform a dry run with no changes made")
13+
14+
rootCmd.AddCommand(crawlSoftwareCmd)
15+
}
16+
17+
var crawlSoftwareCmd = &cobra.Command{
18+
Use: "crawl-software [SOFTWARE_ID | SOFTWARE_URL] PUBLISHER_ID",
19+
Short: "Crawl a single software by its id.",
20+
Long: `Crawl a single software by its id.
21+
22+
Crawl a single software given its API id and its publisher.`,
23+
Example: "# Crawl just the specified software\n" +
24+
"publiccode-crawler crawl-software" +
25+
" https://api.developers.italia.it/v1/software/af6056fc-b2b2-4d31-9961-c9bd94e32bd4 PCM",
26+
27+
Args: cobra.ExactArgs(2),
28+
Run: func(_ *cobra.Command, args []string) {
29+
if token := viper.GetString("GITHUB_TOKEN"); token == "" {
30+
log.Fatal("Please set GITHUB_TOKEN, it's needed to use the GitHub API'")
31+
}
32+
33+
c := crawler.NewCrawler(dryRun)
34+
35+
publisher := common.Publisher{
36+
ID: args[1],
37+
}
38+
39+
if err := c.CrawlSoftwareByID(args[0], publisher); err != nil {
40+
log.Fatal(err)
41+
}
42+
},
43+
}

cmd/crawl.go

+10-1
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,21 @@ func init() {
1616
}
1717

1818
var crawlCmd = &cobra.Command{
19-
Use: "crawl publishers.yml [directory/*.yml ...]",
19+
Use: "crawl [publishers.yml] [directory/*.yml ...]",
2020
Short: "Crawl publiccode.yml files in publishers' repos.",
2121
Long: `Crawl publiccode.yml files in publishers' repos.
2222
2323
When run with no arguments, the publishers are fetched from the API,
2424
otherwise the passed YAML files are used.`,
25+
Example: `
26+
# Crawl publishers fetched from the API
27+
crawl
28+
29+
# Crawl using a specific publishers.yml file
30+
crawl publishers.yml
31+
32+
# Crawl all YAML files in a specific directory
33+
crawl directory/*.yml`,
2534

2635
Args: cobra.MinimumNArgs(0),
2736
Run: func(_ *cobra.Command, args []string) {

cmd/one.go

-78
This file was deleted.

cmd/root.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88
var (
99
dryRun bool
1010
rootCmd = &cobra.Command{
11-
Use: "crawler",
11+
Use: "publiccode-crawler",
1212
Short: "A crawler for publiccode.yml files.",
1313
Long: `A fast and robust publiccode.yml file crawler.
1414
Complete documentation is available at https://github.com/italia/publiccode-crawler`,

crawler/crawler.go

+31-10
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"net/http"
77
"net/url"
88
"os"
9+
"path"
910
"regexp"
1011
"runtime"
1112
"strings"
@@ -83,18 +84,38 @@ func NewCrawler(dryRun bool) *Crawler {
8384
return &c
8485
}
8586

86-
// CrawlRepo crawls a single repository (only used by the 'one' command).
87-
func (c *Crawler) CrawlRepo(repoURL url.URL, publisher common.Publisher) error {
88-
log.Infof("Processing repository: %s", repoURL.String())
87+
// CrawlSoftwareByAPIURL crawls a single software.
88+
func (c *Crawler) CrawlSoftwareByID(software string, publisher common.Publisher) error {
89+
var id string
90+
91+
softwareURL, err := url.Parse(software)
92+
if err != nil {
93+
id = software
94+
} else {
95+
id = path.Base(softwareURL.Path)
96+
}
97+
98+
s, err := c.apiClient.GetSoftware(id)
99+
if err != nil {
100+
return err
101+
}
102+
103+
s.URL = strings.TrimSuffix(s.URL, ".git")
104+
105+
repoURL, err := url.Parse(s.URL)
106+
if err != nil {
107+
return err
108+
}
109+
110+
log.Infof("Processing repository: %s", softwareURL.String())
89111

90-
var err error
91112
switch {
92-
case vcsurl.IsGitHub(&repoURL):
93-
err = c.gitHubScanner.ScanRepo(repoURL, publisher, c.repositories)
94-
case vcsurl.IsBitBucket(&repoURL):
95-
err = c.bitBucketScanner.ScanRepo(repoURL, publisher, c.repositories)
96-
case vcsurl.IsGitLab(&repoURL):
97-
err = c.gitLabScanner.ScanRepo(repoURL, publisher, c.repositories)
113+
case vcsurl.IsGitHub(repoURL):
114+
err = c.gitHubScanner.ScanRepo(*repoURL, publisher, c.repositories)
115+
case vcsurl.IsBitBucket(repoURL):
116+
err = c.bitBucketScanner.ScanRepo(*repoURL, publisher, c.repositories)
117+
case vcsurl.IsGitLab(repoURL):
118+
err = c.gitLabScanner.ScanRepo(*repoURL, publisher, c.repositories)
98119
default:
99120
err = fmt.Errorf(
100121
"publisher %s: unsupported code hosting platform for %s",

0 commit comments

Comments
 (0)