-
Notifications
You must be signed in to change notification settings - Fork 275
refactor(vulnfeeds): make nvd conversion run in parallel #4662
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jess-lowe
wants to merge
27
commits into
google:master
Choose a base branch
from
jess-lowe:refactor/nvd-parallel
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+418
−360
Open
Changes from all commits
Commits
Show all changes
27 commits
Select commit
Hold shift + click to select a range
f495f90
Move alpine/debian converters into converters dir
jess-lowe db52aad
Make mirrors dir
jess-lowe e200103
Move nvd conversion
jess-lowe 2ddd920
move and rename cve5 converters
jess-lowe f71c2ed
Fix routing
jess-lowe f7130cd
fix test path
jess-lowe f99caec
Refactor duplicate use of CPE
jess-lowe e0b08d9
Move CVE5 and NVD CVE models into models dir, and renamed CVE -> NVDC…
jess-lowe 366145e
move ConversionOutcomes into models for NVD and CVE to share.
jess-lowe 8e9cc6d
Move functions that could be shared between converters
jess-lowe 71c74ef
Merge branch 'master' into refactor/move-converters
jess-lowe 3d193d7
Fix dockerfile routing
jess-lowe 53c1f7e
Merge branch 'refactor/move-converters' into refactor/consolidate-models
jess-lowe de599e9
Merge branch 'master' into refactor/consolidate-models
jess-lowe 7dfa5f0
fix lint
jess-lowe ec8ef66
Fix importing
jess-lowe a7e8493
fix lint (again)
jess-lowe 4be0335
Merge branch 'master' into refactor/consolidate-models
jess-lowe f07e60b
Merge branch 'master' into refactor/consolidate-models
jess-lowe ab2f983
Use ConversionMetrics instead of notes throughout conversion
jess-lowe 058c133
fix errors
jess-lowe 4bc036b
Parallelize NVD through making RepoCache mutexable
jess-lowe 8fa92cf
fixa the issues
jess-lowe 7e44934
Write the metrics files out properly
jess-lowe 746a3be
fix formatting
jess-lowe d42fea6
Merge branch 'master' into refactor/nvd-parallel
jess-lowe dc89999
update some logger calls to use metrics.AddNotes
jess-lowe File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,16 +2,14 @@ | |
| package main | ||
|
|
||
| import ( | ||
| "encoding/csv" | ||
| "encoding/json" | ||
| "errors" | ||
| "flag" | ||
| "fmt" | ||
| "log/slog" | ||
| "os" | ||
| "path/filepath" | ||
| "slices" | ||
| "strings" | ||
| "sync" | ||
|
|
||
| "github.com/google/osv/vulnfeeds/conversion/nvd" | ||
| "github.com/google/osv/vulnfeeds/cves" | ||
|
|
@@ -20,60 +18,28 @@ import ( | |
| "github.com/google/osv/vulnfeeds/utility/logger" | ||
| ) | ||
|
|
||
| var ErrNoRanges = errors.New("no ranges") | ||
|
|
||
| var ErrUnresolvedFix = errors.New("fixes not resolved to commits") | ||
|
|
||
| var ( | ||
| jsonPath = flag.String("nvd-json", "", "Path to NVD CVE JSON to examine.") | ||
| parsedCPEDictionary = flag.String("cpe-repos", "", "Path to JSON mapping of CPEs to repos generated by cpe-repo-gen") | ||
| outDir = flag.String("out-dir", "", "Path to output results.") | ||
| outFormat = flag.String("out-format", "OSV", "Format to output {OSV,PackageInfo}") | ||
| workers = flag.Int("workers", 30, "The number of concurrent workers to use for processing CVEs.") | ||
| ) | ||
| var RepoTagsCache git.RepoTagsCache | ||
| var Metrics struct { | ||
| TotalCVEs int | ||
| CVEsForApplications int | ||
| CVEsForKnownRepos int | ||
| OSVRecordsGenerated int | ||
| Outcomes map[models.CVEID]models.ConversionOutcome // Per-CVE-ID record of conversion result. | ||
| } | ||
|
|
||
| func loadCPEDictionary(productToRepo *cves.VendorProductToRepoMap, f string) error { | ||
| data, err := os.ReadFile(f) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| var RepoTagsCache = &git.RepoTagsCache{} | ||
| var VPRepoCache = &cves.VPRepoCache{} | ||
|
|
||
| return json.Unmarshal(data, &productToRepo) | ||
| } | ||
|
|
||
| // Output a CSV summarizing per-CVE how it was handled. | ||
| func outputOutcomes(outcomes map[models.CVEID]models.ConversionOutcome, reposForCVE map[models.CVEID][]string, directory string) error { | ||
| outcomesFile, err := os.Create(filepath.Join(directory, "outcomes.csv")) | ||
| func loadCPEDictionary(productToRepo *cves.VPRepoCache, f string) error { | ||
| data, err := os.ReadFile(f) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| defer outcomesFile.Close() | ||
| w := csv.NewWriter(outcomesFile) | ||
| if err := w.Write([]string{"CVE", "outcome", "repos"}); err != nil { | ||
| return err | ||
| } | ||
| for CVE, outcome := range outcomes { | ||
| // It's conceivable to have more than one repo for a CVE, so concatenate them. | ||
| r := "" | ||
| if repos, ok := reposForCVE[CVE]; ok { | ||
| r = strings.Join(repos, " ") | ||
| } | ||
| if err := w.Write([]string{string(CVE), outcome.String(), r}); err != nil { | ||
| return err | ||
| } | ||
| } | ||
| w.Flush() | ||
|
|
||
| if err = w.Error(); err != nil { | ||
| var tempMap cves.VendorProductToRepoMap | ||
| if err := json.Unmarshal(data, &tempMap); err != nil { | ||
| return err | ||
| } | ||
| productToRepo.M = tempMap | ||
|
|
||
| return nil | ||
| } | ||
|
|
@@ -85,8 +51,6 @@ func main() { | |
| os.Exit(1) | ||
| } | ||
|
|
||
| Metrics.Outcomes = make(map[models.CVEID]models.ConversionOutcome) | ||
|
|
||
| logger.InitGlobalLogger() | ||
|
|
||
| data, err := os.ReadFile(*jsonPath) | ||
|
|
@@ -100,165 +64,73 @@ func main() { | |
| logger.Fatal("Failed to parse NVD CVE JSON", slog.Any("err", err)) | ||
| } | ||
|
|
||
| VPRepoCache := make(cves.VendorProductToRepoMap) | ||
|
|
||
| ReposForCVE := make(map[models.CVEID][]string) | ||
|
|
||
| if *parsedCPEDictionary != "" { | ||
| err = loadCPEDictionary(&VPRepoCache, *parsedCPEDictionary) | ||
| err = loadCPEDictionary(VPRepoCache, *parsedCPEDictionary) | ||
| if err != nil { | ||
| logger.Fatal("Failed to load parsed CPE dictionary", slog.Any("err", err)) | ||
| } | ||
| logger.Info("VendorProductToRepoMap cache has entries preloaded", slog.Int("count", len(VPRepoCache))) | ||
| logger.Info("VendorProductToRepoMap cache has entries preloaded", slog.Int("count", len(VPRepoCache.M))) | ||
| } | ||
|
|
||
| for _, cve := range parsed.Vulnerabilities { | ||
| refs := cve.CVE.References | ||
| CPEs := cves.CPEs(cve.CVE) | ||
| CVEID := cve.CVE.ID | ||
|
|
||
| if len(refs) == 0 && len(CPEs) == 0 { | ||
| logger.Info("Skipping due to lack of CPEs and lack of references", slog.String("cve", string(CVEID))) | ||
| // 100% of these in 2022 were rejected CVEs | ||
|
|
||
| continue | ||
| } | ||
|
|
||
| // Edge case: No CPEs, but perhaps usable references. | ||
| if len(refs) > 0 && len(CPEs) == 0 { | ||
| repos := cves.ReposFromReferences(string(CVEID), nil, nil, refs, cves.RefTagDenyList) | ||
| if len(repos) == 0 { | ||
| logger.Warn("Failed to derive any repos and there were no CPEs", slog.String("cve", string(CVEID))) | ||
| continue | ||
| } | ||
| logger.Info("Derived repos for CVE with no CPEs", slog.String("cve", string(CVEID)), slog.Any("repos", repos)) | ||
| ReposForCVE[CVEID] = repos | ||
| } | ||
|
|
||
| // Does it have any application CPEs? Look for pre-computed repos based on VendorProduct. | ||
| appCPECount := 0 | ||
| for _, CPEstr := range cves.CPEs(cve.CVE) { | ||
| CPE, err := cves.ParseCPE(CPEstr) | ||
| if err != nil { | ||
| logger.Warn("Failed to parse CPE", slog.String("cve", string(CVEID)), slog.String("cpe", CPEstr), slog.Any("err", err)) | ||
| jobs := make(chan models.NVDCVE) | ||
| var wg sync.WaitGroup | ||
|
|
||
| continue | ||
| } | ||
| if CPE.Part == "a" { | ||
| appCPECount += 1 | ||
| } | ||
| vendorProductKey := cves.VendorProduct{Vendor: CPE.Vendor, Product: CPE.Product} | ||
| if _, ok := VPRepoCache[vendorProductKey]; ok { | ||
| logger.Info("Pre-references, derived repos using cache", slog.String("cve", string(CVEID)), slog.Any("repos", VPRepoCache[vendorProductKey]), slog.String("vendor", CPE.Vendor), slog.String("product", CPE.Product)) | ||
| if _, ok := ReposForCVE[CVEID]; !ok { | ||
| ReposForCVE[CVEID] = VPRepoCache[vendorProductKey] | ||
| continue | ||
| } | ||
| // Don't append duplicates. | ||
| for _, repo := range VPRepoCache[vendorProductKey] { | ||
| if !slices.Contains(ReposForCVE[CVEID], repo) { | ||
| ReposForCVE[CVEID] = append(ReposForCVE[CVEID], repo) | ||
| } | ||
| } | ||
| } | ||
| } | ||
| for range *workers { | ||
| wg.Add(1) | ||
| go worker(&wg, jobs, *outDir) | ||
| } | ||
|
|
||
| if len(CPEs) > 0 && appCPECount == 0 { | ||
| // This CVE is not for software (based on there being CPEs but not any application ones), skip. | ||
| continue | ||
| } | ||
| for _, cve := range parsed.Vulnerabilities { | ||
| jobs <- cve.CVE | ||
| } | ||
|
|
||
| if appCPECount > 0 { | ||
| Metrics.CVEsForApplications++ | ||
| } | ||
| close(jobs) | ||
| wg.Wait() | ||
| logger.Info("NVD Conversion run complete") | ||
| } | ||
|
|
||
| // If there wasn't a repo from the CPE Dictionary, try and derive one from the CVE references. | ||
| if _, ok := ReposForCVE[CVEID]; !ok && len(refs) > 0 { | ||
| for _, CPEstr := range cves.CPEs(cve.CVE) { | ||
| CPE, err := cves.ParseCPE(CPEstr) | ||
| if err != nil { | ||
| logger.Warn("Failed to parse CPE", slog.String("cve", string(CVEID)), slog.String("cpe", CPEstr), slog.Any("err", err)) | ||
| continue | ||
| } | ||
| // Continue to only focus on application CPEs. | ||
| if CPE.Part != "a" { | ||
| continue | ||
| } | ||
| if slices.Contains(cves.VendorProductDenyList, cves.VendorProduct{Vendor: CPE.Vendor, Product: ""}) { | ||
| continue | ||
| } | ||
| if slices.Contains(cves.VendorProductDenyList, cves.VendorProduct{Vendor: CPE.Vendor, Product: CPE.Product}) { | ||
| continue | ||
| } | ||
| repos := cves.ReposFromReferences(string(CVEID), VPRepoCache, &cves.VendorProduct{Vendor: CPE.Vendor, Product: CPE.Product}, refs, cves.RefTagDenyList) | ||
| if len(repos) == 0 { | ||
| logger.Warn("Failed to derive any repos", slog.String("cve", string(CVEID)), slog.String("vendor", CPE.Vendor), slog.String("product", CPE.Product)) | ||
| continue | ||
| } | ||
| logger.Info("Derived repos", slog.String("cve", string(CVEID)), slog.Any("repos", repos), slog.String("vendor", CPE.Vendor), slog.String("product", CPE.Product)) | ||
| ReposForCVE[CVEID] = repos | ||
| } | ||
| func processCVE(cve models.NVDCVE) error { | ||
| metrics := &models.ConversionMetrics{ | ||
| CVEID: cve.ID, | ||
| CNA: "nvd", | ||
| } | ||
| repos := nvd.FindRepos(cve, VPRepoCache, metrics) | ||
| metrics.Repos = repos | ||
|
|
||
| var err error | ||
| switch *outFormat { | ||
| case "OSV": | ||
| err = nvd.CVEToOSV(cve, repos, RepoTagsCache, *outDir, metrics) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: RepoTagsCache and VPRepoCache should be passed in via arguments |
||
| case "PackageInfo": | ||
| err = nvd.CVEToPackageInfo(cve, repos, RepoTagsCache, *outDir, metrics) | ||
| } | ||
| // Parse this error to determine which failure mode it was | ||
| if err != nil { | ||
| logger.Warn("Failed to generate an OSV record", slog.String("cve", string(cve.ID)), slog.Any("err", err)) | ||
| if errors.Is(err, nvd.ErrNoRanges) { | ||
| metrics.Outcome = models.NoRanges | ||
| return err | ||
| } | ||
|
|
||
| logger.Info("Finished processing "+string(CVEID), | ||
| slog.String("cve", string(CVEID)), | ||
| slog.Int("cpes", len(CPEs)), | ||
| slog.Int("app_cpes", appCPECount), | ||
| slog.Int("derived_repos", len(ReposForCVE[CVEID]))) | ||
|
|
||
| // If we've made it to here, we may have a CVE: | ||
| // * that has Application-related CPEs (so applies to software) | ||
| // * has a reference that is a known repository URL | ||
| // OR | ||
| // * a derived repository for the software package | ||
| // | ||
| // We do not yet have: | ||
| // * any knowledge of the language used | ||
| // * definitive version information | ||
|
|
||
| if _, ok := ReposForCVE[CVEID]; !ok { | ||
| // We have nothing useful to work with, so we'll assume it's out of scope | ||
| logger.Info("Passing due to lack of viable repository", slog.String("cve", string(CVEID))) | ||
| Metrics.Outcomes[CVEID] = models.NoRepos | ||
|
|
||
| continue | ||
| if errors.Is(err, nvd.ErrUnresolvedFix) { | ||
| metrics.Outcome = models.FixUnresolvable | ||
| return err | ||
| } | ||
| metrics.Outcome = models.ConversionUnknown | ||
|
|
||
| logger.Info("Found Repos for CVE "+string(CVEID), slog.String("cve", string(CVEID)), slog.Any("repos", ReposForCVE[CVEID])) | ||
|
|
||
| Metrics.CVEsForKnownRepos++ | ||
| return err | ||
| } | ||
| metrics.Outcome = models.Successful | ||
|
|
||
| switch *outFormat { | ||
| case "OSV": | ||
| err = nvd.CVEToOSV(cve.CVE, ReposForCVE[CVEID], RepoTagsCache, *outDir) | ||
| case "PackageInfo": | ||
| err = nvd.CVEToPackageInfo(cve.CVE, ReposForCVE[CVEID], RepoTagsCache, *outDir) | ||
| } | ||
| // Parse this error to determine which failure mode it was | ||
| if err != nil { | ||
| logger.Warn("Failed to generate an OSV record", slog.String("cve", string(CVEID)), slog.Any("err", err)) | ||
| if errors.Is(err, ErrNoRanges) { | ||
| Metrics.Outcomes[CVEID] = models.NoRanges | ||
| continue | ||
| } | ||
| if errors.Is(err, ErrUnresolvedFix) { | ||
| Metrics.Outcomes[CVEID] = models.FixUnresolvable | ||
| continue | ||
| } | ||
| Metrics.Outcomes[CVEID] = models.ConversionUnknown | ||
| return nil | ||
| } | ||
|
|
||
| continue | ||
| func worker(wg *sync.WaitGroup, jobs <-chan models.NVDCVE, _ string) { | ||
| defer wg.Done() | ||
| for cve := range jobs { | ||
| if err := processCVE(cve); err != nil { | ||
| logger.Warn("Failed to generate an OSV record", slog.String("cve", string(cve.ID)), slog.Any("err", err)) | ||
| } else { | ||
| logger.Info("Generated OSV record for "+string(cve.ID), slog.String("cve", string(cve.ID))) | ||
| } | ||
| Metrics.OSVRecordsGenerated++ | ||
| Metrics.Outcomes[CVEID] = models.Successful | ||
| } | ||
| Metrics.TotalCVEs = len(parsed.Vulnerabilities) | ||
| err = outputOutcomes(Metrics.Outcomes, ReposForCVE, *outDir) | ||
| if err != nil { | ||
| // Log entry with size 1.15M exceeds maximum size of 256.0K | ||
| fmt.Fprintf(os.Stderr, "Failed to write out metrics: %v", err) | ||
| } | ||
| // Outcomes is too big to log, so zero it out. | ||
| Metrics.Outcomes = nil | ||
| logger.Info("Metrics", slog.String("path", filepath.Base(*jsonPath)), slog.Any("metrics", Metrics)) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think these should be defined in the main function and passed through arguments.