Skip to content

Commit 161ad14

Browse files
feat: implement dataset metadata retrieval and add it to the resulting AIBOMs + add dummy dataset fetchers
1 parent 25215c0 commit 161ad14

22 files changed

+1216
-239
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
# Ignore Go build binary
22
aibomgen-cra
3+
AIBoMGen-cra
4+
35
aibomgen-cli
46
AIBoMGen-cli
57

8+
aibomgen
9+
AIBoMGen
10+
11+
612
# Binaries for programs and plugins
713
*.exe
814
*.exe~

cmd/completeness.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,6 @@ func init() {
7070
completenessCmd.Flags().StringVarP(&inFormat, "format", "f", "", "Input BOM format: json|xml|auto")
7171
completenessCmd.Flags().StringVar(&completenessLogLevel, "log-level", "", "Log level: quiet|standard|debug")
7272

73-
_ = completenessCmd.MarkFlagRequired("input")
74-
7573
// Bind all flags to viper for config file support
7674
viper.BindPFlag("completeness.input", completenessCmd.Flags().Lookup("input"))
7775
viper.BindPFlag("completeness.format", completenessCmd.Flags().Lookup("format"))

cmd/enrich.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,6 @@ func init() {
170170
enrichCmd.Flags().StringVar(&enrichHFBaseURL, "hf-base-url", "", "Hugging Face base URL (for refetch)")
171171
enrichCmd.Flags().IntVar(&enrichHFTimeout, "hf-timeout", 0, "Hugging Face API timeout in seconds (for refetch)")
172172

173-
_ = enrichCmd.MarkFlagRequired("input")
174-
175173
// Bind all flags to viper for config file support
176174
viper.BindPFlag("enrich.input", enrichCmd.Flags().Lookup("input"))
177175
viper.BindPFlag("enrich.output", enrichCmd.Flags().Lookup("output"))

cmd/generate.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ var generateCmd = &cobra.Command{
133133
if err != nil {
134134
return err
135135
}
136+
136137
// Online mode: per discovery: store + fetch + map + build (inside generator).
137138
discoveredBOMs, err = generator.BuildPerDiscovery(discoveries, hfToken, timeout)
138139
if err != nil {
@@ -225,6 +226,7 @@ func init() {
225226
generateCmd.Flags().IntVar(&hfTimeoutSec, "hf-timeout", 0, "HTTP timeout in seconds for Hugging Face API")
226227
generateCmd.Flags().StringVar(&hfToken, "hf-token", "", "Hugging Face access token")
227228
generateCmd.Flags().BoolVar(&enrich, "enrich", false, "Prompt for missing fields and compute completeness (deprecated)")
229+
228230
generateCmd.Flags().StringVar(&generateLogLevel, "log-level", "", "Log level: quiet|standard|debug")
229231

230232
// Bind all flags to viper for config file support

cmd/root.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,30 @@ func initConfig() {
6868
home, err := os.UserHomeDir()
6969
cobra.CheckErr(err)
7070

71-
// Search for config in multiple locations (in order of priority):
72-
// 1. $HOME/.aibomgen-cli.yaml
73-
// 2. ./config/defaults.yaml (project local)
71+
viper.SetConfigType("yaml")
7472
viper.AddConfigPath(home)
7573
viper.AddConfigPath("./config")
76-
viper.SetConfigType("yaml")
77-
viper.SetConfigName(".aibomgen-cli") // for $HOME/.aibomgen-cli.yaml
78-
viper.SetConfigName("defaults") // for ./config/defaults.yaml
74+
75+
// Try .aibomgen-cli first
76+
viper.SetConfigName(".aibomgen-cli")
77+
err = viper.ReadInConfig()
78+
79+
// If not found, try defaults.yaml
80+
notFound := &viper.ConfigFileNotFoundError{}
81+
if err != nil && errors.As(err, notFound) {
82+
viper.SetConfigName("defaults")
83+
err = viper.ReadInConfig()
84+
}
85+
86+
if err != nil && !errors.As(err, notFound) {
87+
cobra.CheckErr(err)
88+
}
89+
90+
if err == nil {
91+
fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed())
92+
}
93+
94+
return
7995
}
8096

8197
// Enable environment variable support (e.g., AIBOMGEN_HUGGINGFACE_TOKEN)

cmd/validate.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ func init() {
9292
validateCmd.Flags().BoolVar(&validateCheckModelCard, "check-model-card", false, "Validate model card fields")
9393
validateCmd.Flags().StringVar(&validateLogLevel, "log-level", "", "Log level: quiet|standard|debug")
9494

95-
validateCmd.MarkFlagRequired("input")
96-
9795
// Bind all flags to viper for config file support
9896
viper.BindPFlag("validate.input", validateCmd.Flags().Lookup("input"))
9997
viper.BindPFlag("validate.format", validateCmd.Flags().Lookup("format"))

config/defaults.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ generate:
3131
# ============================================================================
3232
enrich:
3333
# Path to existing AIBOM (required)
34-
input: "./dist/distilbert_distilgpt2_aibom.json"
34+
input: "./dist/WiebeVandendriessche_model-card-example_aibom.json"
3535
# Output file path (default: overwrite input)
36-
output: "./dist/distilbert_distilgpt2_aibom.json"
36+
output: "./dist/WiebeVandendriessche_model-card-example_aibom.json"
3737
# Input BOM format: json|xml|auto
3838
format: "auto"
3939
# Output BOM format: json|xml|auto
@@ -66,7 +66,7 @@ enrich:
6666
# ============================================================================
6767
validate:
6868
# Path to AIBOM file (required)
69-
input: "./dist/distilbert_distilgpt2_aibom.json"
69+
input: "./dist/WiebeVandendriessche_model-card-example_aibom.json"
7070
# Input format: json|xml|auto
7171
format: "auto"
7272
# Strict mode: fail on missing required fields
@@ -83,7 +83,7 @@ validate:
8383
# ============================================================================
8484
completeness:
8585
# Path to existing AIBOM file (required)
86-
input: "./dist/distilbert_distilgpt2_aibom.json"
86+
input: "./dist/WiebeVandendriessche_model-card-example_aibom.json"
8787
# Input BOM format: json|xml|auto
8888
format: "auto"
8989
# Log level: quiet|standard|debug

internal/builder/bom_builder.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,35 @@ func (b BOMBuilder) Build(ctx BuildContext) (*cdx.BOM, error) {
4949
return bom, nil
5050
}
5151

52+
// BuildDataset builds a dataset component into BOM.components
53+
func (b BOMBuilder) BuildDataset(ctx DatasetBuildContext) (*cdx.Component, error) {
54+
logf(ctx.DatasetID, "build dataset start")
55+
56+
comp := buildDatasetComponent(ctx)
57+
58+
// Apply dataset registry
59+
src := metadata.DatasetSource{
60+
DatasetID: strings.TrimSpace(ctx.DatasetID),
61+
Scan: ctx.Scan,
62+
HF: ctx.HF,
63+
Readme: ctx.Readme,
64+
}
65+
tgt := metadata.DatasetTarget{
66+
Component: comp,
67+
IncludeEvidenceProperties: b.Opts.IncludeEvidenceProperties,
68+
HuggingFaceBaseURL: b.Opts.HuggingFaceBaseURL,
69+
}
70+
71+
for _, spec := range metadata.DatasetRegistry() {
72+
if spec.Apply != nil {
73+
spec.Apply(src, tgt)
74+
}
75+
}
76+
77+
logf(ctx.DatasetID, "build dataset ok")
78+
return comp, nil
79+
}
80+
5281
func buildMetadataComponent(ctx BuildContext) *cdx.Component {
5382
// Minimal skeleton; registry fills the rest
5483
name := strings.TrimSpace(ctx.ModelID)
@@ -65,3 +94,19 @@ func buildMetadataComponent(ctx BuildContext) *cdx.Component {
6594
ModelCard: &cdx.MLModelCard{},
6695
}
6796
}
97+
98+
// buildDatasetComponent creates skeleton for DATASET component (DATA type)
99+
func buildDatasetComponent(ctx DatasetBuildContext) *cdx.Component {
100+
name := strings.TrimSpace(ctx.DatasetID)
101+
if name == "" && strings.TrimSpace(ctx.Scan.Name) != "" {
102+
name = strings.TrimSpace(ctx.Scan.Name)
103+
}
104+
if name == "" {
105+
name = "dataset"
106+
}
107+
108+
return &cdx.Component{
109+
Type: cdx.ComponentTypeData,
110+
Name: name,
111+
}
112+
}

internal/builder/context.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ type BuildContext struct {
1212
Readme *fetcher.ModelReadmeCard
1313
}
1414

15+
// DatasetBuildContext for dataset component building
16+
type DatasetBuildContext struct {
17+
DatasetID string
18+
Scan scanner.Discovery
19+
HF *fetcher.DatasetAPIResponse
20+
Readme *fetcher.DatasetReadmeCard
21+
}
22+
1523
type Options struct {
1624
IncludeEvidenceProperties bool
1725
HuggingFaceBaseURL string
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package fetcher
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"net/http"
8+
"strings"
9+
)
10+
11+
// DatasetAPIResponse is the decoded response from GET https://huggingface.co/api/datasets/:id
12+
type DatasetAPIResponse struct {
13+
ID string `json:"id"`
14+
Author string `json:"author"`
15+
SHA string `json:"sha"`
16+
LastMod string `json:"lastModified"`
17+
CreatedAt string `json:"createdAt"`
18+
Private bool `json:"private"`
19+
Gated BoolOrString `json:"gated"`
20+
Disabled bool `json:"disabled"`
21+
Tags []string `json:"tags"`
22+
Description string `json:"description"`
23+
Downloads int `json:"downloads"`
24+
Likes int `json:"likes"`
25+
UsedStorage int64 `json:"usedStorage"`
26+
CardData map[string]any `json:"cardData"`
27+
}
28+
29+
// DatasetAPIFetcher retrieves dataset metadata from the Hugging Face Hub API.
// All fields are optional.
type DatasetAPIFetcher struct {
	// Client performs the HTTP request; when nil, Fetch uses
	// http.DefaultClient.
	Client *http.Client

	// Token, when non-blank, is sent as an "Authorization: Bearer" header.
	Token string

	// BaseURL overrides the API host; when blank, Fetch uses
	// "https://huggingface.co".
	BaseURL string
}
35+
36+
// Fetch fetches dataset metadata for the given datasetID.
37+
func (f *DatasetAPIFetcher) Fetch(ctx context.Context, datasetID string) (*DatasetAPIResponse, error) {
38+
client := f.Client
39+
if client == nil {
40+
client = http.DefaultClient
41+
}
42+
43+
trimmedDatasetID := strings.TrimPrefix(strings.TrimSpace(datasetID), "/")
44+
logf(datasetID, "GET /api/datasets/%s", trimmedDatasetID)
45+
46+
baseURL := strings.TrimRight(strings.TrimSpace(f.BaseURL), "/")
47+
if baseURL == "" {
48+
baseURL = "https://huggingface.co"
49+
}
50+
51+
url := fmt.Sprintf("%s/api/datasets/%s", baseURL, trimmedDatasetID)
52+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
53+
if err != nil {
54+
return nil, err
55+
}
56+
req.Header.Set("Accept", "application/json")
57+
if strings.TrimSpace(f.Token) != "" {
58+
req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(f.Token))
59+
}
60+
61+
resp, err := client.Do(req)
62+
if err != nil {
63+
logf(datasetID, "request error (%v)", err)
64+
return nil, err
65+
}
66+
defer resp.Body.Close()
67+
68+
if resp.StatusCode != http.StatusOK {
69+
logf(datasetID, "non-200 status=%d", resp.StatusCode)
70+
return nil, fmt.Errorf("huggingface api status %d", resp.StatusCode)
71+
}
72+
73+
var parsed DatasetAPIResponse
74+
if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil {
75+
logf(datasetID, "decode error (%v)", err)
76+
return nil, err
77+
}
78+
logf(datasetID, "ok")
79+
return &parsed, nil
80+
}

0 commit comments

Comments
 (0)