Skip to content

Commit cbeb185

Browse files
refactor: better handling of authors, manufacturers, groups etc. + workaround for tag removal below spec v1.6
1 parent a23f606 commit cbeb185

File tree

5 files changed

+175
-23
lines changed

5 files changed

+175
-23
lines changed

internal/io/io.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,58 @@ func WriteBOM(bom *cdx.BOM, outputPath string, format string, spec string) error
106106
if !ok {
107107
return fmt.Errorf("unsupported CycloneDX spec version: %q", spec)
108108
}
109+
110+
// WORKAROUND: Manually strip tags for spec < 1.6
111+
// Tags were introduced in spec 1.6, but cyclonedx-go doesn't remove them
112+
// when encoding to earlier versions (unlike manufacturer, authors, etc.)
113+
// See: https://github.com/CycloneDX/cyclonedx-go/issues/248
114+
// TODO: Remove this workaround once issue #248 is fixed
115+
if sv < cdx.SpecVersion1_6 {
116+
stripTagsFromBOM(bom)
117+
}
118+
109119
return encoder.EncodeVersion(bom, sv)
110120
}
111121

122+
// stripTagsFromBOM removes tags from all components in the BOM.
123+
// WORKAROUND for cyclonedx-go issue #248: Tags are not automatically removed
124+
// when encoding to spec versions < 1.6, even though tags were introduced in 1.6.
125+
// This function manually strips tags to ensure spec compliance.
126+
// TODO: Remove this workaround once https://github.com/CycloneDX/cyclonedx-go/issues/248 is fixed
127+
func stripTagsFromBOM(bom *cdx.BOM) {
128+
if bom == nil {
129+
return
130+
}
131+
132+
// Strip tags from metadata component
133+
if bom.Metadata != nil && bom.Metadata.Component != nil {
134+
stripTagsFromComponent(bom.Metadata.Component)
135+
}
136+
137+
// Strip tags from all components
138+
if bom.Components != nil {
139+
for i := range *bom.Components {
140+
stripTagsFromComponent(&(*bom.Components)[i])
141+
}
142+
}
143+
}
144+
145+
// stripTagsFromComponent recursively removes tags from a component and its children.
146+
func stripTagsFromComponent(comp *cdx.Component) {
147+
if comp == nil {
148+
return
149+
}
150+
151+
comp.Tags = nil
152+
153+
// Recursively process child components
154+
if comp.Components != nil {
155+
for i := range *comp.Components {
156+
stripTagsFromComponent(&(*comp.Components)[i])
157+
}
158+
}
159+
}
160+
112161
// ParseSpecVersion parses a spec version string to a CycloneDX SpecVersion.
113162
func ParseSpecVersion(s string) (cdx.SpecVersion, bool) {
114163
s = strings.TrimSpace(s)

internal/metadata/core.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,16 @@ const (
5959
DatasetLicenses DatasetKey = "BOM.components[DATA].licenses"
6060
DatasetDescription DatasetKey = "BOM.components[DATA].data.description"
6161
DatasetManufacturer DatasetKey = "BOM.components[DATA].manufacturer"
62-
DatasetAuthor DatasetKey = "BOM.components[DATA].author"
62+
DatasetAuthors DatasetKey = "BOM.components[DATA].authors"
6363
DatasetGroup DatasetKey = "BOM.components[DATA].group"
6464
DatasetContents DatasetKey = "BOM.components[DATA].data.contents.attachments"
6565
DatasetSensitiveData DatasetKey = "BOM.components[DATA].data.sensitiveData"
6666
DatasetClassification DatasetKey = "BOM.components[DATA].data.classification"
6767
DatasetGovernance DatasetKey = "BOM.components[DATA].data.governance"
6868
DatasetHashes DatasetKey = "BOM.components[DATA].hashes"
69-
DatasetContact DatasetKey = "BOM.components[DATA].properties.contact"
70-
DatasetCreatedAt DatasetKey = "BOM.components[DATA].properties.createdAt"
71-
DatasetUsedStorage DatasetKey = "BOM.components[DATA].properties.usedStorage"
69+
DatasetContact DatasetKey = "BOM.components[DATA].properties.huggingface:datasetContact"
70+
DatasetCreatedAt DatasetKey = "BOM.components[DATA].properties.huggingface:createdAt"
71+
DatasetUsedStorage DatasetKey = "BOM.components[DATA].properties.huggingface:usedStorage"
7272
DatasetLastModified DatasetKey = "BOM.components[DATA].tags.lastModified"
7373
)
7474

internal/metadata/fields_component.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,25 @@ func componentFields() []FieldSpec {
372372
Weight: 0.25,
373373
Required: false,
374374
Sources: []func(Source) (any, bool){
375+
func(src Source) (any, bool) {
376+
// Extract group from ModelID (part before /)
377+
var modelID string
378+
if src.HF != nil && strings.TrimSpace(src.HF.ModelID) != "" {
379+
modelID = strings.TrimSpace(src.HF.ModelID)
380+
} else if src.HF != nil && strings.TrimSpace(src.HF.ID) != "" {
381+
modelID = strings.TrimSpace(src.HF.ID)
382+
} else {
383+
modelID = strings.TrimSpace(src.ModelID)
384+
}
385+
if modelID == "" {
386+
return nil, false
387+
}
388+
parts := strings.SplitN(modelID, "/", 2)
389+
if len(parts) > 0 && strings.TrimSpace(parts[0]) != "" {
390+
return strings.TrimSpace(parts[0]), true
391+
}
392+
return nil, false
393+
},
375394
func(src Source) (any, bool) {
376395
if src.HF != nil {
377396
if s := strings.TrimSpace(src.HF.Author); s != "" {

internal/metadata/fields_dataset.go

Lines changed: 98 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -297,13 +297,17 @@ func DatasetRegistry() []DatasetFieldSpec {
297297
Required: false,
298298
Sources: []func(DatasetSource) (any, bool){
299299
func(src DatasetSource) (any, bool) {
300-
if src.Readme == nil {
301-
return nil, false
300+
// First try API author (authors[0])
301+
if src.HF != nil && strings.TrimSpace(src.HF.Author) != "" {
302+
return strings.TrimSpace(src.HF.Author), true
302303
}
303-
if len(src.Readme.AnnotationCreators) == 0 {
304-
return nil, false
304+
// Fallback to first AnnotationCreator from README (authors[1])
305+
if src.Readme != nil && len(src.Readme.AnnotationCreators) > 0 {
306+
if trimmed := strings.TrimSpace(src.Readme.AnnotationCreators[0]); trimmed != "" {
307+
return trimmed, true
308+
}
305309
}
306-
return strings.TrimSpace(src.Readme.AnnotationCreators[0]), true
310+
return nil, false
307311
},
308312
},
309313
Parse: func(value string) (any, error) {
@@ -332,19 +336,98 @@ func DatasetRegistry() []DatasetFieldSpec {
332336
return comp != nil && comp.Manufacturer != nil && strings.TrimSpace(comp.Manufacturer.Name) != ""
333337
},
334338
},
339+
{
340+
Key: DatasetAuthors,
341+
Weight: 0.6,
342+
Required: false,
343+
Sources: []func(DatasetSource) (any, bool){
344+
func(src DatasetSource) (any, bool) {
345+
var allAuthors []string
346+
347+
// First, add API author if available
348+
if src.HF != nil && strings.TrimSpace(src.HF.Author) != "" {
349+
allAuthors = append(allAuthors, strings.TrimSpace(src.HF.Author))
350+
}
351+
352+
// Then, add annotation creators from README
353+
if src.Readme != nil && len(src.Readme.AnnotationCreators) > 0 {
354+
for _, creator := range src.Readme.AnnotationCreators {
355+
if trimmed := strings.TrimSpace(creator); trimmed != "" {
356+
allAuthors = append(allAuthors, trimmed)
357+
}
358+
}
359+
}
360+
361+
if len(allAuthors) == 0 {
362+
return nil, false
363+
}
364+
return allAuthors, true
365+
},
366+
},
367+
Parse: func(value string) (any, error) {
368+
parts := strings.Split(value, ",")
369+
authors := normalizeStrings(parts)
370+
return authors, nil
371+
},
372+
Apply: func(tgt DatasetTarget, value any) error {
373+
input, ok := value.(applyInput)
374+
if !ok {
375+
return fmt.Errorf("invalid input for %s", DatasetAuthors)
376+
}
377+
if tgt.Component == nil {
378+
return fmt.Errorf("component is nil")
379+
}
380+
var authors []cdx.OrganizationalContact
381+
switch v := input.Value.(type) {
382+
case []string:
383+
for _, authorName := range v {
384+
if trimmed := strings.TrimSpace(authorName); trimmed != "" {
385+
authors = append(authors, cdx.OrganizationalContact{
386+
Name: trimmed,
387+
})
388+
}
389+
}
390+
case string:
391+
if trimmed := strings.TrimSpace(v); trimmed != "" {
392+
authors = append(authors, cdx.OrganizationalContact{
393+
Name: trimmed,
394+
})
395+
}
396+
}
397+
if len(authors) == 0 {
398+
return fmt.Errorf("authors value is empty")
399+
}
400+
if !input.Force && tgt.Component.Authors != nil && len(*tgt.Component.Authors) > 0 {
401+
return nil
402+
}
403+
tgt.Component.Authors = &authors
404+
return nil
405+
},
406+
Present: func(comp *cdx.Component) bool {
407+
return comp != nil && comp.Authors != nil && len(*comp.Authors) > 0
408+
},
409+
},
335410
{
336411
Key: DatasetGroup,
337412
Weight: 0.4,
338413
Required: false,
339414
Sources: []func(DatasetSource) (any, bool){
340415
func(src DatasetSource) (any, bool) {
341-
if src.Readme == nil {
342-
return nil, false
416+
// Extract group from DatasetID (part before /)
417+
var datasetID string
418+
if src.HF != nil && strings.TrimSpace(src.HF.ID) != "" {
419+
datasetID = strings.TrimSpace(src.HF.ID)
420+
} else {
421+
datasetID = strings.TrimSpace(src.DatasetID)
343422
}
344-
if len(src.Readme.AnnotationCreators) < 2 {
423+
if datasetID == "" {
345424
return nil, false
346425
}
347-
return strings.TrimSpace(src.Readme.AnnotationCreators[1]), true
426+
parts := strings.SplitN(datasetID, "/", 2)
427+
if len(parts) > 0 && strings.TrimSpace(parts[0]) != "" {
428+
return strings.TrimSpace(parts[0]), true
429+
}
430+
return nil, false
348431
},
349432
},
350433
Parse: func(value string) (any, error) {
@@ -669,11 +752,11 @@ func DatasetRegistry() []DatasetFieldSpec {
669752
return fmt.Errorf("component is nil")
670753
}
671754
createdAt, _ := input.Value.(string)
672-
setProperty(tgt.Component, "createdAt", strings.TrimSpace(createdAt))
755+
setProperty(tgt.Component, "huggingface:createdAt", strings.TrimSpace(createdAt))
673756
return nil
674757
},
675758
Present: func(comp *cdx.Component) bool {
676-
return hasProperty(comp, "createdAt")
759+
return hasProperty(comp, "huggingface:createdAt")
677760
},
678761
},
679762
{
@@ -700,11 +783,11 @@ func DatasetRegistry() []DatasetFieldSpec {
700783
return fmt.Errorf("component is nil")
701784
}
702785
usedStorage, _ := input.Value.(string)
703-
setProperty(tgt.Component, "usedStorage", strings.TrimSpace(usedStorage))
786+
setProperty(tgt.Component, "huggingface:usedStorage", strings.TrimSpace(usedStorage))
704787
return nil
705788
},
706789
Present: func(comp *cdx.Component) bool {
707-
return hasProperty(comp, "usedStorage")
790+
return hasProperty(comp, "huggingface:usedStorage")
708791
},
709792
},
710793
{
@@ -793,11 +876,11 @@ func DatasetRegistry() []DatasetFieldSpec {
793876
if tgt.Component == nil {
794877
return fmt.Errorf("component is nil")
795878
}
796-
setProperty(tgt.Component, "contact", contact)
879+
setProperty(tgt.Component, "huggingface:datasetContact", contact)
797880
return nil
798881
},
799882
Present: func(comp *cdx.Component) bool {
800-
return hasProperty(comp, "contact")
883+
return hasProperty(comp, "huggingface:datasetContact")
801884
},
802885
},
803886
}

internal/metadata/fieldspecs_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ func TestRegistryApplyAndPresent(t *testing.T) {
3434
Evidence: "pattern",
3535
},
3636
HF: &fetcher.ModelAPIResponse{
37-
ID: "hf-id",
38-
ModelID: "hf-model",
37+
ID: "hf-org/hf-model",
38+
ModelID: "hf-org/hf-model",
3939
Author: "hf-author",
4040
PipelineTag: "classification",
4141
LibraryName: "transformers",
@@ -107,8 +107,9 @@ func TestRegistryApplyAndPresent(t *testing.T) {
107107
if comp.Manufacturer == nil || comp.Manufacturer.Name != "hf-author" {
108108
t.Fatalf("manufacturer mismatch")
109109
}
110-
if comp.Group != "hf-author" {
111-
t.Fatalf("group mismatch")
110+
// Group is now extracted from ModelID (first part before /)
111+
if comp.Group != "hf-org" {
112+
t.Fatalf("group mismatch: expected 'hf-org', got %q", comp.Group)
112113
}
113114
if comp.Properties == nil || !hasProperty(comp, "huggingface:lastModified") {
114115
t.Fatalf("expected huggingface properties")

0 commit comments

Comments
 (0)