|
| 1 | +package safetensors |
| 2 | + |
| 3 | +// https://huggingface.co/docs/safetensors/en/index |
| 4 | + |
| 5 | +import ( |
| 6 | + "fmt" |
| 7 | + "math" |
| 8 | + "sort" |
| 9 | + |
| 10 | + "github.com/wader/fq/format" |
| 11 | + "github.com/wader/fq/internal/mapstruct" |
| 12 | + "github.com/wader/fq/pkg/decode" |
| 13 | + "github.com/wader/fq/pkg/interp" |
| 14 | + "github.com/wader/fq/pkg/scalar" |
| 15 | +) |
| 16 | + |
// jsonFormat is the Out target of the format.JSON dependency declared in
// init; decodeSafeTensors passes it to FieldFormat to decode the header.
var jsonFormat decode.Group
| 18 | + |
// TensorInfo is one tensor entry from the JSON header.
type TensorInfo struct {
	// Dtype is the element type name; it is used as the key into dataDecoders.
	Dtype string `mapstruct:"dtype"`
	// Shape lists the size of each dimension, outermost first.
	Shape []int `mapstruct:"shape"`
	// DataOffsets must hold at least two entries. The first is used as the
	// byte offset of the tensor data, counted from the end of the header.
	DataOffsets []int `mapstruct:"data_offsets"`
}
| 24 | + |
// SafeTensorsHeader is the struct form of the JSON header produced by
// parseHeader.
type SafeTensorsHeader struct {
	// Tensors is keyed by tensor name.
	// NOTE(review): the ",remain" tag presumably collects every header key
	// that is not mapped to another field; confirm against mapstruct.
	Tensors map[string]TensorInfo `mapstruct:",remain"`
	// Metadata is mapped from the "__metadata__" header key.
	Metadata map[string]any `mapstruct:"__metadata__"`
}
| 29 | + |
| 30 | +func init() { |
| 31 | + interp.RegisterFormat( |
| 32 | + format.SAFETENSORS, |
| 33 | + &decode.Format{ |
| 34 | + Description: "SafeTensors", |
| 35 | + DecodeFn: decodeSafeTensors, |
| 36 | + Dependencies: []decode.Dependency{ |
| 37 | + {Groups: []*decode.Group{format.JSON}, Out: &jsonFormat}, |
| 38 | + }, |
| 39 | + }) |
| 40 | +} |
| 41 | + |
| 42 | +func parseHeader(dv *decode.Value) (*SafeTensorsHeader, error) { |
| 43 | + actualVal, ok := dv.V.(*scalar.Any) |
| 44 | + if !ok { |
| 45 | + return nil, fmt.Errorf("expected scalar.Any, got %T", dv.V) |
| 46 | + } |
| 47 | + |
| 48 | + headerMap, ok := actualVal.Actual.(map[string]any) |
| 49 | + if !ok { |
| 50 | + return nil, fmt.Errorf("expected map[string]any, got %T", actualVal.Actual) |
| 51 | + } |
| 52 | + |
| 53 | + var header SafeTensorsHeader |
| 54 | + if err := mapstruct.ToStruct(headerMap, &header); err != nil { |
| 55 | + return nil, fmt.Errorf("failed to parse header: %w", err) |
| 56 | + } |
| 57 | + |
| 58 | + return &header, nil |
| 59 | +} |
| 60 | + |
// bfloat16_bits_to_float converts a raw bfloat16 bit pattern to a float32.
//
// Both formats have 1 sign bit and 8 exponent bits; bfloat16 has 7 fraction
// bits where float32 has 23. A bfloat16 value is therefore the upper 16 bits
// of the equivalent float32, and placing the pattern in the high half with
// zeroed low bits gives the same number.
//
// https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
// https://en.wikipedia.org/wiki/Single-precision_floating-point_format
func bfloat16_bits_to_float(bits uint16) float32 {
	widened := uint32(bits) << 16
	return math.Float32frombits(widened)
}
| 69 | + |
| 70 | +var dataDecoders = map[string]func(d *decode.D){ |
| 71 | + "F64": func(d *decode.D) { d.FieldF64("x") }, |
| 72 | + "F32": func(d *decode.D) { d.FieldF32("x") }, |
| 73 | + "F16": func(d *decode.D) { d.FieldF16("x") }, |
| 74 | + "BF16": func(d *decode.D) { |
| 75 | + d.FieldFltFn("x", func(d *decode.D) float64 { |
| 76 | + return float64(bfloat16_bits_to_float(uint16(d.U16()))) |
| 77 | + }) |
| 78 | + }, |
| 79 | + "I64": func(d *decode.D) { d.FieldS64("x") }, |
| 80 | + "I32": func(d *decode.D) { d.FieldS32("x") }, |
| 81 | + "I16": func(d *decode.D) { d.FieldS16("x") }, |
| 82 | + "I8": func(d *decode.D) { d.FieldS8("x") }, |
| 83 | + "U8": func(d *decode.D) { d.FieldU8("x") }, |
| 84 | + "BOOL": func(d *decode.D) { d.FieldBool("x") }, |
| 85 | +} |
| 86 | + |
| 87 | +func decodeSafeTensors(d *decode.D) any { |
| 88 | + d.Endian = decode.LittleEndian |
| 89 | + |
| 90 | + headerSize := d.FieldU64("header size") |
| 91 | + |
| 92 | + var dv *decode.Value |
| 93 | + |
| 94 | + d.LimitedFn(8*int64(headerSize), func(d *decode.D) { |
| 95 | + dv, _ = d.FieldFormat("header", &jsonFormat, nil) |
| 96 | + }) |
| 97 | + |
| 98 | + d.FieldStruct("tensors", func(d *decode.D) { |
| 99 | + header, err := parseHeader(dv) |
| 100 | + if err != nil { |
| 101 | + d.Fatalf("failed to parse header: %v", err) |
| 102 | + return |
| 103 | + } |
| 104 | + |
| 105 | + // Get tensor names and sort them for deterministic output |
| 106 | + tensorNames := make([]string, 0, len(header.Tensors)) |
| 107 | + for tensorName := range header.Tensors { |
| 108 | + tensorNames = append(tensorNames, tensorName) |
| 109 | + } |
| 110 | + sort.Strings(tensorNames) |
| 111 | + |
| 112 | + for _, tensorName := range tensorNames { |
| 113 | + tensorInfo := header.Tensors[tensorName] |
| 114 | + |
| 115 | + decoder, exists := dataDecoders[tensorInfo.Dtype] |
| 116 | + if !exists { |
| 117 | + d.Fatalf("unsupported dtype: %s", tensorInfo.Dtype) |
| 118 | + continue |
| 119 | + } |
| 120 | + |
| 121 | + if len(tensorInfo.DataOffsets) < 2 { |
| 122 | + d.Fatalf("invalid data_offsets for tensor %s: %v", tensorName, tensorInfo.DataOffsets) |
| 123 | + continue |
| 124 | + } |
| 125 | + |
| 126 | + begin := tensorInfo.DataOffsets[0] |
| 127 | + |
| 128 | + d.FieldStruct(tensorName, func(d *decode.D) { |
| 129 | + d.FieldArray("shape", func(d *decode.D) { |
| 130 | + for _, s := range tensorInfo.Shape { |
| 131 | + d.FieldValueSint("dim", int64(s)) |
| 132 | + } |
| 133 | + }) |
| 134 | + |
| 135 | + if len(tensorInfo.Shape) == 0 { |
| 136 | + return |
| 137 | + } |
| 138 | + |
| 139 | + d.SeekAbs(8*(8+int64(headerSize)+int64(begin)), func(d *decode.D) { |
| 140 | + var reshape func(d *decode.D, i int) |
| 141 | + reshape = func(d *decode.D, i int) { |
| 142 | + d.FieldArray("data", func(d *decode.D) { |
| 143 | + if i == len(tensorInfo.Shape)-1 { |
| 144 | + for range tensorInfo.Shape[i] { |
| 145 | + decoder(d) |
| 146 | + } |
| 147 | + } else { |
| 148 | + for range tensorInfo.Shape[i] { |
| 149 | + reshape(d, i+1) |
| 150 | + } |
| 151 | + } |
| 152 | + }) |
| 153 | + } |
| 154 | + reshape(d, 0) |
| 155 | + }) |
| 156 | + |
| 157 | + }) |
| 158 | + } |
| 159 | + }) |
| 160 | + |
| 161 | + return nil |
| 162 | +} |
0 commit comments