Merge pull request #1203 from Leowbattle/safetensors

wader · web-flow · commit f36b6e06bf5e · 2025-10-22T13:32:42.000+02:00
safetensors: Add decoder
diff --git a/README.md b/README.md
@@ -141,6 +141,7 @@ prores_frame,
 protobuf_widevine,
 pssh_playready,
 [rtmp](doc/formats.md#rtmp),
+safetensors,
 sll2_packet,
 sll_packet,
 [tap](doc/formats.md#tap),
diff --git a/doc/formats.md b/doc/formats.md
@@ -113,6 +113,7 @@
 |`protobuf_widevine`                                             |Widevine&nbsp;protobuf                                                                                       |<sub>`protobuf`</sub>|
 |`pssh_playready`                                                |PlayReady&nbsp;PSSH                                                                                          |<sub></sub>|
 |[`rtmp`](#rtmp)                                                 |Real-Time&nbsp;Messaging&nbsp;Protocol                                                                       |<sub>`amf0` `mpeg_asc`</sub>|
+|`safetensors`                                                   |SafeTensors                                                                                                  |<sub>`json`</sub>|
 |`sll2_packet`                                                   |Linux&nbsp;cooked&nbsp;capture&nbsp;encapsulation&nbsp;v2                                                    |<sub>`inet_packet`</sub>|
 |`sll_packet`                                                    |Linux&nbsp;cooked&nbsp;capture&nbsp;encapsulation                                                            |<sub>`inet_packet`</sub>|
 |[`tap`](#tap)                                                   |TAP&nbsp;tape&nbsp;format&nbsp;for&nbsp;ZX&nbsp;Spectrum&nbsp;computers                                      |<sub></sub>|
diff --git a/format/all/all.fqtest b/format/all/all.fqtest
@@ -158,6 +158,7 @@ protobuf             Protobuf
 protobuf_widevine    Widevine protobuf
 pssh_playready       PlayReady PSSH
 rtmp                 Real-Time Messaging Protocol
+safetensors          SafeTensors
 sll2_packet          Linux cooked capture encapsulation v2
 sll_packet           Linux cooked capture encapsulation
 tap                  TAP tape format for ZX Spectrum computers
diff --git a/format/all/all.go b/format/all/all.go
@@ -54,6 +54,7 @@ import (
 	_ "github.com/wader/fq/format/protobuf"
 	_ "github.com/wader/fq/format/riff"
 	_ "github.com/wader/fq/format/rtmp"
+	_ "github.com/wader/fq/format/safetensors"
 	_ "github.com/wader/fq/format/tap"
 	_ "github.com/wader/fq/format/tar"
 	_ "github.com/wader/fq/format/text"
diff --git a/format/format.go b/format/format.go
@@ -165,6 +165,7 @@ var (
 	ProtobufWidevine    = &decode.Group{Name: "protobuf_widevine"}
 	PSSH_Playready      = &decode.Group{Name: "pssh_playready"}
 	RTMP                = &decode.Group{Name: "rtmp"}
+	SAFETENSORS         = &decode.Group{Name: "safetensors"}
 	SLL_Packet          = &decode.Group{Name: "sll_packet"}
 	SLL2_Packet         = &decode.Group{Name: "sll2_packet"}
 	TAP                 = &decode.Group{Name: "tap"}
diff --git a/format/safetensors/safetensors.go b/format/safetensors/safetensors.go
@@ -0,0 +1,162 @@
+package safetensors
+
+// https://huggingface.co/docs/safetensors/en/index
+
+import (
+	"fmt"
+	"math"
+	"sort"
+
+	"github.com/wader/fq/format"
+	"github.com/wader/fq/internal/mapstruct"
+	"github.com/wader/fq/pkg/decode"
+	"github.com/wader/fq/pkg/interp"
+	"github.com/wader/fq/pkg/scalar"
+)
+
+var jsonFormat decode.Group // json format group; init passes its address as Out of the format.JSON dependency
+
+type TensorInfo struct { // one tensor's entry in the JSON header
+	Dtype       string `mapstruct:"dtype"`        // element type name, used as the key into dataDecoders
+	Shape       []int  `mapstruct:"shape"`        // dimension sizes, outermost dimension first
+	DataOffsets []int  `mapstruct:"data_offsets"` // needs >= 2 entries; [0] is the data's byte offset past the header
+}
+
+type SafeTensorsHeader struct { // typed view of the JSON header object
+	Tensors  map[string]TensorInfo `mapstruct:",remain"`       // keyed by tensor name; presumably every key not claimed by another field
+	Metadata map[string]any        `mapstruct:"__metadata__"` // value of the "__metadata__" key
+}
+
+// init registers the safetensors format with interp. The header is decoded with the
+// json format, requested here as a dependency whose Out points at jsonFormat.
+func init() {
+	jsonDep := decode.Dependency{Groups: []*decode.Group{format.JSON}, Out: &jsonFormat}
+	safetensorsFormat := &decode.Format{
+		Description:  "SafeTensors",
+		DecodeFn:     decodeSafeTensors,
+		Dependencies: []decode.Dependency{jsonDep},
+	}
+	interp.RegisterFormat(format.SAFETENSORS, safetensorsFormat)
+}
+
+func parseHeader(dv *decode.Value) (*SafeTensorsHeader, error) {
+	actualVal, ok := dv.V.(*scalar.Any)
+	if !ok {
+		return nil, fmt.Errorf("expected scalar.Any, got %T", dv.V)
+	}
+
+	headerMap, ok := actualVal.Actual.(map[string]any)
+	if !ok {
+		return nil, fmt.Errorf("expected map[string]any, got %T", actualVal.Actual)
+	}
+
+	var header SafeTensorsHeader
+	if err := mapstruct.ToStruct(headerMap, &header); err != nil {
+		return nil, fmt.Errorf("failed to parse header: %w", err)
+	}
+
+	return &header, nil
+}
+
+// bfloat16_bits_to_float widens a bfloat16 bit pattern to a float32. bfloat16 is the
+// high half of a float32: same sign and 8 exponent bits, 7 of the 23 fraction bits.
+// https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
+// https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+func bfloat16_bits_to_float(bits uint16) float32 {
+	highHalf := uint32(bits) << 16
+	return math.Float32frombits(highHalf)
+}
+
+// dataDecoders maps a dtype name to a decoder for one element, added as field "x"; BOOL is a whole byte, not one bit.
+var dataDecoders = map[string]func(d *decode.D){
+	"F64":  func(d *decode.D) { d.FieldF64("x") },
+	"F32":  func(d *decode.D) { d.FieldF32("x") },
+	"F16":  func(d *decode.D) { d.FieldF16("x") },
+	"BF16": func(d *decode.D) { d.FieldFltFn("x", func(d *decode.D) float64 { return float64(bfloat16_bits_to_float(uint16(d.U16()))) }) },
+	"I64":  func(d *decode.D) { d.FieldS64("x") },
+	"I32":  func(d *decode.D) { d.FieldS32("x") },
+	"I16":  func(d *decode.D) { d.FieldS16("x") },
+	"I8":   func(d *decode.D) { d.FieldS8("x") },
+	"U64":  func(d *decode.D) { d.FieldU64("x") },
+	"U32":  func(d *decode.D) { d.FieldU32("x") },
+	"U16":  func(d *decode.D) { d.FieldU16("x") },
+	"U8":   func(d *decode.D) { d.FieldU8("x") },
+	"BOOL": func(d *decode.D) { d.FieldBoolFn("x", func(d *decode.D) bool { return d.U8() != 0 }) },
+}
+
+// decodeSafeTensors decodes a little-endian u64 header size, a JSON header of that
+// many bytes, and then each tensor's data, located through its data_offsets entry.
+// Tensors are emitted sorted by name; an empty shape denotes a single scalar element.
+func decodeSafeTensors(d *decode.D) any {
+	d.Endian = decode.LittleEndian
+
+	headerSize := d.FieldU64("header size")
+	// Reject a size larger than the remaining input up front; converting such a
+	// value to an int64 bit count below could otherwise wrap around.
+	if headerSize > uint64(d.BitsLeft())/8 {
+		d.Fatalf("header size %d exceeds remaining input", headerSize)
+	}
+
+	var dv *decode.Value
+	d.LimitedFn(8*int64(headerSize), func(d *decode.D) {
+		dv, _ = d.FieldFormat("header", &jsonFormat, nil)
+	})
+
+	d.FieldStruct("tensors", func(d *decode.D) {
+		header, err := parseHeader(dv)
+		if err != nil {
+			d.Fatalf("failed to parse header: %v", err)
+		}
+
+		// Get tensor names and sort them for deterministic output
+		tensorNames := make([]string, 0, len(header.Tensors))
+		for tensorName := range header.Tensors {
+			tensorNames = append(tensorNames, tensorName)
+		}
+		sort.Strings(tensorNames)
+
+		for _, tensorName := range tensorNames {
+			tensorInfo := header.Tensors[tensorName]
+
+			// Fatalf aborts decoding, so no continue is needed after these checks.
+			decoder, exists := dataDecoders[tensorInfo.Dtype]
+			if !exists {
+				d.Fatalf("unsupported dtype: %s", tensorInfo.Dtype)
+			}
+			if len(tensorInfo.DataOffsets) < 2 {
+				d.Fatalf("invalid data_offsets for tensor %s: %v", tensorName, tensorInfo.DataOffsets)
+			}
+			begin := tensorInfo.DataOffsets[0]
+
+			d.FieldStruct(tensorName, func(d *decode.D) {
+				d.FieldArray("shape", func(d *decode.D) {
+					for _, s := range tensorInfo.Shape {
+						d.FieldValueSint("dim", int64(s))
+					}
+				})
+
+				// begin is a byte offset from the end of the header (8 byte size field + JSON).
+				d.SeekAbs(8*(8+int64(headerSize)+int64(begin)), func(d *decode.D) {
+					// reshape emits one nested "data" array per dimension. At depth
+					// len(Shape) it decodes a single element, which also handles
+					// scalar tensors whose shape is empty.
+					var reshape func(d *decode.D, i int)
+					reshape = func(d *decode.D, i int) {
+						if i == len(tensorInfo.Shape) {
+							decoder(d)
+							return
+						}
+						d.FieldArray("data", func(d *decode.D) {
+							for range tensorInfo.Shape[i] {
+								reshape(d, i+1)
+							}
+						})
+					}
+					reshape(d, 0)
+				})
+			})
+		}
+	})
+
+	return nil
+}
diff --git a/format/safetensors/testdata/gen_test.py b/format/safetensors/testdata/gen_test.py
@@ -0,0 +1,11 @@
+import torch
+from safetensors.torch import save_file
+
+tensors = {
+    "weight1": torch.reshape(torch.arange(12, dtype=torch.float32), (12,)),
+    "weight2": torch.reshape(torch.arange(12, dtype=torch.int64), (3, 4)),
+    "weight3": torch.reshape(torch.arange(12, dtype=torch.float16), (2, 2, 3)),
+    "weight4": torch.reshape(torch.arange(12, dtype=torch.bfloat16), (4, 3)),
+}
+
+save_file(tensors, "format/safetensors/testdata/test.safetensors")
diff --git a/format/safetensors/testdata/test.fqtest b/format/safetensors/testdata/test.fqtest
@@ -0,0 +1,89 @@
+$ fq -d safetensors dv test.safetensors
+     |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|.{}: test.safetensors (safetensors) 0x0-0x1d0 (464)
+0x000|08 01 00 00 00 00 00 00                        |........        |  header size: 264 0x0-0x8 (8)
+     |00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f|0123456789abcdef|
+0x000|                        7b 22 77 65 69 67 68 74|        {"weight|  header: {} (json) 0x8-0x110 (264)
+0x010|32 22 3a 7b 22 64 74 79 70 65 22 3a 22 49 36 34|2":{"dtype":"I64|
+*    |until 0x10f.7 (264)                            |                |
+     |                                               |                |  tensors{}: 0x110-0x1d0 (192)
+     |                                               |                |    weight1{}: 0x110-0x1a0 (144)
+     |                                               |                |      shape[0:1]: 0x110-0x110 (0)
+     |                                               |                |        [0]: 12 dim
+     |                                               |                |      data[0:12]: 0x170-0x1a0 (48)
+0x170|00 00 00 00                                    |....            |        [0]: 0 x 0x170-0x174 (4)
+0x170|            00 00 80 3f                        |    ...?        |        [1]: 1 x 0x174-0x178 (4)
+0x170|                        00 00 00 40            |        ...@    |        [2]: 2 x 0x178-0x17c (4)
+0x170|                                    00 00 40 40|            ..@@|        [3]: 3 x 0x17c-0x180 (4)
+0x180|00 00 80 40                                    |...@            |        [4]: 4 x 0x180-0x184 (4)
+0x180|            00 00 a0 40                        |    ...@        |        [5]: 5 x 0x184-0x188 (4)
+0x180|                        00 00 c0 40            |        ...@    |        [6]: 6 x 0x188-0x18c (4)
+0x180|                                    00 00 e0 40|            ...@|        [7]: 7 x 0x18c-0x190 (4)
+0x190|00 00 00 41                                    |...A            |        [8]: 8 x 0x190-0x194 (4)
+0x190|            00 00 10 41                        |    ...A        |        [9]: 9 x 0x194-0x198 (4)
+0x190|                        00 00 20 41            |        .. A    |        [10]: 10 x 0x198-0x19c (4)
+0x190|                                    00 00 30 41|            ..0A|        [11]: 11 x 0x19c-0x1a0 (4)
+     |                                               |                |    weight2{}: 0x110-0x170 (96)
+     |                                               |                |      shape[0:2]: 0x110-0x110 (0)
+     |                                               |                |        [0]: 3 dim
+     |                                               |                |        [1]: 4 dim
+     |                                               |                |      data[0:3]: 0x110-0x170 (96)
+     |                                               |                |        [0][0:4]: data 0x110-0x130 (32)
+0x110|00 00 00 00 00 00 00 00                        |........        |          [0]: 0 x 0x110-0x118 (8)
+0x110|                        01 00 00 00 00 00 00 00|        ........|          [1]: 1 x 0x118-0x120 (8)
+0x120|02 00 00 00 00 00 00 00                        |........        |          [2]: 2 x 0x120-0x128 (8)
+0x120|                        03 00 00 00 00 00 00 00|        ........|          [3]: 3 x 0x128-0x130 (8)
+     |                                               |                |        [1][0:4]: data 0x130-0x150 (32)
+0x130|04 00 00 00 00 00 00 00                        |........        |          [0]: 4 x 0x130-0x138 (8)
+0x130|                        05 00 00 00 00 00 00 00|        ........|          [1]: 5 x 0x138-0x140 (8)
+0x140|06 00 00 00 00 00 00 00                        |........        |          [2]: 6 x 0x140-0x148 (8)
+0x140|                        07 00 00 00 00 00 00 00|        ........|          [3]: 7 x 0x148-0x150 (8)
+     |                                               |                |        [2][0:4]: data 0x150-0x170 (32)
+0x150|08 00 00 00 00 00 00 00                        |........        |          [0]: 8 x 0x150-0x158 (8)
+0x150|                        09 00 00 00 00 00 00 00|        ........|          [1]: 9 x 0x158-0x160 (8)
+0x160|0a 00 00 00 00 00 00 00                        |........        |          [2]: 10 x 0x160-0x168 (8)
+0x160|                        0b 00 00 00 00 00 00 00|        ........|          [3]: 11 x 0x168-0x170 (8)
+     |                                               |                |    weight3{}: 0x110-0x1d0 (192)
+     |                                               |                |      shape[0:3]: 0x110-0x110 (0)
+     |                                               |                |        [0]: 2 dim
+     |                                               |                |        [1]: 2 dim
+     |                                               |                |        [2]: 3 dim
+     |                                               |                |      data[0:2]: 0x1b8-0x1d0 (24)
+     |                                               |                |        [0][0:2]: data 0x1b8-0x1c4 (12)
+     |                                               |                |          [0][0:3]: data 0x1b8-0x1be (6)
+0x1b0|                        00 00                  |        ..      |            [0]: 0 x 0x1b8-0x1ba (2)
+0x1b0|                              00 3c            |          .<    |            [1]: 1 x 0x1ba-0x1bc (2)
+0x1b0|                                    00 40      |            .@  |            [2]: 2 x 0x1bc-0x1be (2)
+     |                                               |                |          [1][0:3]: data 0x1be-0x1c4 (6)
+0x1b0|                                          00 42|              .B|            [0]: 3 x 0x1be-0x1c0 (2)
+0x1c0|00 44                                          |.D              |            [1]: 4 x 0x1c0-0x1c2 (2)
+0x1c0|      00 45                                    |  .E            |            [2]: 5 x 0x1c2-0x1c4 (2)
+     |                                               |                |        [1][0:2]: data 0x1c4-0x1d0 (12)
+     |                                               |                |          [0][0:3]: data 0x1c4-0x1ca (6)
+0x1c0|            00 46                              |    .F          |            [0]: 6 x 0x1c4-0x1c6 (2)
+0x1c0|                  00 47                        |      .G        |            [1]: 7 x 0x1c6-0x1c8 (2)
+0x1c0|                        00 48                  |        .H      |            [2]: 8 x 0x1c8-0x1ca (2)
+     |                                               |                |          [1][0:3]: data 0x1ca-0x1d0 (6)
+0x1c0|                              80 48            |          .H    |            [0]: 9 x 0x1ca-0x1cc (2)
+0x1c0|                                    00 49      |            .I  |            [1]: 10 x 0x1cc-0x1ce (2)
+0x1c0|                                          80 49|              .I|            [2]: 11 x 0x1ce-0x1d0 (2)
+     |                                               |                |    weight4{}: 0x110-0x1b8 (168)
+     |                                               |                |      shape[0:2]: 0x110-0x110 (0)
+     |                                               |                |        [0]: 4 dim
+     |                                               |                |        [1]: 3 dim
+     |                                               |                |      data[0:4]: 0x1a0-0x1b8 (24)
+     |                                               |                |        [0][0:3]: data 0x1a0-0x1a6 (6)
+0x1a0|00 00                                          |..              |          [0]: 0 x 0x1a0-0x1a2 (2)
+0x1a0|      80 3f                                    |  .?            |          [1]: 1 x 0x1a2-0x1a4 (2)
+0x1a0|            00 40                              |    .@          |          [2]: 2 x 0x1a4-0x1a6 (2)
+     |                                               |                |        [1][0:3]: data 0x1a6-0x1ac (6)
+0x1a0|                  40 40                        |      @@        |          [0]: 3 x 0x1a6-0x1a8 (2)
+0x1a0|                        80 40                  |        .@      |          [1]: 4 x 0x1a8-0x1aa (2)
+0x1a0|                              a0 40            |          .@    |          [2]: 5 x 0x1aa-0x1ac (2)
+     |                                               |                |        [2][0:3]: data 0x1ac-0x1b2 (6)
+0x1a0|                                    c0 40      |            .@  |          [0]: 6 x 0x1ac-0x1ae (2)
+0x1a0|                                          e0 40|              .@|          [1]: 7 x 0x1ae-0x1b0 (2)
+0x1b0|00 41                                          |.A              |          [2]: 8 x 0x1b0-0x1b2 (2)
+     |                                               |                |        [3][0:3]: data 0x1b2-0x1b8 (6)
+0x1b0|      10 41                                    |  .A            |          [0]: 9 x 0x1b2-0x1b4 (2)
+0x1b0|            20 41                              |     A          |          [1]: 10 x 0x1b4-0x1b6 (2)
+0x1b0|                  30 41                        |      0A        |          [2]: 11 x 0x1b6-0x1b8 (2)
diff --git a/format/safetensors/testdata/test.safetensors b/format/safetensors/testdata/test.safetensors