From 179af512eaad61c69d9150b052182620dea8f941 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Sun, 5 Jan 2025 20:49:22 +0100
Subject: [PATCH 1/9] Add unsafe little endian loaders

Benchmarks pending
---
 .github/workflows/go.yml       | 48 ++++++++++++++-----------
 flate/fast_encoder.go          |  7 ++--
 flate/fuzz_test.go             |  1 -
 internal/le/le.go              |  5 +++
 internal/le/unsafe_disabled.go | 27 ++++++++++++++
 internal/le/unsafe_enabled.go  | 37 ++++++++++++++++++++
 s2/encode_all.go               |  6 ++--
 zstd/_generate/gen.go          |  4 +--
 zstd/bitreader.go              | 37 ++++++++++----------
 zstd/decoder.go                |  1 +
 zstd/matchlen_generic.go       |  5 +--
 zstd/seqdec.go                 |  2 +-
 zstd/seqdec_amd64.s            | 64 +++++++++++++++++-----------------
 zstd/seqdec_generic.go         |  2 +-
 zstd/zstd.go                   |  7 ++--
 15 files changed, 166 insertions(+), 87 deletions(-)
 create mode 100644 internal/le/le.go
 create mode 100644 internal/le/unsafe_disabled.go
 create mode 100644 internal/le/unsafe_enabled.go

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index ea71ec2f28..3b04386f22 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -35,15 +35,21 @@ jobs:
     - name: Test Noasm
       run: go test -tags=noasm ./...
 
+    - name: Test Nounsafe
+      run: go test -tags=nounsafe ./...
+
+    - name: Test Nounsafe, noasm
+      run: go test -tags=nounsafe,noasm ./...
+
     - name: Test Race 1 CPU
       env:
         CGO_ENABLED: 1
-      run: go test -cpu=1 -short -race -v ./...
+      run: go test -cpu=1 -short -race -tags=nounsafe -v ./...
 
     - name: Test Race 4 CPU
       env:
         CGO_ENABLED: 1
-      run: go test -cpu=4 -short -race -v ./...
+      run: go test -cpu=4 -short -race -tags=nounsafe -v ./...
 
   generate:
     strategy:
@@ -122,22 +128,22 @@ jobs:
         uses: actions/checkout@v4
 
       - name: S2/FuzzDictBlocks
-        run: go test -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
 
       - name: S2/FuzzEncodingBlocks
-        run: go test -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
       - name: S2/FuzzLZ4Block
-        run: go test -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
       - name: S2/FuzzDictBlocks/noasm
-        run: go test -tags=noasm -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
 
       - name: S2/FuzzEncodingBlocks/noasm
-        run: go test -tags=noasm -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
       - name: S2/FuzzLZ4Block/noasm
-        run: go test -tags=noasm -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
   fuzz-zstd:
     env:
@@ -153,28 +159,28 @@ jobs:
         uses: actions/checkout@v4
 
       - name: zstd/FuzzDecodeAll
-        run: go test -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzDecAllNoBMI2
-        run: go test -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzDecoder
-        run: go test -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzNoBMI2Dec
-        run: go test -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzEncoding
-        run: cd zstd&&go test -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
+        run: cd zstd&&go test -tags=nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
 
       - name: zstd/FuzzDecodeAll/noasm
-        run: go test -tags=noasm -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzDecoder/noasm
-        run: go test -tags=noasm -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
       - name: zstd/FuzzEncoding/noasm
-        run: cd zstd&&go test -tags=noasm -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
+        run: cd zstd&&go test -tags=noasm,nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
 
       - name: zstd/FuzzEncodingBest
         run: cd zstd&&go test -run=none -fuzz=FuzzEncoding -fuzztime=25000x -test.fuzzminimizetime=10ms -fuzz-start=4&&cd ..
@@ -193,16 +199,16 @@ jobs:
         uses: actions/checkout@v4
 
       - name: flate/FuzzEncoding
-        run: go test -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
 
       - name: flate/FuzzEncoding/noasm
-        run: go test -run=none -tags=noasm -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
+        run: go test -run=none -tags=noasm,nounsafe -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
 
       - name: zip/FuzzReader
-        run: go test -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/.
 
       - name: fse/FuzzCompress
-        run: go test -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
 
       - name: fse/FuzzDecompress
-        run: go test -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
\ No newline at end of file
+        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
index c8124b5c49..433977767b 100644
--- a/flate/fast_encoder.go
+++ b/flate/fast_encoder.go
@@ -6,8 +6,9 @@
 package flate
 
 import (
-	"encoding/binary"
 	"fmt"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 type fastEnc interface {
@@ -58,11 +59,11 @@ const (
 )
 
 func load3232(b []byte, i int32) uint32 {
-	return binary.LittleEndian.Uint32(b[i:])
+	return le.Load32(b, i)
 }
 
 func load6432(b []byte, i int32) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
+	return le.Load64(b, i)
 }
 
 type tableEntry struct {
diff --git a/flate/fuzz_test.go b/flate/fuzz_test.go
index 5529a78be0..8b02f460cf 100644
--- a/flate/fuzz_test.go
+++ b/flate/fuzz_test.go
@@ -1,5 +1,4 @@
 //go:build go1.18
-// +build go1.18
 
 package flate
 
diff --git a/internal/le/le.go b/internal/le/le.go
new file mode 100644
index 0000000000..e54909e16f
--- /dev/null
+++ b/internal/le/le.go
@@ -0,0 +1,5 @@
+package le
+
+type Indexer interface {
+	int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64
+}
diff --git a/internal/le/unsafe_disabled.go b/internal/le/unsafe_disabled.go
new file mode 100644
index 0000000000..f9d81b17c1
--- /dev/null
+++ b/internal/le/unsafe_disabled.go
@@ -0,0 +1,27 @@
+//go:build !(amd64 || arm64 || ppc64le || riscv64) || nounsafe || purego || appengine
+
+package le
+
+import (
+	"encoding/binary"
+)
+
+func Load16[I Indexer](b []byte, i I) uint16 {
+	return binary.LittleEndian.Uint16(b[i:])
+}
+
+func Load32[I Indexer](b []byte, i I) uint32 {
+	return binary.LittleEndian.Uint32(b[i:])
+}
+
+func Load64[I Indexer](b []byte, i I) uint64 {
+	return binary.LittleEndian.Uint64(b[i:])
+}
+
+func Store16(b []byte, v uint16) {
+	binary.LittleEndian.PutUint16(b, v)
+}
+
+func Store32(b []byte, v uint32) {
+	binary.LittleEndian.PutUint32(b, v)
+}
diff --git a/internal/le/unsafe_enabled.go b/internal/le/unsafe_enabled.go
new file mode 100644
index 0000000000..18f3c4d102
--- /dev/null
+++ b/internal/le/unsafe_enabled.go
@@ -0,0 +1,37 @@
+// We enable 64 bit LE platforms:
+
+//go:build (amd64 || arm64 || ppc64le || riscv64) && !nounsafe && !purego && !appengine
+
+package le
+
+import (
+	"unsafe"
+)
+
+func Load16[I Indexer](b []byte, i I) uint16 {
+	//return binary.LittleEndian.Uint16(b[i:])
+	//return *(*uint16)(unsafe.Pointer(&b[i]))
+	return *(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+}
+
+func Load32[I Indexer](b []byte, i I) uint32 {
+	//return binary.LittleEndian.Uint32(b[i:])
+	//return *(*uint32)(unsafe.Pointer(&b[i]))
+	return *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+}
+
+func Load64[I Indexer](b []byte, i I) uint64 {
+	//return binary.LittleEndian.Uint64(b[i:])
+	//return *(*uint64)(unsafe.Pointer(&b[i]))
+	return *(*uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+}
+
+func Store16(b []byte, v uint16) {
+	//binary.LittleEndian.PutUint16(b, v)
+	*(*uint16)(unsafe.Pointer(&b[0])) = v
+}
+
+func Store32(b []byte, v uint32) {
+	//binary.LittleEndian.PutUint32(b, v)
+	*(*uint32)(unsafe.Pointer(&b[0])) = v
+}
diff --git a/s2/encode_all.go b/s2/encode_all.go
index 9977045696..c56ce52e7d 100644
--- a/s2/encode_all.go
+++ b/s2/encode_all.go
@@ -10,14 +10,16 @@ import (
 	"encoding/binary"
 	"fmt"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 func load32(b []byte, i int) uint32 {
-	return binary.LittleEndian.Uint32(b[i:])
+	return le.Load32(b, i)
 }
 
 func load64(b []byte, i int) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
+	return le.Load64(b, i)
 }
 
 // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go
index 4aa9ffdde7..c872cb1053 100644
--- a/zstd/_generate/gen.go
+++ b/zstd/_generate/gen.go
@@ -157,7 +157,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
 		Load(br.Field("value"), brValue)
 		Load(br.Field("bitsRead"), brBitsRead)
 		Load(br.Field("in").Base(), brPointer)
-		Load(br.Field("in").Len(), brOffset)
+		Load(br.Field("off"), brOffset)
 		ADDQ(brOffset, brPointer) // Add current offset to read pointer.
 		MOVQ(brPointer, brPointerStash)
 	}
@@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
 	br := Dereference(Param("br"))
 	Store(brValue, br.Field("value"))
 	Store(brBitsRead.As8(), br.Field("bitsRead"))
-	Store(brOffset, br.Field("in").Len())
+	Store(brOffset, br.Field("off"))
 
 	if !o.useSeqs {
 		Comment("Update the context")
diff --git a/zstd/bitreader.go b/zstd/bitreader.go
index 25ca983941..34e285fb73 100644
--- a/zstd/bitreader.go
+++ b/zstd/bitreader.go
@@ -5,11 +5,12 @@
 package zstd
 
 import (
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // bitReader reads a bitstream in reverse.
@@ -18,6 +19,7 @@ import (
 type bitReader struct {
 	in       []byte
 	value    uint64 // Maybe use [16]byte, but shifting is awkward.
+	off      int    // offset where next read should end
 	bitsRead uint8
 }
 
@@ -32,6 +34,7 @@ func (b *bitReader) init(in []byte) error {
 	if v == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
+	b.off = len(in)
 	b.bitsRead = 64
 	b.value = 0
 	if len(in) >= 8 {
@@ -67,18 +70,15 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	v := b.in[len(b.in)-4:]
-	b.in = b.in[:len(b.in)-4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
+	b.off -= 4
+	b.value = (b.value << 32) | uint64(le.Load32(b.in, b.off))
 	b.bitsRead -= 32
 }
 
 // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
 func (b *bitReader) fillFastStart() {
-	v := b.in[len(b.in)-8:]
-	b.in = b.in[:len(b.in)-8]
-	b.value = binary.LittleEndian.Uint64(v)
+	b.off -= 8
+	b.value = le.Load64(b.in, b.off)
 	b.bitsRead = 0
 }
 
@@ -87,25 +87,23 @@ func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
-	if len(b.in) >= 4 {
-		v := b.in[len(b.in)-4:]
-		b.in = b.in[:len(b.in)-4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
+	if b.off >= 4 {
+		b.off -= 4
+		b.value = (b.value << 32) | uint64(le.Load32(b.in, b.off))
 		b.bitsRead -= 32
 		return
 	}
 
-	b.bitsRead -= uint8(8 * len(b.in))
-	for len(b.in) > 0 {
-		b.value = (b.value << 8) | uint64(b.in[len(b.in)-1])
-		b.in = b.in[:len(b.in)-1]
+	b.bitsRead -= uint8(8 * b.off)
+	for b.off > 0 {
+		b.off -= 1
+		b.value = (b.value << 8) | uint64(b.in[b.off])
 	}
 }
 
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return len(b.in) == 0 && b.bitsRead >= 64
+	return b.off == 0 && b.bitsRead >= 64
 }
 
 // overread returns true if more bits have been requested than is on the stream.
@@ -115,13 +113,14 @@ func (b *bitReader) overread() bool {
 
 // remain returns the number of bits remaining.
 func (b *bitReader) remain() uint {
-	return 8*uint(len(b.in)) + 64 - uint(b.bitsRead)
+	return 8*uint(b.off) + 64 - uint(b.bitsRead)
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
+	b.off = 0
 	if !b.finished() {
 		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
 	}
diff --git a/zstd/decoder.go b/zstd/decoder.go
index 0170da828c..ffdb889f7f 100644
--- a/zstd/decoder.go
+++ b/zstd/decoder.go
@@ -323,6 +323,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 		frame.bBuf = nil
 		if frame.history.decoders.br != nil {
 			frame.history.decoders.br.in = nil
+			frame.history.decoders.br.off = 0
 		}
 		d.decoders <- block
 	}()
diff --git a/zstd/matchlen_generic.go b/zstd/matchlen_generic.go
index 57b9c31c02..741e784491 100644
--- a/zstd/matchlen_generic.go
+++ b/zstd/matchlen_generic.go
@@ -7,15 +7,16 @@
 package zstd
 
 import (
-	"encoding/binary"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // matchLen returns the maximum common prefix length of a and b.
 // a must be the shortest of the two.
 func matchLen(a, b []byte) (n int) {
 	for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
-		diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
+		diff := le.Load64(a, 0) ^ le.Load64(b, 0)
 		if diff != 0 {
 			return n + bits.TrailingZeros64(diff)>>3
 		}
diff --git a/zstd/seqdec.go b/zstd/seqdec.go
index d7fe6d82d9..1d69e6d72f 100644
--- a/zstd/seqdec.go
+++ b/zstd/seqdec.go
@@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
 			return io.ErrUnexpectedEOF
 		}
 		var ll, mo, ml int
-		if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 
diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s
index f5591fa1e8..a708ca6d3d 100644
--- a/zstd/seqdec_amd64.s
+++ b/zstd/seqdec_amd64.s
@@ -7,9 +7,9 @@
 TEXT ·sequenceDecs_decode_amd64(SB), $8-32
 	MOVQ    br+8(FP), CX
 	MOVQ    24(CX), DX
-	MOVBQZX 32(CX), BX
+	MOVBQZX 40(CX), BX
 	MOVQ    (CX), AX
-	MOVQ    8(CX), SI
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -299,8 +299,8 @@ sequenceDecs_decode_amd64_match_len_ofs_ok:
 	MOVQ R13, 160(AX)
 	MOVQ br+8(FP), AX
 	MOVQ DX, 24(AX)
-	MOVB BL, 32(AX)
-	MOVQ SI, 8(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 32(AX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -335,9 +335,9 @@ error_overread:
 TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
 	MOVQ    br+8(FP), CX
 	MOVQ    24(CX), DX
-	MOVBQZX 32(CX), BX
+	MOVBQZX 40(CX), BX
 	MOVQ    (CX), AX
-	MOVQ    8(CX), SI
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -598,8 +598,8 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok:
 	MOVQ R13, 160(AX)
 	MOVQ br+8(FP), AX
 	MOVQ DX, 24(AX)
-	MOVB BL, 32(AX)
-	MOVQ SI, 8(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 32(AX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -634,9 +634,9 @@ error_overread:
 TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
 	MOVQ    br+8(FP), BX
 	MOVQ    24(BX), AX
-	MOVBQZX 32(BX), DX
+	MOVBQZX 40(BX), DX
 	MOVQ    (BX), CX
-	MOVQ    8(BX), BX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -884,8 +884,8 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok:
 	MOVQ R12, 160(CX)
 	MOVQ br+8(FP), CX
 	MOVQ AX, 24(CX)
-	MOVB DL, 32(CX)
-	MOVQ BX, 8(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 32(CX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -920,9 +920,9 @@ error_overread:
 TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
 	MOVQ    br+8(FP), BX
 	MOVQ    24(BX), AX
-	MOVBQZX 32(BX), DX
+	MOVBQZX 40(BX), DX
 	MOVQ    (BX), CX
-	MOVQ    8(BX), BX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -1141,8 +1141,8 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
 	MOVQ R12, 160(CX)
 	MOVQ br+8(FP), CX
 	MOVQ AX, 24(CX)
-	MOVB DL, 32(CX)
-	MOVQ BX, 8(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 32(CX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -1787,9 +1787,9 @@ empty_seqs:
 TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
 	MOVQ    br+8(FP), CX
 	MOVQ    24(CX), DX
-	MOVBQZX 32(CX), BX
+	MOVBQZX 40(CX), BX
 	MOVQ    (CX), AX
-	MOVQ    8(CX), SI
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -2281,8 +2281,8 @@ handle_loop:
 loop_finished:
 	MOVQ br+8(FP), AX
 	MOVQ DX, 24(AX)
-	MOVB BL, 32(AX)
-	MOVQ SI, 8(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 32(AX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -2349,9 +2349,9 @@ error_not_enough_space:
 TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
 	MOVQ    br+8(FP), BX
 	MOVQ    24(BX), AX
-	MOVBQZX 32(BX), DX
+	MOVBQZX 40(BX), DX
 	MOVQ    (BX), CX
-	MOVQ    8(BX), BX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -2801,8 +2801,8 @@ handle_loop:
 loop_finished:
 	MOVQ br+8(FP), CX
 	MOVQ AX, 24(CX)
-	MOVB DL, 32(CX)
-	MOVQ BX, 8(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 32(CX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -2869,9 +2869,9 @@ error_not_enough_space:
 TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
 	MOVQ    br+8(FP), CX
 	MOVQ    24(CX), DX
-	MOVBQZX 32(CX), BX
+	MOVBQZX 40(CX), BX
 	MOVQ    (CX), AX
-	MOVQ    8(CX), SI
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -3465,8 +3465,8 @@ handle_loop:
 loop_finished:
 	MOVQ br+8(FP), AX
 	MOVQ DX, 24(AX)
-	MOVB BL, 32(AX)
-	MOVQ SI, 8(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 32(AX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -3533,9 +3533,9 @@ error_not_enough_space:
 TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
 	MOVQ    br+8(FP), BX
 	MOVQ    24(BX), AX
-	MOVBQZX 32(BX), DX
+	MOVBQZX 40(BX), DX
 	MOVQ    (BX), CX
-	MOVQ    8(BX), BX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -4087,8 +4087,8 @@ handle_loop:
 loop_finished:
 	MOVQ br+8(FP), CX
 	MOVQ AX, 24(CX)
-	MOVB DL, 32(CX)
-	MOVQ BX, 8(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 32(CX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
diff --git a/zstd/seqdec_generic.go b/zstd/seqdec_generic.go
index 2fb35b788c..ac2a80d291 100644
--- a/zstd/seqdec_generic.go
+++ b/zstd/seqdec_generic.go
@@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	}
 	for i := range seqs {
 		var ll, mo, ml int
-		if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 
diff --git a/zstd/zstd.go b/zstd/zstd.go
index 066bef2a4f..6252b46ae6 100644
--- a/zstd/zstd.go
+++ b/zstd/zstd.go
@@ -5,10 +5,11 @@ package zstd
 
 import (
 	"bytes"
-	"encoding/binary"
 	"errors"
 	"log"
 	"math"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // enable debug printing
@@ -110,11 +111,11 @@ func printf(format string, a ...interface{}) {
 }
 
 func load3232(b []byte, i int32) uint32 {
-	return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:])
+	return le.Load32(b, i)
 }
 
 func load6432(b []byte, i int32) uint64 {
-	return binary.LittleEndian.Uint64(b[:len(b):len(b)][i:])
+	return le.Load64(b, i)
 }
 
 type byter interface {

From fef4be656d453b33f0e3e401d4e677ab2d95c56a Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Sun, 5 Jan 2025 21:08:12 +0100
Subject: [PATCH 2/9] Try new test matrix

---
 .github/workflows/go.yml | 89 +++++++++++++++++-----------------------
 zstd/bitreader.go        | 32 +++++++--------
 zstd/decoder.go          |  2 +-
 zstd/seqdec.go           |  2 +-
 zstd/seqdec_generic.go   |  2 +-
 5 files changed, 56 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 3b04386f22..02b1285b43 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -32,14 +32,14 @@ jobs:
     - name: Test
       run: go test ./...
 
-    - name: Test Noasm
+    - name: Test No-asm
       run: go test -tags=noasm ./...
 
-    - name: Test Nounsafe
+    - name: Test No-unsafe
       run: go test -tags=nounsafe ./...
 
-    - name: Test Nounsafe, noasm
-      run: go test -tags=nounsafe,noasm ./...
+    - name: Test No-unsafe, noasm
+      run: go test -tags="nounsafe,noasm" ./...
 
     - name: Test Race 1 CPU
       env:
@@ -118,6 +118,9 @@ jobs:
     env:
       CGO_ENABLED: 0
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        tags: [ 'nounsafe', '"noasm,nounsafe"' ]
     steps:
       - name: Set up Go
         uses: actions/setup-go@v5.2.0
@@ -127,28 +130,23 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: S2/FuzzDictBlocks
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
-
-      - name: S2/FuzzEncodingBlocks
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
-
-      - name: S2/FuzzLZ4Block
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+      - name: S2/FuzzDictBlocks/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
 
-      - name: S2/FuzzDictBlocks/noasm
-        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/.
+      - name: S2/FuzzEncodingBlocks/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
-      - name: S2/FuzzEncodingBlocks/noasm
-        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
+      - name: S2/FuzzLZ4Block/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
-      - name: S2/FuzzLZ4Block/noasm
-        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/.
 
   fuzz-zstd:
     env:
       CGO_ENABLED: 0
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        tags: [ 'nounsafe', '"noasm,nounsafe"' ]
     steps:
       - name: Set up Go
         uses: actions/setup-go@v5.2.0
@@ -158,57 +156,44 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: zstd/FuzzDecodeAll
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
-
-      - name: zstd/FuzzDecAllNoBMI2
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+      - name: zstd/FuzzDecodeAll/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
-      - name: zstd/FuzzDecoder
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+      - name: zstd/FuzzDecAllNoBMI2/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
-      - name: zstd/FuzzNoBMI2Dec
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
+      - name: zstd/FuzzDecoder/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
-      - name: zstd/FuzzEncoding
-        run: cd zstd&&go test -tags=nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
+      - name: zstd/FuzzNoBMI2Dec/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
 
-      - name: zstd/FuzzDecodeAll/noasm
-        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
-
-      - name: zstd/FuzzDecoder/noasm
-        run: go test -tags=noasm,nounsafe -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/.
-
-      - name: zstd/FuzzEncoding/noasm
-        run: cd zstd&&go test -tags=noasm,nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
-
-      - name: zstd/FuzzEncodingBest
-        run: cd zstd&&go test -run=none -fuzz=FuzzEncoding -fuzztime=25000x -test.fuzzminimizetime=10ms -fuzz-start=4&&cd ..
+      - name: zstd/FuzzEncoding/${{ matrix.tags }}
+        run: cd zstd&&go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd ..
 
   fuzz-other:
     env:
       CGO_ENABLED: 0
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        tags: [ 'nounsafe', '"noasm,nounsafe"' ]
     steps:
       - name: Set up Go
         uses: actions/setup-go@v5.2.0
         with:
           go-version: 1.23.x
-
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: flate/FuzzEncoding
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
-
-      - name: flate/FuzzEncoding/noasm
-        run: go test -run=none -tags=noasm,nounsafe -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
+      - name: flate/FuzzEncoding/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/.
 
-      - name: zip/FuzzReader
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/.
+      - name: zip/FuzzReader/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/.
 
-      - name: fse/FuzzCompress
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
+      - name: fse/FuzzCompress/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
 
-      - name: fse/FuzzDecompress
-        run: go test -tags=nounsafe -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
+      - name: fse/FuzzDecompress/${{ matrix.tags }}
+        run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/.
diff --git a/zstd/bitreader.go b/zstd/bitreader.go
index 34e285fb73..d41e3e1709 100644
--- a/zstd/bitreader.go
+++ b/zstd/bitreader.go
@@ -19,7 +19,7 @@ import (
 type bitReader struct {
 	in       []byte
 	value    uint64 // Maybe use [16]byte, but shifting is awkward.
-	off      int    // offset where next read should end
+	cursor   int    // offset where next read should end
 	bitsRead uint8
 }
 
@@ -34,7 +34,7 @@ func (b *bitReader) init(in []byte) error {
 	if v == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
-	b.off = len(in)
+	b.cursor = len(in)
 	b.bitsRead = 64
 	b.value = 0
 	if len(in) >= 8 {
@@ -70,15 +70,15 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	b.off -= 4
-	b.value = (b.value << 32) | uint64(le.Load32(b.in, b.off))
+	b.cursor -= 4
+	b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
 	b.bitsRead -= 32
 }
 
 // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
 func (b *bitReader) fillFastStart() {
-	b.off -= 8
-	b.value = le.Load64(b.in, b.off)
+	b.cursor -= 8
+	b.value = le.Load64(b.in, b.cursor)
 	b.bitsRead = 0
 }
 
@@ -87,23 +87,23 @@ func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
-	if b.off >= 4 {
-		b.off -= 4
-		b.value = (b.value << 32) | uint64(le.Load32(b.in, b.off))
+	if b.cursor >= 4 {
+		b.cursor -= 4
+		b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
 		b.bitsRead -= 32
 		return
 	}
 
-	b.bitsRead -= uint8(8 * b.off)
-	for b.off > 0 {
-		b.off -= 1
-		b.value = (b.value << 8) | uint64(b.in[b.off])
+	b.bitsRead -= uint8(8 * b.cursor)
+	for b.cursor > 0 {
+		b.cursor -= 1
+		b.value = (b.value << 8) | uint64(b.in[b.cursor])
 	}
 }
 
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.cursor == 0 && b.bitsRead >= 64
 }
 
 // overread returns true if more bits have been requested than is on the stream.
@@ -113,14 +113,14 @@ func (b *bitReader) overread() bool {
 
 // remain returns the number of bits remaining.
 func (b *bitReader) remain() uint {
-	return 8*uint(b.off) + 64 - uint(b.bitsRead)
+	return 8*uint(b.cursor) + 64 - uint(b.bitsRead)
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
-	b.off = 0
+	b.cursor = 0
 	if !b.finished() {
 		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
 	}
diff --git a/zstd/decoder.go b/zstd/decoder.go
index ffdb889f7f..ea2a19376c 100644
--- a/zstd/decoder.go
+++ b/zstd/decoder.go
@@ -323,7 +323,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 		frame.bBuf = nil
 		if frame.history.decoders.br != nil {
 			frame.history.decoders.br.in = nil
-			frame.history.decoders.br.off = 0
+			frame.history.decoders.br.cursor = 0
 		}
 		d.decoders <- block
 	}()
diff --git a/zstd/seqdec.go b/zstd/seqdec.go
index 1d69e6d72f..9a7de82f9e 100644
--- a/zstd/seqdec.go
+++ b/zstd/seqdec.go
@@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
 			return io.ErrUnexpectedEOF
 		}
 		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 
diff --git a/zstd/seqdec_generic.go b/zstd/seqdec_generic.go
index ac2a80d291..7cec2197cd 100644
--- a/zstd/seqdec_generic.go
+++ b/zstd/seqdec_generic.go
@@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
 	}
 	for i := range seqs {
 		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 

From 1471edf4c6716f01721c1c501ad9ac1647e5a7a9 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Mon, 6 Jan 2025 10:33:52 +0100
Subject: [PATCH 3/9] Fix field name.

---
 zstd/_generate/gen.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go
index c872cb1053..1554543b15 100644
--- a/zstd/_generate/gen.go
+++ b/zstd/_generate/gen.go
@@ -157,7 +157,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
 		Load(br.Field("value"), brValue)
 		Load(br.Field("bitsRead"), brBitsRead)
 		Load(br.Field("in").Base(), brPointer)
-		Load(br.Field("off"), brOffset)
+		Load(br.Field("cursor"), brOffset)
 		ADDQ(brOffset, brPointer) // Add current offset to read pointer.
 		MOVQ(brPointer, brPointerStash)
 	}
@@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute
 	br := Dereference(Param("br"))
 	Store(brValue, br.Field("value"))
 	Store(brBitsRead.As8(), br.Field("bitsRead"))
-	Store(brOffset, br.Field("off"))
+	Store(brOffset, br.Field("cursor"))
 
 	if !o.useSeqs {
 		Comment("Update the context")

From bd6469850d5396bcaf00deb6f3350b6077a761cf Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Sun, 12 Jan 2025 11:47:20 +0100
Subject: [PATCH 4/9] Use in huff0, s2 go decoder.

---
 huff0/bitreader.go       | 23 +++++++++--------------
 huff0/decompress_test.go |  8 ++++----
 s2/decode_other.go       | 26 +++++++++++---------------
 zstd/seqenc.go           |  2 --
 4 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/huff0/bitreader.go b/huff0/bitreader.go
index 6686d7371b..bfc7a523de 100644
--- a/huff0/bitreader.go
+++ b/huff0/bitreader.go
@@ -6,10 +6,11 @@
 package huff0
 
 import (
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // bitReader reads a bitstream in reverse.
@@ -66,8 +67,7 @@ func (b *bitReaderBytes) fillFast() {
 	}
 
 	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	low := le.Load32(b.in, b.off-4)
 	b.value |= uint64(low) << (b.bitsRead - 32)
 	b.bitsRead -= 32
 	b.off -= 4
@@ -76,7 +76,7 @@ func (b *bitReaderBytes) fillFast() {
 // fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read.
 func (b *bitReaderBytes) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.value = le.Load64(b.in, b.off-8)
 	b.bitsRead = 0
 	b.off -= 8
 }
@@ -86,9 +86,8 @@ func (b *bitReaderBytes) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
-	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	if b.off >= 4 {
+		low := le.Load32(b.in, b.off-4)
 		b.value |= uint64(low) << (b.bitsRead - 32)
 		b.bitsRead -= 32
 		b.off -= 4
@@ -175,9 +174,7 @@ func (b *bitReaderShifted) fillFast() {
 		return
 	}
 
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	low := le.Load32(b.in, b.off-4)
 	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
 	b.bitsRead -= 32
 	b.off -= 4
@@ -185,8 +182,7 @@ func (b *bitReaderShifted) fillFast() {
 
 // fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read.
 func (b *bitReaderShifted) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.value = le.Load64(b.in, b.off-8)
 	b.bitsRead = 0
 	b.off -= 8
 }
@@ -197,8 +193,7 @@ func (b *bitReaderShifted) fill() {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		low := le.Load32(b.in, b.off-4)
 		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
 		b.bitsRead -= 32
 		b.off -= 4
diff --git a/huff0/decompress_test.go b/huff0/decompress_test.go
index fe23514fad..47123058ff 100644
--- a/huff0/decompress_test.go
+++ b/huff0/decompress_test.go
@@ -91,7 +91,7 @@ func TestDecompress1X(t *testing.T) {
 					t.Log(string(dc))
 				}
 				//t.Errorf(test.name+": decompressed, got delta: \n%s")
-				t.Errorf(test.name + ": decompressed, got delta")
+				t.Error(test.name + ": decompressed, got delta")
 			}
 			if !t.Failed() {
 				t.Log("... roundtrip ok!")
@@ -221,7 +221,7 @@ func TestDecompress4X(t *testing.T) {
 							t.Log(string(dc))
 						}
 						//t.Errorf(test.name+": decompressed, got delta: \n%s")
-						t.Errorf(test.name + ": decompressed, got delta")
+						t.Error(test.name + ": decompressed, got delta")
 					}
 					if !t.Failed() {
 						t.Log("... roundtrip ok!")
@@ -315,7 +315,7 @@ func TestRoundtrip1XFuzz(t *testing.T) {
 					t.Log(string(dc))
 				}
 				//t.Errorf(test.name+": decompressed, got delta: \n%s")
-				t.Errorf(test.name + ": decompressed, got delta")
+				t.Error(test.name + ": decompressed, got delta")
 			}
 			if !t.Failed() {
 				t.Log("... roundtrip ok!")
@@ -406,7 +406,7 @@ func TestRoundtrip4XFuzz(t *testing.T) {
 					t.Log(string(dc))
 				}
 				//t.Errorf(test.name+": decompressed, got delta: \n%s")
-				t.Errorf(test.name + ": decompressed, got delta")
+				t.Error(test.name + ": decompressed, got delta")
 			}
 			if !t.Failed() {
 				t.Log("... roundtrip ok!")
diff --git a/s2/decode_other.go b/s2/decode_other.go
index 2cb55c2c77..c99d40b69d 100644
--- a/s2/decode_other.go
+++ b/s2/decode_other.go
@@ -11,6 +11,8 @@ package s2
 import (
 	"fmt"
 	"strconv"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // decode writes the decoding of src to dst. It assumes that the varint-encoded
@@ -38,21 +40,18 @@ func s2Decode(dst, src []byte) int {
 			case x < 60:
 				s++
 			case x == 60:
+				x = uint32(src[s+1])
 				s += 2
-				x = uint32(src[s-1])
 			case x == 61:
-				in := src[s : s+3]
-				x = uint32(in[1]) | uint32(in[2])<<8
+				x = uint32(le.Load16(src, s+1))
 				s += 3
 			case x == 62:
-				in := src[s : s+4]
 				// Load as 32 bit and shift down.
-				x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
+				x = le.Load32(src, s)
 				x >>= 8
 				s += 4
 			case x == 63:
-				in := src[s : s+5]
-				x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
+				x = le.Load32(src, s+1)
 				s += 5
 			}
 			length = int(x) + 1
@@ -85,8 +84,7 @@ func s2Decode(dst, src []byte) int {
 					length = int(src[s]) + 4
 					s += 1
 				case 6:
-					in := src[s : s+2]
-					length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
+					length = int(le.Load16(src, s)) + 1<<8
 					s += 2
 				case 7:
 					in := src[s : s+3]
@@ -99,15 +97,13 @@ func s2Decode(dst, src []byte) int {
 			}
 			length += 4
 		case tagCopy2:
-			in := src[s : s+3]
-			offset = int(uint32(in[1]) | uint32(in[2])<<8)
-			length = 1 + int(in[0])>>2
+			offset = int(le.Load16(src, s+1))
+			length = 1 + int(src[s])>>2
 			s += 3
 
 		case tagCopy4:
-			in := src[s : s+5]
-			offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
-			length = 1 + int(in[0])>>2
+			offset = int(le.Load32(src, s+1))
+			length = 1 + int(src[s])>>2
 			s += 5
 		}
 
diff --git a/zstd/seqenc.go b/zstd/seqenc.go
index 8014174a77..65045eabdd 100644
--- a/zstd/seqenc.go
+++ b/zstd/seqenc.go
@@ -69,7 +69,6 @@ var llBitsTable = [maxLLCode + 1]byte{
 func llCode(litLength uint32) uint8 {
 	const llDeltaCode = 19
 	if litLength <= 63 {
-		// Compiler insists on bounds check (Go 1.12)
 		return llCodeTable[litLength&63]
 	}
 	return uint8(highBit(litLength)) + llDeltaCode
@@ -102,7 +101,6 @@ var mlBitsTable = [maxMLCode + 1]byte{
 func mlCode(mlBase uint32) uint8 {
 	const mlDeltaCode = 36
 	if mlBase <= 127 {
-		// Compiler insists on bounds check (Go 1.12)
 		return mlCodeTable[mlBase&127]
 	}
 	return uint8(highBit(mlBase)) + mlDeltaCode

From 2fcbfef02937d61375ab9fbf4ef914dc29f81b70 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 14 Jan 2025 14:21:48 +0100
Subject: [PATCH 5/9] Avoid loop bounds check.

---
 flate/matchlen_generic.go     | 21 +++++++++++++--------
 internal/le/unsafe_enabled.go |  8 ++++++++
 zstd/matchlen_generic.go      | 16 +++++++++++-----
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/flate/matchlen_generic.go b/flate/matchlen_generic.go
index ad5cd814b9..a27a9756f3 100644
--- a/flate/matchlen_generic.go
+++ b/flate/matchlen_generic.go
@@ -7,21 +7,27 @@
 package flate
 
 import (
-	"encoding/binary"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // matchLen returns the maximum common prefix length of a and b.
 // a must be the shortest of the two.
 func matchLen(a, b []byte) (n int) {
-	for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
-		diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
-		if diff != 0 {
-			return n + bits.TrailingZeros64(diff)>>3
+	if len(a) >= 8 && len(b) >= 8 {
+		left := len(a) - 8
+		for left >= 0 {
+			diff := le.Load64(a, n) ^ le.Load64(b, n)
+			if diff != 0 {
+				return n + bits.TrailingZeros64(diff)>>3
+			}
+			n += 8
+			left -= 8
 		}
-		n += 8
 	}
-
+	a = a[n:]
+	b = b[n:]
 	for i := range a {
 		if a[i] != b[i] {
 			break
@@ -29,5 +35,4 @@ func matchLen(a, b []byte) (n int) {
 		n++
 	}
 	return n
-
 }
diff --git a/internal/le/unsafe_enabled.go b/internal/le/unsafe_enabled.go
index 18f3c4d102..b12e0316f4 100644
--- a/internal/le/unsafe_enabled.go
+++ b/internal/le/unsafe_enabled.go
@@ -8,24 +8,32 @@ import (
 	"unsafe"
 )
 
+// Load16 will load from b at index i.
+// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load16[I Indexer](b []byte, i I) uint16 {
 	//return binary.LittleEndian.Uint16(b[i:])
 	//return *(*uint16)(unsafe.Pointer(&b[i]))
 	return *(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
+// Load32 will load from b at index i.
+// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load32[I Indexer](b []byte, i I) uint32 {
 	//return binary.LittleEndian.Uint32(b[i:])
 	//return *(*uint32)(unsafe.Pointer(&b[i]))
 	return *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
+// Load64 will load from b at index i.
+// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load64[I Indexer](b []byte, i I) uint64 {
 	//return binary.LittleEndian.Uint64(b[i:])
 	//return *(*uint64)(unsafe.Pointer(&b[i]))
 	return *(*uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
+// Store16 will store v at b.
+// If the compiler can prove
 func Store16(b []byte, v uint16) {
 	//binary.LittleEndian.PutUint16(b, v)
 	*(*uint16)(unsafe.Pointer(&b[0])) = v
diff --git a/zstd/matchlen_generic.go b/zstd/matchlen_generic.go
index 741e784491..d4627eb4ab 100644
--- a/zstd/matchlen_generic.go
+++ b/zstd/matchlen_generic.go
@@ -15,13 +15,19 @@ import (
 // matchLen returns the maximum common prefix length of a and b.
 // a must be the shortest of the two.
 func matchLen(a, b []byte) (n int) {
-	for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
-		diff := le.Load64(a, 0) ^ le.Load64(b, 0)
-		if diff != 0 {
-			return n + bits.TrailingZeros64(diff)>>3
+	if len(a) >= 8 && len(b) >= 8 {
+		left := len(a) - 8
+		for left >= 0 {
+			diff := le.Load64(a, n) ^ le.Load64(b, n)
+			if diff != 0 {
+				return n + bits.TrailingZeros64(diff)>>3
+			}
+			n += 8
+			left -= 8
 		}
-		n += 8
 	}
+	a = a[n:]
+	b = b[n:]
 
 	for i := range a {
 		if a[i] != b[i] {

From 75ce7ef4e98d16cf95af267d5e61edd850fc650c Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 14 Jan 2025 15:01:23 +0100
Subject: [PATCH 6/9] Avoid length check on callers.

---
 flate/matchlen_generic.go     | 17 ++++++++---------
 internal/le/unsafe_enabled.go | 10 +++++-----
 zstd/matchlen_generic.go      | 16 +++++++---------
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/flate/matchlen_generic.go b/flate/matchlen_generic.go
index a27a9756f3..8c840f9b40 100644
--- a/flate/matchlen_generic.go
+++ b/flate/matchlen_generic.go
@@ -15,17 +15,16 @@ import (
 // matchLen returns the maximum common prefix length of a and b.
 // a must be the shortest of the two.
 func matchLen(a, b []byte) (n int) {
-	if len(a) >= 8 && len(b) >= 8 {
-		left := len(a) - 8
-		for left >= 0 {
-			diff := le.Load64(a, n) ^ le.Load64(b, n)
-			if diff != 0 {
-				return n + bits.TrailingZeros64(diff)>>3
-			}
-			n += 8
-			left -= 8
+	left := len(a)
+	for left >= 8 {
+		diff := le.Load64(a, n) ^ le.Load64(b, n)
+		if diff != 0 {
+			return n + bits.TrailingZeros64(diff)>>3
 		}
+		n += 8
+		left -= 8
 	}
+
 	a = a[n:]
 	b = b[n:]
 	for i := range a {
diff --git a/internal/le/unsafe_enabled.go b/internal/le/unsafe_enabled.go
index b12e0316f4..342d4b51c2 100644
--- a/internal/le/unsafe_enabled.go
+++ b/internal/le/unsafe_enabled.go
@@ -13,7 +13,7 @@ import (
 func Load16[I Indexer](b []byte, i I) uint16 {
 	//return binary.LittleEndian.Uint16(b[i:])
 	//return *(*uint16)(unsafe.Pointer(&b[i]))
-	return *(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
 // Load32 will load from b at index i.
@@ -21,7 +21,7 @@ func Load16[I Indexer](b []byte, i I) uint16 {
 func Load32[I Indexer](b []byte, i I) uint32 {
 	//return binary.LittleEndian.Uint32(b[i:])
 	//return *(*uint32)(unsafe.Pointer(&b[i]))
-	return *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
 // Load64 will load from b at index i.
@@ -29,17 +29,17 @@ func Load32[I Indexer](b []byte, i I) uint32 {
 func Load64[I Indexer](b []byte, i I) uint64 {
 	//return binary.LittleEndian.Uint64(b[i:])
 	//return *(*uint64)(unsafe.Pointer(&b[i]))
-	return *(*uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(&b[0])) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
 }
 
 // Store16 will store v at b.
 // If the compiler can prove
 func Store16(b []byte, v uint16) {
 	//binary.LittleEndian.PutUint16(b, v)
-	*(*uint16)(unsafe.Pointer(&b[0])) = v
+	*(*uint16)(unsafe.Pointer(unsafe.SliceData(b))) = v
 }
 
 func Store32(b []byte, v uint32) {
 	//binary.LittleEndian.PutUint32(b, v)
-	*(*uint32)(unsafe.Pointer(&b[0])) = v
+	*(*uint32)(unsafe.Pointer(unsafe.SliceData(b))) = v
 }
diff --git a/zstd/matchlen_generic.go b/zstd/matchlen_generic.go
index d4627eb4ab..bea1779e97 100644
--- a/zstd/matchlen_generic.go
+++ b/zstd/matchlen_generic.go
@@ -15,16 +15,14 @@ import (
 // matchLen returns the maximum common prefix length of a and b.
 // a must be the shortest of the two.
 func matchLen(a, b []byte) (n int) {
-	if len(a) >= 8 && len(b) >= 8 {
-		left := len(a) - 8
-		for left >= 0 {
-			diff := le.Load64(a, n) ^ le.Load64(b, n)
-			if diff != 0 {
-				return n + bits.TrailingZeros64(diff)>>3
-			}
-			n += 8
-			left -= 8
+	left := len(a)
+	for left >= 8 {
+		diff := le.Load64(a, n) ^ le.Load64(b, n)
+		if diff != 0 {
+			return n + bits.TrailingZeros64(diff)>>3
 		}
+		n += 8
+		left -= 8
 	}
 	a = a[n:]
 	b = b[n:]

From 4a120045da0c79656cb612efea4e9be4f3a98294 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 14 Jan 2025 15:15:38 +0100
Subject: [PATCH 7/9] Use in more places.

---
 flate/huffman_bit_writer.go    | 19 ++++++++++---------
 flate/level1.go                | 31 ++++++++++++++++---------------
 internal/le/unsafe_disabled.go |  4 ++++
 internal/le/unsafe_enabled.go  | 17 ++++++++++-------
 4 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index f70594c34e..afdc8c053a 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -5,10 +5,11 @@
 package flate
 
 import (
-	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 const (
@@ -438,7 +439,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 	n := w.nbytes
 
 	// We over-write, but faster...
-	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
+	le.Store64(w.bytes[n:], bits)
 	n += 6
 
 	if n >= bufferFlushSize {
@@ -854,7 +855,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -882,7 +883,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -905,7 +906,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= uint64(extraLength) << (nbits & 63)
 			nbits += extraLengthBits
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -931,7 +932,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -953,7 +954,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
 			nbits += uint8(offsetComb)
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -1107,7 +1108,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		// We must have at least 48 bits free.
 		if nbits >= 8 {
 			n := nbits >> 3
-			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			le.Store64(w.bytes[nbytes:], bits)
 			bits >>= (n * 8) & 63
 			nbits -= n * 8
 			nbytes += n
@@ -1136,7 +1137,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Remaining...
 	for _, t := range input {
 		if nbits >= 48 {
-			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			le.Store64(w.bytes[nbytes:], bits)
 			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 			bits >>= 48
 			nbits -= 48
diff --git a/flate/level1.go b/flate/level1.go
index 703b9a89aa..61854a3526 100644
--- a/flate/level1.go
+++ b/flate/level1.go
@@ -1,9 +1,10 @@
 package flate
 
 import (
-	"encoding/binary"
 	"fmt"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // fastGen maintains the table for matches,
@@ -126,26 +127,26 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				l = e.matchlenLong(s+4, t+4, src) + 4
 			} else {
 				// inlined:
-				a := src[s+4:]
-				b := src[t+4:]
-				for len(a) >= 8 {
-					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+				a := src[s:]
+				b := src[t:]
+				left := len(a) - 4
+				for left >= 8 {
+					if diff := le.Load64(a, l) ^ le.Load64(b, l); diff != 0 {
 						l += int32(bits.TrailingZeros64(diff) >> 3)
-						break
+						goto endMatch
 					}
 					l += 8
-					a = a[8:]
-					b = b[8:]
+					left -= 8
 				}
-				if len(a) < 8 {
-					b = b[:len(a)]
-					for i := range a {
-						if a[i] != b[i] {
-							break
-						}
-						l++
+				a = a[l:]
+				b = b[l:]
+				for i := range a {
+					if a[i] != b[i] {
+						break
 					}
+					l++
 				}
+			endMatch:
 			}
 
 			// Extend backwards
diff --git a/internal/le/unsafe_disabled.go b/internal/le/unsafe_disabled.go
index f9d81b17c1..9643495f01 100644
--- a/internal/le/unsafe_disabled.go
+++ b/internal/le/unsafe_disabled.go
@@ -25,3 +25,7 @@ func Store16(b []byte, v uint16) {
 func Store32(b []byte, v uint32) {
 	binary.LittleEndian.PutUint32(b, v)
 }
+
+func Store64(b []byte, v uint64) {
+	binary.LittleEndian.PutUint64(b, v)
+}
diff --git a/internal/le/unsafe_enabled.go b/internal/le/unsafe_enabled.go
index 342d4b51c2..5a5d2dabf0 100644
--- a/internal/le/unsafe_enabled.go
+++ b/internal/le/unsafe_enabled.go
@@ -9,37 +9,40 @@ import (
 )
 
 // Load16 will load from b at index i.
-// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load16[I Indexer](b []byte, i I) uint16 {
 	//return binary.LittleEndian.Uint16(b[i:])
 	//return *(*uint16)(unsafe.Pointer(&b[i]))
-	return *(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
 }
 
 // Load32 will load from b at index i.
-// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load32[I Indexer](b []byte, i I) uint32 {
 	//return binary.LittleEndian.Uint32(b[i:])
 	//return *(*uint32)(unsafe.Pointer(&b[i]))
-	return *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
 }
 
 // Load64 will load from b at index i.
-// If the compiler can prove that b is at least 1 byte this will be without bounds check.
 func Load64[I Indexer](b []byte, i I) uint64 {
 	//return binary.LittleEndian.Uint64(b[i:])
 	//return *(*uint64)(unsafe.Pointer(&b[i]))
-	return *(*uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(b))) + uintptr(i)*unsafe.Sizeof(b[0])))
+	return *(*uint64)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
 }
 
 // Store16 will store v at b.
-// If the compiler can prove
 func Store16(b []byte, v uint16) {
 	//binary.LittleEndian.PutUint16(b, v)
 	*(*uint16)(unsafe.Pointer(unsafe.SliceData(b))) = v
 }
 
+// Store32 will store v at b.
 func Store32(b []byte, v uint32) {
 	//binary.LittleEndian.PutUint32(b, v)
 	*(*uint32)(unsafe.Pointer(unsafe.SliceData(b))) = v
 }
+
+// Store64 will store v at b.
+func Store64(b []byte, v uint64) {
+	//binary.LittleEndian.PutUint64(b, v)
+	*(*uint64)(unsafe.Pointer(unsafe.SliceData(b))) = v
+}

From b959ae35c8d655e318db0c10d9ca040975c11dce Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 14 Jan 2025 15:57:08 +0100
Subject: [PATCH 8/9] Bump s2sx version

---
 s2sx.mod | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/s2sx.mod b/s2sx.mod
index 5a4412f907..15b74a57c7 100644
--- a/s2sx.mod
+++ b/s2sx.mod
@@ -1,4 +1,3 @@
 module github.com/klauspost/compress
 
-go 1.19
-
+go 1.21

From 209574e8fe048b3a5dcb3ac3af29c4ada3b92426 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 14 Jan 2025 16:40:16 +0100
Subject: [PATCH 9/9] Update docs

---
 README.md          | 11 +++++++++++
 flate/stateless.go | 13 ++++---------
 zstd/README.md     |  2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index de264c85a5..80ede339f5 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,17 @@ This package provides various compression algorithms.
 [![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
 [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)
 
+# package usage
+
+Use `go get github.com/klauspost/compress@latest` to add it to your project.
+
+This package will support the current Go version and 2 versions back.
+
+* Use the `nounsafe` tag to disable all use of the "unsafe" package.
+* Use the `noasm` tag to disable all assembly across packages.
+
+Use the links above for more information on each.
+
 # changelog
 
 * Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10)
diff --git a/flate/stateless.go b/flate/stateless.go
index f3d4139ef3..13b9b100db 100644
--- a/flate/stateless.go
+++ b/flate/stateless.go
@@ -4,6 +4,8 @@ import (
 	"io"
 	"math"
 	"sync"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 const (
@@ -152,18 +154,11 @@ func hashSL(u uint32) uint32 {
 }
 
 func load3216(b []byte, i int16) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return le.Load32(b, i)
 }
 
 func load6416(b []byte, i int16) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return le.Load64(b, i)
 }
 
 func statelessEnc(dst *tokens, src []byte, startAt int16) {
diff --git a/zstd/README.md b/zstd/README.md
index 92e2347bbc..c11d7fa28e 100644
--- a/zstd/README.md
+++ b/zstd/README.md
@@ -6,7 +6,7 @@ A high performance compression algorithm is implemented. For now focused on spee
 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
 
-This package is pure Go and without use of "unsafe". 
+This package is pure Go. Use the `noasm` and `nounsafe` build tags to disable assembly and use of the "unsafe" package, respectively.
 
 The `zstd` package is provided as open source software using a Go standard license.