diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index ea71ec2f2..02b1285b4 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -32,18 +32,24 @@ jobs: - name: Test run: go test ./... - - name: Test Noasm + - name: Test No-asm run: go test -tags=noasm ./... + - name: Test No-unsafe + run: go test -tags=nounsafe ./... + + - name: Test No-unsafe, noasm + run: go test -tags="nounsafe,noasm" ./... + - name: Test Race 1 CPU env: CGO_ENABLED: 1 - run: go test -cpu=1 -short -race -v ./... + run: go test -cpu=1 -short -race -tags=nounsafe -v ./... - name: Test Race 4 CPU env: CGO_ENABLED: 1 - run: go test -cpu=4 -short -race -v ./... + run: go test -cpu=4 -short -race -tags=nounsafe -v ./... generate: strategy: @@ -112,6 +118,9 @@ jobs: env: CGO_ENABLED: 0 runs-on: ubuntu-latest + strategy: + matrix: + tags: [ 'nounsafe', '"noasm,nounsafe"' ] steps: - name: Set up Go uses: actions/setup-go@v5.2.0 @@ -121,28 +130,23 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: S2/FuzzDictBlocks - run: go test -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/. + - name: S2/FuzzDictBlocks/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/. - - name: S2/FuzzEncodingBlocks - run: go test -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. + - name: S2/FuzzEncodingBlocks/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. - - name: S2/FuzzLZ4Block - run: go test -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. + - name: S2/FuzzLZ4Block/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. - - name: S2/FuzzDictBlocks/noasm - run: go test -tags=noasm -run=none -fuzz=FuzzDictBlocks -fuzztime=100000x -test.fuzzminimizetime=10ms ./s2/. - - - name: S2/FuzzEncodingBlocks/noasm - run: go test -tags=noasm -run=none -fuzz=FuzzEncodingBlocks -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. - - - name: S2/FuzzLZ4Block/noasm - run: go test -tags=noasm -run=none -fuzz=FuzzLZ4Block -fuzztime=500000x -test.fuzzminimizetime=10ms ./s2/. fuzz-zstd: env: CGO_ENABLED: 0 runs-on: ubuntu-latest + strategy: + matrix: + tags: [ 'nounsafe', '"noasm,nounsafe"' ] steps: - name: Set up Go uses: actions/setup-go@v5.2.0 @@ -152,57 +156,44 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: zstd/FuzzDecodeAll - run: go test -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. + - name: zstd/FuzzDecodeAll/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - name: zstd/FuzzDecAllNoBMI2 - run: go test -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. + - name: zstd/FuzzDecAllNoBMI2/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecAllNoBMI2 -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - name: zstd/FuzzDecoder - run: go test -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. + - name: zstd/FuzzDecoder/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - name: zstd/FuzzNoBMI2Dec - run: go test -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. + - name: zstd/FuzzNoBMI2Dec/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzNoBMI2Dec -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - name: zstd/FuzzEncoding - run: cd zstd&&go test -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd .. - - - name: zstd/FuzzDecodeAll/noasm - run: go test -tags=noasm -run=none -fuzz=FuzzDecodeAll -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - - name: zstd/FuzzDecoder/noasm - run: go test -tags=noasm -run=none -fuzz=FuzzDecoder -fuzztime=500000x -test.fuzzminimizetime=10ms ./zstd/. - - - name: zstd/FuzzEncoding/noasm - run: cd zstd&&go test -tags=noasm -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd .. - - - name: zstd/FuzzEncodingBest - run: cd zstd&&go test -run=none -fuzz=FuzzEncoding -fuzztime=25000x -test.fuzzminimizetime=10ms -fuzz-start=4&&cd .. + - name: zstd/FuzzEncoding/${{ matrix.tags }} + run: cd zstd&&go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncoding -fuzztime=250000x -test.fuzzminimizetime=10ms -fuzz-end=3&&cd .. fuzz-other: env: CGO_ENABLED: 0 runs-on: ubuntu-latest + strategy: + matrix: + tags: [ 'nounsafe', '"noasm,nounsafe"' ] steps: - name: Set up Go uses: actions/setup-go@v5.2.0 with: go-version: 1.23.x - - name: Checkout code uses: actions/checkout@v4 - - name: flate/FuzzEncoding - run: go test -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/. - - - name: flate/FuzzEncoding/noasm - run: go test -run=none -tags=noasm -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/. + - name: flate/FuzzEncoding/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzEncoding -fuzztime=100000x -test.fuzzminimizetime=10ms ./flate/. - - name: zip/FuzzReader - run: go test -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/. + - name: zip/FuzzReader/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzReader -fuzztime=500000x -test.fuzzminimizetime=10ms ./zip/. - - name: fse/FuzzCompress - run: go test -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/. + - name: fse/FuzzCompress/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzCompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/. - - name: fse/FuzzDecompress - run: go test -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/. \ No newline at end of file + - name: fse/FuzzDecompress/${{ matrix.tags }} + run: go test -tags=${{ matrix.tags }} -run=none -fuzz=FuzzDecompress -fuzztime=1000000x -test.fuzzminimizetime=10ms ./fse/. diff --git a/README.md b/README.md index de264c85a..80ede339f 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,17 @@ This package provides various compression algorithms. [![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml) [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge) +# package usage + +Use `go get github.com/klauspost/compress@latest` to add it to your project. + +This package will support the current Go version and 2 versions back. + +* Use the `nounsafe` tag to disable all use of the "unsafe" package. +* Use the `noasm` tag to disable all assembly across packages. + +Use the links above for more information on each. + # changelog * Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10) diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go index c8124b5c4..433977767 100644 --- a/flate/fast_encoder.go +++ b/flate/fast_encoder.go @@ -6,8 +6,9 @@ package flate import ( - "encoding/binary" "fmt" + + "github.com/klauspost/compress/internal/le" ) type fastEnc interface { @@ -58,11 +59,11 @@ const ( ) func load3232(b []byte, i int32) uint32 { - return binary.LittleEndian.Uint32(b[i:]) + return le.Load32(b, i) } func load6432(b []byte, i int32) uint64 { - return binary.LittleEndian.Uint64(b[i:]) + return le.Load64(b, i) } type tableEntry struct { diff --git a/flate/fuzz_test.go b/flate/fuzz_test.go index 5529a78be..8b02f460c 100644 --- a/flate/fuzz_test.go +++ b/flate/fuzz_test.go @@ -1,5 +1,4 @@ //go:build go1.18 -// +build go1.18 package flate diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index f70594c34..afdc8c053 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -5,10 +5,11 @@ package flate import ( - "encoding/binary" "fmt" "io" "math" + + "github.com/klauspost/compress/internal/le" ) const ( @@ -438,7 +439,7 @@ func (w *huffmanBitWriter) writeOutBits() { n := w.nbytes // We over-write, but faster... - binary.LittleEndian.PutUint64(w.bytes[n:], bits) + le.Store64(w.bytes[n:], bits) n += 6 if n >= bufferFlushSize { @@ -854,7 +855,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -882,7 +883,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -905,7 +906,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -931,7 +932,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= c.code64() << (nbits & 63) nbits += c.len() if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -953,7 +954,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) nbits += uint8(offsetComb) if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 @@ -1107,7 +1108,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // We must have at least 48 bits free. if nbits >= 8 { n := nbits >> 3 - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) bits >>= (n * 8) & 63 nbits -= n * 8 nbytes += n @@ -1136,7 +1137,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Remaining... for _, t := range input { if nbits >= 48 { - binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + le.Store64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits bits >>= 48 nbits -= 48 diff --git a/flate/level1.go b/flate/level1.go index 703b9a89a..61854a352 100644 --- a/flate/level1.go +++ b/flate/level1.go @@ -1,9 +1,10 @@ package flate import ( - "encoding/binary" "fmt" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // fastGen maintains the table for matches, @@ -126,26 +127,26 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { l = e.matchlenLong(s+4, t+4, src) + 4 } else { // inlined: - a := src[s+4:] - b := src[t+4:] - for len(a) >= 8 { - if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + a := src[s:] + b := src[t:] + left := len(a) - 4 + for left >= 8 { + if diff := le.Load64(a, l) ^ le.Load64(b, l); diff != 0 { l += int32(bits.TrailingZeros64(diff) >> 3) - break + goto endMatch } l += 8 - a = a[8:] - b = b[8:] + left -= 8 } - if len(a) < 8 { - b = b[:len(a)] - for i := range a { - if a[i] != b[i] { - break - } - l++ + a = a[l:] + b = b[l:] + for i := range a { + if a[i] != b[i] { + break } + l++ } + endMatch: } // Extend backwards diff --git a/flate/matchlen_generic.go b/flate/matchlen_generic.go index ad5cd814b..8c840f9b4 100644 --- a/flate/matchlen_generic.go +++ b/flate/matchlen_generic.go @@ -7,21 +7,26 @@ package flate import ( - "encoding/binary" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // matchLen returns the maximum common prefix length of a and b. // a must be the shortest of the two. func matchLen(a, b []byte) (n int) { - for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { - diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + left := len(a) + for left >= 8 { + diff := le.Load64(a, n) ^ le.Load64(b, n) if diff != 0 { return n + bits.TrailingZeros64(diff)>>3 } n += 8 + left -= 8 } + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { break @@ -29,5 +34,4 @@ func matchLen(a, b []byte) (n int) { n++ } return n - } diff --git a/flate/stateless.go b/flate/stateless.go index f3d4139ef..13b9b100d 100644 --- a/flate/stateless.go +++ b/flate/stateless.go @@ -4,6 +4,8 @@ import ( "io" "math" "sync" + + "github.com/klauspost/compress/internal/le" ) const ( @@ -152,18 +154,11 @@ func hashSL(u uint32) uint32 { } func load3216(b []byte, i int16) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return le.Load32(b, i) } func load6416(b []byte, i int16) uint64 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:8] - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | - uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 + return le.Load64(b, i) } func statelessEnc(dst *tokens, src []byte, startAt int16) { diff --git a/huff0/bitreader.go b/huff0/bitreader.go index 6686d7371..bfc7a523d 100644 --- a/huff0/bitreader.go +++ b/huff0/bitreader.go @@ -6,10 +6,11 @@ package huff0 import ( - "encoding/binary" "errors" "fmt" "io" + + "github.com/klauspost/compress/internal/le" ) // bitReader reads a bitstream in reverse. @@ -66,8 +67,7 @@ func (b *bitReaderBytes) fillFast() { } // 2 bounds checks. - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 b.off -= 4 @@ -76,7 +76,7 @@ func (b *bitReaderBytes) fillFast() { // fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read. func (b *bitReaderBytes) fillFastStart() { // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + b.value = le.Load64(b.in, b.off-8) b.bitsRead = 0 b.off -= 8 } @@ -86,9 +86,8 @@ func (b *bitReaderBytes) fill() { if b.bitsRead < 32 { return } - if b.off > 4 { - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + if b.off >= 4 { + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << (b.bitsRead - 32) b.bitsRead -= 32 b.off -= 4 @@ -175,9 +174,7 @@ func (b *bitReaderShifted) fillFast() { return } - // 2 bounds checks. - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 b.off -= 4 @@ -185,8 +182,7 @@ func (b *bitReaderShifted) fillFast() { // fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read. func (b *bitReaderShifted) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) + b.value = le.Load64(b.in, b.off-8) b.bitsRead = 0 b.off -= 8 } @@ -197,8 +193,7 @@ func (b *bitReaderShifted) fill() { return } if b.off > 4 { - v := b.in[b.off-4 : b.off] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + low := le.Load32(b.in, b.off-4) b.value |= uint64(low) << ((b.bitsRead - 32) & 63) b.bitsRead -= 32 b.off -= 4 diff --git a/huff0/decompress_test.go b/huff0/decompress_test.go index fe23514fa..47123058f 100644 --- a/huff0/decompress_test.go +++ b/huff0/decompress_test.go @@ -91,7 +91,7 @@ func TestDecompress1X(t *testing.T) { t.Log(string(dc)) } //t.Errorf(test.name+": decompressed, got delta: \n%s") - t.Errorf(test.name + ": decompressed, got delta") + t.Error(test.name + ": decompressed, got delta") } if !t.Failed() { t.Log("... roundtrip ok!") @@ -221,7 +221,7 @@ func TestDecompress4X(t *testing.T) { t.Log(string(dc)) } //t.Errorf(test.name+": decompressed, got delta: \n%s") - t.Errorf(test.name + ": decompressed, got delta") + t.Error(test.name + ": decompressed, got delta") } if !t.Failed() { t.Log("... roundtrip ok!") @@ -315,7 +315,7 @@ func TestRoundtrip1XFuzz(t *testing.T) { t.Log(string(dc)) } //t.Errorf(test.name+": decompressed, got delta: \n%s") - t.Errorf(test.name + ": decompressed, got delta") + t.Error(test.name + ": decompressed, got delta") } if !t.Failed() { t.Log("... roundtrip ok!") @@ -406,7 +406,7 @@ func TestRoundtrip4XFuzz(t *testing.T) { t.Log(string(dc)) } //t.Errorf(test.name+": decompressed, got delta: \n%s") - t.Errorf(test.name + ": decompressed, got delta") + t.Error(test.name + ": decompressed, got delta") } if !t.Failed() { t.Log("... roundtrip ok!") diff --git a/internal/le/le.go b/internal/le/le.go new file mode 100644 index 000000000..e54909e16 --- /dev/null +++ b/internal/le/le.go @@ -0,0 +1,5 @@ +package le + +type Indexer interface { + int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 +} diff --git a/internal/le/unsafe_disabled.go b/internal/le/unsafe_disabled.go new file mode 100644 index 000000000..9643495f0 --- /dev/null +++ b/internal/le/unsafe_disabled.go @@ -0,0 +1,31 @@ +//go:build !(amd64 || arm64 || ppc64le || riscv64) || nounsafe || purego || appengine + +package le + +import ( + "encoding/binary" +) + +func Load16[I Indexer](b []byte, i I) uint16 { + return binary.LittleEndian.Uint16(b[i:]) +} + +func Load32[I Indexer](b []byte, i I) uint32 { + return binary.LittleEndian.Uint32(b[i:]) +} + +func Load64[I Indexer](b []byte, i I) uint64 { + return binary.LittleEndian.Uint64(b[i:]) +} + +func Store16(b []byte, v uint16) { + binary.LittleEndian.PutUint16(b, v) +} + +func Store32(b []byte, v uint32) { + binary.LittleEndian.PutUint32(b, v) +} + +func Store64(b []byte, v uint64) { + binary.LittleEndian.PutUint64(b, v) +} diff --git a/internal/le/unsafe_enabled.go b/internal/le/unsafe_enabled.go new file mode 100644 index 000000000..5a5d2dabf --- /dev/null +++ b/internal/le/unsafe_enabled.go @@ -0,0 +1,48 @@ +// We enable 64 bit LE platforms: + +//go:build (amd64 || arm64 || ppc64le || riscv64) && !nounsafe && !purego && !appengine + +package le + +import ( + "unsafe" +) + +// Load16 will load from b at index i. +func Load16[I Indexer](b []byte, i I) uint16 { + //return binary.LittleEndian.Uint16(b[i:]) + //return *(*uint16)(unsafe.Pointer(&b[i])) + return *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Load32 will load from b at index i. +func Load32[I Indexer](b []byte, i I) uint32 { + //return binary.LittleEndian.Uint32(b[i:]) + //return *(*uint32)(unsafe.Pointer(&b[i])) + return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Load64 will load from b at index i. +func Load64[I Indexer](b []byte, i I) uint64 { + //return binary.LittleEndian.Uint64(b[i:]) + //return *(*uint64)(unsafe.Pointer(&b[i])) + return *(*uint64)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i)) +} + +// Store16 will store v at b. +func Store16(b []byte, v uint16) { + //binary.LittleEndian.PutUint16(b, v) + *(*uint16)(unsafe.Pointer(unsafe.SliceData(b))) = v +} + +// Store32 will store v at b. +func Store32(b []byte, v uint32) { + //binary.LittleEndian.PutUint32(b, v) + *(*uint32)(unsafe.Pointer(unsafe.SliceData(b))) = v +} + +// Store64 will store v at b. +func Store64(b []byte, v uint64) { + //binary.LittleEndian.PutUint64(b, v) + *(*uint64)(unsafe.Pointer(unsafe.SliceData(b))) = v +} diff --git a/s2/decode_other.go b/s2/decode_other.go index 2cb55c2c7..c99d40b69 100644 --- a/s2/decode_other.go +++ b/s2/decode_other.go @@ -11,6 +11,8 @@ package s2 import ( "fmt" "strconv" + + "github.com/klauspost/compress/internal/le" ) // decode writes the decoding of src to dst. It assumes that the varint-encoded @@ -38,21 +40,18 @@ func s2Decode(dst, src []byte) int { case x < 60: s++ case x == 60: + x = uint32(src[s+1]) s += 2 - x = uint32(src[s-1]) case x == 61: - in := src[s : s+3] - x = uint32(in[1]) | uint32(in[2])<<8 + x = uint32(le.Load16(src, s+1)) s += 3 case x == 62: - in := src[s : s+4] // Load as 32 bit and shift down. - x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x = le.Load32(src, s) x >>= 8 s += 4 case x == 63: - in := src[s : s+5] - x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 + x = le.Load32(src, s+1) s += 5 } length = int(x) + 1 @@ -85,8 +84,7 @@ func s2Decode(dst, src []byte) int { length = int(src[s]) + 4 s += 1 case 6: - in := src[s : s+2] - length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) + length = int(le.Load16(src, s)) + 1<<8 s += 2 case 7: in := src[s : s+3] @@ -99,15 +97,13 @@ func s2Decode(dst, src []byte) int { } length += 4 case tagCopy2: - in := src[s : s+3] - offset = int(uint32(in[1]) | uint32(in[2])<<8) - length = 1 + int(in[0])>>2 + offset = int(le.Load16(src, s+1)) + length = 1 + int(src[s])>>2 s += 3 case tagCopy4: - in := src[s : s+5] - offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) - length = 1 + int(in[0])>>2 + offset = int(le.Load32(src, s+1)) + length = 1 + int(src[s])>>2 s += 5 } diff --git a/s2/encode_all.go b/s2/encode_all.go index 997704569..c56ce52e7 100644 --- a/s2/encode_all.go +++ b/s2/encode_all.go @@ -10,14 +10,16 @@ import ( "encoding/binary" "fmt" "math/bits" + + "github.com/klauspost/compress/internal/le" ) func load32(b []byte, i int) uint32 { - return binary.LittleEndian.Uint32(b[i:]) + return le.Load32(b, i) } func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) + return le.Load64(b, i) } // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. diff --git a/s2sx.mod b/s2sx.mod index 5a4412f90..15b74a57c 100644 --- a/s2sx.mod +++ b/s2sx.mod @@ -1,4 +1,3 @@ module github.com/klauspost/compress -go 1.19 - +go 1.21 diff --git a/zstd/README.md b/zstd/README.md index 92e2347bb..c11d7fa28 100644 --- a/zstd/README.md +++ b/zstd/README.md @@ -6,7 +6,7 @@ A high performance compression algorithm is implemented. For now focused on spee This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. -This package is pure Go and without use of "unsafe". +This package is pure Go. Use `noasm` and `nounsafe` to disable relevant features. The `zstd` package is provided as open source software using a Go standard license. diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 4aa9ffdde..1554543b1 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -157,7 +157,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute Load(br.Field("value"), brValue) Load(br.Field("bitsRead"), brBitsRead) Load(br.Field("in").Base(), brPointer) - Load(br.Field("in").Len(), brOffset) + Load(br.Field("cursor"), brOffset) ADDQ(brOffset, brPointer) // Add current offset to read pointer. MOVQ(brPointer, brPointerStash) } @@ -438,7 +438,7 @@ func (o options) generateBody(name string, executeSingleTriple func(ctx *execute br := Dereference(Param("br")) Store(brValue, br.Field("value")) Store(brBitsRead.As8(), br.Field("bitsRead")) - Store(brOffset, br.Field("in").Len()) + Store(brOffset, br.Field("cursor")) if !o.useSeqs { Comment("Update the context") diff --git a/zstd/bitreader.go b/zstd/bitreader.go index 25ca98394..d41e3e170 100644 --- a/zstd/bitreader.go +++ b/zstd/bitreader.go @@ -5,11 +5,12 @@ package zstd import ( - "encoding/binary" "errors" "fmt" "io" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // bitReader reads a bitstream in reverse. @@ -18,6 +19,7 @@ import ( type bitReader struct { in []byte value uint64 // Maybe use [16]byte, but shifting is awkward. + cursor int // offset where next read should end bitsRead uint8 } @@ -32,6 +34,7 @@ func (b *bitReader) init(in []byte) error { if v == 0 { return errors.New("corrupt stream, did not find end of stream") } + b.cursor = len(in) b.bitsRead = 64 b.value = 0 if len(in) >= 8 { @@ -67,18 +70,15 @@ func (b *bitReader) fillFast() { if b.bitsRead < 32 { return } - v := b.in[len(b.in)-4:] - b.in = b.in[:len(b.in)-4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) + b.cursor -= 4 + b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor)) b.bitsRead -= 32 } // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. func (b *bitReader) fillFastStart() { - v := b.in[len(b.in)-8:] - b.in = b.in[:len(b.in)-8] - b.value = binary.LittleEndian.Uint64(v) + b.cursor -= 8 + b.value = le.Load64(b.in, b.cursor) b.bitsRead = 0 } @@ -87,25 +87,23 @@ func (b *bitReader) fill() { if b.bitsRead < 32 { return } - if len(b.in) >= 4 { - v := b.in[len(b.in)-4:] - b.in = b.in[:len(b.in)-4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) + if b.cursor >= 4 { + b.cursor -= 4 + b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor)) b.bitsRead -= 32 return } - b.bitsRead -= uint8(8 * len(b.in)) - for len(b.in) > 0 { - b.value = (b.value << 8) | uint64(b.in[len(b.in)-1]) - b.in = b.in[:len(b.in)-1] + b.bitsRead -= uint8(8 * b.cursor) + for b.cursor > 0 { + b.cursor -= 1 + b.value = (b.value << 8) | uint64(b.in[b.cursor]) } } // finished returns true if all bits have been read from the bit stream. func (b *bitReader) finished() bool { - return len(b.in) == 0 && b.bitsRead >= 64 + return b.cursor == 0 && b.bitsRead >= 64 } // overread returns true if more bits have been requested than is on the stream. @@ -115,13 +113,14 @@ func (b *bitReader) overread() bool { // remain returns the number of bits remaining. func (b *bitReader) remain() uint { - return 8*uint(len(b.in)) + 64 - uint(b.bitsRead) + return 8*uint(b.cursor) + 64 - uint(b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReader) close() error { // Release reference. b.in = nil + b.cursor = 0 if !b.finished() { return fmt.Errorf("%d extra bits on block, should be 0", b.remain()) } diff --git a/zstd/decoder.go b/zstd/decoder.go index 0170da828..ea2a19376 100644 --- a/zstd/decoder.go +++ b/zstd/decoder.go @@ -323,6 +323,7 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { frame.bBuf = nil if frame.history.decoders.br != nil { frame.history.decoders.br.in = nil + frame.history.decoders.br.cursor = 0 } d.decoders <- block }() diff --git a/zstd/matchlen_generic.go b/zstd/matchlen_generic.go index 57b9c31c0..bea1779e9 100644 --- a/zstd/matchlen_generic.go +++ b/zstd/matchlen_generic.go @@ -7,20 +7,25 @@ package zstd import ( - "encoding/binary" "math/bits" + + "github.com/klauspost/compress/internal/le" ) // matchLen returns the maximum common prefix length of a and b. // a must be the shortest of the two. func matchLen(a, b []byte) (n int) { - for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] { - diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b) + left := len(a) + for left >= 8 { + diff := le.Load64(a, n) ^ le.Load64(b, n) if diff != 0 { return n + bits.TrailingZeros64(diff)>>3 } n += 8 + left -= 8 } + a = a[n:] + b = b[n:] for i := range a { if a[i] != b[i] { diff --git a/zstd/seqdec.go b/zstd/seqdec.go index d7fe6d82d..9a7de82f9 100644 --- a/zstd/seqdec.go +++ b/zstd/seqdec.go @@ -245,7 +245,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error { return io.ErrUnexpectedEOF } var ll, mo, ml int - if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { + if br.cursor > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index f5591fa1e..a708ca6d3 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -7,9 +7,9 @@ TEXT ·sequenceDecs_decode_amd64(SB), $8-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -299,8 +299,8 @@ sequenceDecs_decode_amd64_match_len_ofs_ok: MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -335,9 +335,9 @@ error_overread: TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -598,8 +598,8 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok: MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -634,9 +634,9 @@ error_overread: TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -884,8 +884,8 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok: MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -920,9 +920,9 @@ error_overread: TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -1141,8 +1141,8 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok: MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Return success MOVQ $0x00000000, ret+24(FP) @@ -1787,9 +1787,9 @@ empty_seqs: TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -2281,8 +2281,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Update the context MOVQ ctx+16(FP), AX @@ -2349,9 +2349,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -2801,8 +2801,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Update the context MOVQ ctx+16(FP), AX @@ -2869,9 +2869,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 MOVQ br+8(FP), CX MOVQ 24(CX), DX - MOVBQZX 32(CX), BX + MOVBQZX 40(CX), BX MOVQ (CX), AX - MOVQ 8(CX), SI + MOVQ 32(CX), SI ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX @@ -3465,8 +3465,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), AX MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) + MOVB BL, 40(AX) + MOVQ SI, 32(AX) // Update the context MOVQ ctx+16(FP), AX @@ -3533,9 +3533,9 @@ error_not_enough_space: TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 MOVQ br+8(FP), BX MOVQ 24(BX), AX - MOVBQZX 32(BX), DX + MOVBQZX 40(BX), DX MOVQ (BX), CX - MOVQ 8(BX), BX + MOVQ 32(BX), BX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX @@ -4087,8 +4087,8 @@ handle_loop: loop_finished: MOVQ br+8(FP), CX MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) + MOVB DL, 40(CX) + MOVQ BX, 32(CX) // Update the context MOVQ ctx+16(FP), AX diff --git a/zstd/seqdec_generic.go b/zstd/seqdec_generic.go index 2fb35b788..7cec2197c 100644 --- a/zstd/seqdec_generic.go +++ b/zstd/seqdec_generic.go @@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error { } for i := range seqs { var ll, mo, ml int - if len(br.in) > 4+((maxOffsetBits+16+16)>>3) { + if br.cursor > 4+((maxOffsetBits+16+16)>>3) { // inlined function: // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) diff --git a/zstd/seqenc.go b/zstd/seqenc.go index 8014174a7..65045eabd 100644 --- a/zstd/seqenc.go +++ b/zstd/seqenc.go @@ -69,7 +69,6 @@ var llBitsTable = [maxLLCode + 1]byte{ func llCode(litLength uint32) uint8 { const llDeltaCode = 19 if litLength <= 63 { - // Compiler insists on bounds check (Go 1.12) return llCodeTable[litLength&63] } return uint8(highBit(litLength)) + llDeltaCode @@ -102,7 +101,6 @@ var mlBitsTable = [maxMLCode + 1]byte{ func mlCode(mlBase uint32) uint8 { const mlDeltaCode = 36 if mlBase <= 127 { - // Compiler insists on bounds check (Go 1.12) return mlCodeTable[mlBase&127] } return uint8(highBit(mlBase)) + mlDeltaCode diff --git a/zstd/zstd.go b/zstd/zstd.go index 066bef2a4..6252b46ae 100644 --- a/zstd/zstd.go +++ b/zstd/zstd.go @@ -5,10 +5,11 @@ package zstd import ( "bytes" - "encoding/binary" "errors" "log" "math" + + "github.com/klauspost/compress/internal/le" ) // enable debug printing @@ -110,11 +111,11 @@ func printf(format string, a ...interface{}) { } func load3232(b []byte, i int32) uint32 { - return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:]) + return le.Load32(b, i) } func load6432(b []byte, i int32) uint64 { - return binary.LittleEndian.Uint64(b[:len(b):len(b)][i:]) + return le.Load64(b, i) } type byter interface {