Skip to content

Commit 24dc135

Browse files
committed
record: add a bitflip check to WAL corruption
When a WAL chunk fails checksum validation, check whether a single bit flip accounts for the mismatch and, if so, include its location in the corruption error. Similar to #4406.
1 parent 1e3ee7b commit 24dc135

File tree

5 files changed

+127
-41
lines changed

5 files changed

+127
-41
lines changed

internal/bitflip/bitflip.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package bitflip
2+
3+
// CheckSliceForBitFlip reports whether flipping exactly one bit of data yields
// a buffer whose checksum, as computed by computeChecksum over the whole
// slice, equals expectedChecksum.
//
// On success it returns found=true together with the byte index and the bit
// position (0 is the least significant bit) of the flip. Bytes are probed in
// increasing index order and, within a byte, bits from least to most
// significant; the first match in that order is reported. When no single-bit
// flip matches, it returns (false, 0, 0).
//
// Only the first 40 KiB of data are probed, because every probed bit costs one
// full checksum computation; a flip located past that limit is not detected.
//
// data is modified in place while probing and each flipped bit is restored
// right after its checksum is computed, so the contents are unchanged on
// return. Callers must not access data concurrently during the call.
func CheckSliceForBitFlip(
	data []byte, computeChecksum func([]byte) uint32, expectedChecksum uint32,
) (found bool, indexFound int, bitFound int) {
	// TODO(edward) This checking process likely can be made faster.
	const iterationLimit = 40 * (1 << 10) // 40KB

	// The bound is loop-invariant, so compute it once.
	limit := min(len(data), iterationLimit)
	for i := 0; i < limit; i++ {
		if ok, bit := checkByteForFlip(data, i, computeChecksum, expectedChecksum); ok {
			return true, i, bit
		}
	}
	return false, 0, 0
}

// checkByteForFlip flips each of the 8 bits of data[i] in turn, from least to
// most significant, and reports the first bit position for which the checksum
// of the whole slice equals expectedChecksum. The flipped bit is always
// restored before the result is examined, so data is unchanged on return.
// It returns (false, 0) when no bit of data[i] produces a match.
func checkByteForFlip(
	data []byte, i int, computeChecksum func([]byte) uint32, expectedChecksum uint32,
) (found bool, bit int) {
	for b := 0; b < 8; b++ {
		mask := byte(1) << b
		data[i] ^= mask
		computed := computeChecksum(data)
		// Undo the flip before deciding, so data is intact on every exit path.
		data[i] ^= mask
		if computed == expectedChecksum {
			return true, b
		}
	}
	return false, 0
}

open_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"os"
1717
"path/filepath"
1818
"reflect"
19+
"regexp"
1920
"runtime"
2021
"runtime/debug"
2122
"slices"
@@ -1789,3 +1790,68 @@ func TestWALCorruption(t *testing.T) {
17891790
})
17901791
require.True(t, errors.Is(err, ErrCorruption))
17911792
}
1793+
1794+
func TestWALCorruptionBitFlip(t *testing.T) {
1795+
fs := vfs.NewMem()
1796+
d, err := Open("", testingRandomized(t, &Options{
1797+
FS: fs,
1798+
FormatMajorVersion: FormatWALSyncChunks,
1799+
}))
1800+
require.NoError(t, err)
1801+
require.NoError(t, d.Flush())
1802+
1803+
fourKiBValue := bytes.Repeat([]byte{'a'}, 4096)
1804+
for i := 1; i <= 32; i++ {
1805+
require.NoError(t, d.Set([]byte(fmt.Sprintf("key-%d", i)), fourKiBValue, Sync))
1806+
}
1807+
require.NoError(t, d.Close())
1808+
1809+
// We should have two WALs.
1810+
logs, err := fs.List("")
1811+
require.NoError(t, err)
1812+
logs = slices.DeleteFunc(logs, func(s string) bool { return filepath.Ext(s) != ".log" })
1813+
sort.Slice(logs, func(i, j int) bool {
1814+
return logs[i] < logs[j]
1815+
})
1816+
lastLog := logs[len(logs)-1]
1817+
1818+
// Corrupt the WAL by flipping one byte, 100 bytes from the end
1819+
// of the file.
1820+
f, err := fs.OpenReadWrite(lastLog, vfs.WriteCategoryUnspecified)
1821+
require.NoError(t, err)
1822+
1823+
buf := []byte{0}
1824+
_, err = f.ReadAt(buf, 100)
1825+
require.NoError(t, err)
1826+
1827+
bitToFlip := byte(1 << rand.IntN(8))
1828+
buf[0] ^= bitToFlip
1829+
_, err = f.WriteAt(buf, 100)
1830+
require.NoError(t, err)
1831+
require.NoError(t, f.Close())
1832+
t.Logf("zeroed one byte in %s at offset %d\n", lastLog, 100)
1833+
1834+
// Re-opening the database should detect and report the corruption and bit flip.
1835+
_, err = Open("", &Options{
1836+
FS: fs,
1837+
FormatMajorVersion: FormatWALSyncChunks,
1838+
})
1839+
require.True(t, errors.Is(err, ErrCorruption))
1840+
1841+
checkBitFlipErr := func(err error, t *testing.T) bool {
1842+
if err != nil {
1843+
details := errors.GetAllSafeDetails(err)
1844+
re := regexp.MustCompile(`bit flip found.+byte index \d+\. got: 0x[0-9A-Fa-f]{1,2}\. want: 0x[0-9A-Fa-f]{1,2}\.`)
1845+
for _, d := range details {
1846+
for _, s := range d.SafeDetails {
1847+
if re.MatchString(s) {
1848+
return true
1849+
}
1850+
}
1851+
}
1852+
require.Fail(t, "expected at least one detail to match bit flip found pattern", err)
1853+
}
1854+
return false
1855+
}
1856+
checkBitFlipErr(err, t)
1857+
}

record/record.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ import (
112112

113113
"github.com/cockroachdb/errors"
114114
"github.com/cockroachdb/pebble/internal/base"
115+
"github.com/cockroachdb/pebble/internal/bitflip"
115116
"github.com/cockroachdb/pebble/internal/crc"
116117
)
117118

@@ -359,9 +360,19 @@ func (r *Reader) nextChunk(wantFirst bool) error {
359360
r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin)
360361
return ErrInvalidChunk
361362
}
362-
if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() {
363+
data := r.buf[r.begin-headerSize+6 : r.end]
364+
if checksum != crc.New(data).Value() {
365+
computeChecksum := func(data []byte) uint32 { return crc.New(data).Value() }
366+
// Check if there was a bit flip.
367+
found, indexFound, bitFound := bitflip.CheckSliceForBitFlip(data, computeChecksum, checksum)
368+
err := ErrInvalidChunk
369+
if found {
370+
err = errors.WithSafeDetails(err, ". bit flip found: block num %d. wal offset %d. byte index %d. got: 0x%x. want: 0x%x.",
371+
errors.Safe(r.blockNum), errors.Safe(r.invalidOffset), errors.Safe(indexFound), errors.Safe(data[indexFound]), errors.Safe(data[indexFound]^(1<<bitFound)))
372+
}
373+
363374
r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin)
364-
return ErrInvalidChunk
375+
return err
365376
}
366377
if wantFirst {
367378
if chunkPosition != fullChunkPosition && chunkPosition != firstChunkPosition {

sstable/block/block.go

Lines changed: 15 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/cockroachdb/crlib/fifo"
1818
"github.com/cockroachdb/errors"
1919
"github.com/cockroachdb/pebble/internal/base"
20+
"github.com/cockroachdb/pebble/internal/bitflip"
2021
"github.com/cockroachdb/pebble/internal/cache"
2122
"github.com/cockroachdb/pebble/internal/crc"
2223
"github.com/cockroachdb/pebble/internal/invariants"
@@ -174,53 +175,30 @@ func ValidateChecksum(checksumType ChecksumType, b []byte, bh Handle) error {
174175
if expectedChecksum != computedChecksum {
175176
// Check if the checksum was due to a singular bit flip and report it.
176177
data := slices.Clone(b[:bh.Length+1])
177-
found, indexFound, bitFound := checkSliceForBitFlip(data, checksumType, expectedChecksum)
178+
var checksumFunction func([]byte) uint32
179+
switch checksumType {
180+
case ChecksumTypeCRC32c:
181+
checksumFunction = func(data []byte) uint32 {
182+
return crc.New(data).Value()
183+
}
184+
case ChecksumTypeXXHash64:
185+
checksumFunction = func(data []byte) uint32 {
186+
return uint32(xxhash.Sum64(data))
187+
}
188+
}
189+
found, indexFound, bitFound := bitflip.CheckSliceForBitFlip(data, checksumFunction, expectedChecksum)
178190
err := base.CorruptionErrorf("block %d/%d: %s checksum mismatch %x != %x",
179191
errors.Safe(bh.Offset), errors.Safe(bh.Length), checksumType,
180192
expectedChecksum, computedChecksum)
181193
if found {
182-
err = errors.WithSafeDetails(err, ". bit flip found: byte index %d. got: %x. want: %x.",
183-
indexFound, data[indexFound], data[indexFound]^(1<<bitFound))
194+
err = errors.WithSafeDetails(err, ". bit flip found: byte index %d. got: 0x%x. want: 0x%x.",
195+
errors.Safe(indexFound), errors.Safe(data[indexFound]), errors.Safe(data[indexFound]^(1<<bitFound)))
184196
}
185197
return err
186198
}
187199
return nil
188200
}
189201

190-
func checkSliceForBitFlip(
191-
data []byte, checksumType ChecksumType, expectedChecksum uint32,
192-
) (found bool, indexFound int, bitFound int) {
193-
// TODO(edward) This checking process likely can be made faster.
194-
iterationLimit := 40 * (1 << 10) // 40KB
195-
for i := 0; i < min(len(data), iterationLimit); i++ {
196-
foundFlip, bit := checkByteForFlip(data, i, checksumType, expectedChecksum)
197-
if foundFlip {
198-
return true, i, bit
199-
}
200-
}
201-
return false, 0, 0
202-
}
203-
204-
func checkByteForFlip(
205-
data []byte, i int, checksumType ChecksumType, expectedChecksum uint32,
206-
) (found bool, bit int) {
207-
for bit := 0; bit < 8; bit++ {
208-
data[i] ^= (1 << bit)
209-
var computedChecksum uint32
210-
switch checksumType {
211-
case ChecksumTypeCRC32c:
212-
computedChecksum = crc.New(data).Value()
213-
case ChecksumTypeXXHash64:
214-
computedChecksum = uint32(xxhash.Sum64(data))
215-
}
216-
data[i] ^= (1 << bit)
217-
if computedChecksum == expectedChecksum {
218-
return true, bit
219-
}
220-
}
221-
return false, 0
222-
}
223-
224202
// Metadata is an in-memory buffer that stores metadata for a block. It is
225203
// allocated together with the buffer storing the block and is initialized once
226204
// when the block is read from disk.

sstable/reader_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,15 +1512,15 @@ func TestReaderChecksumErrors(t *testing.T) {
15121512
checkBitFlipErr := func(err error) bool {
15131513
if err != nil {
15141514
details := errors.GetAllSafeDetails(err)
1515-
re := regexp.MustCompile(`bit flip found`)
1515+
re := regexp.MustCompile(`bit flip found.+byte index \d+\. got: 0x[0-9A-Fa-f]{1,2}\. want: 0x[0-9A-Fa-f]{1,2}\.`)
15161516
for _, d := range details {
15171517
for _, s := range d.SafeDetails {
15181518
if re.MatchString(s) {
15191519
return true
15201520
}
15211521
}
15221522
}
1523-
require.Fail(t, "expected at least one detail to match bit flip found", err)
1523+
require.Fail(t, "expected at least one detail to match bit flip pattern", err)
15241524
}
15251525
return false
15261526
}

0 commit comments

Comments
 (0)