Description
We've been having regular hangs in CI, where one of the vimto based tests (usually 6.1 but not exclusively so) hangs for 10 minutes and then times out with a weird goroutine dump:
panic: test timed out after 35s
running tests:
TestGetSetJumpOp (35s)
goroutine 63 [running]:
testing.(*M).startAlarm.func1()
/usr/local/go/src/testing/testing.go:2484 +0x394
created by time.goFunc
/usr/local/go/src/time/sleep.go:215 +0x2d
goroutine 1 [chan receive]:
testing.(*T).Run(0xc0000a61c0, {0x71e894?, 0xc00010db30?}, 0x731870)
/usr/local/go/src/testing/testing.go:1859 +0x431
testing.runTests.func1(0xc0000a61c0)
/usr/local/go/src/testing/testing.go:2279 +0x37
testing.tRunner(0xc0000a61c0, 0xc00010dc70)
/usr/local/go/src/testing/testing.go:1792 +0xf4
testing.runTests(0xc00008e0a8, {0x964680, 0x10, 0x10}, {0x96b100?, 0x7?, 0x969ec0?})
/usr/local/go/src/testing/testing.go:2277 +0x4b4
testing.(*M).Run(0xc0000905a0)
/usr/local/go/src/testing/testing.go:2142 +0x64a
main.main()
_testmain.go:83 +0x9b
goroutine 35 [runnable]:
syscall.Syscall(0x1, 0x1, 0xc0000e4440, 0x2b)
/usr/local/go/src/syscall/syscall_linux.go:73 +0x25
syscall.write(0xc0000a4060?, {0xc0000e4440?, 0x7f4b5b22fa10?, 0x42f6e0?})
/usr/local/go/src/syscall/zsyscall_linux_amd64.go:964 +0x3b
syscall.Write(...)
/usr/local/go/src/syscall/syscall_unix.go:211
internal/poll.ignoringEINTRIO(...)
/usr/local/go/src/internal/poll/fd_unix.go:745
internal/poll.(*FD).Write(0xc0000a4060, {0xc0000e4440, 0x2b, 0x40})
/usr/local/go/src/internal/poll/fd_unix.go:381 +0x394
os.(*File).write(...)
/usr/local/go/src/os/file_posix.go:46
os.(*File).Write(0xc000084030, {0xc0000e4440?, 0x2b, 0xc00008aea0?})
/usr/local/go/src/os/file.go:195 +0x4e
fmt.Fprintf({0x788d00, 0xc000084030}, {0xc00008adb0, 0xe}, {0xc00008aea0, 0x1, 0x1})
/usr/local/go/src/fmt/print.go:225 +0x97
testing.(*chattyPrinter).Updatef(0xc000018120?, {0xc000018120?, 0xc0000100c0?}, {0x71d8c8?, 0x418e9b?}, {0xc00008aea0, 0x1, 0x1})
/usr/local/go/src/testing/testing.go:600 +0x187
testing.(*T).Run(0xc000140540, {0xc0000100c0?, 0xc00008af40?}, 0xc0000100d0)
/usr/local/go/src/testing/testing.go:1842 +0x331
github.com/cilium/ebpf/asm.TestGetSetJumpOp.func1(...)
/home/lorenz/dev/ebpf/asm/opcode_test.go:12
github.com/cilium/ebpf/asm.TestGetSetJumpOp(0xc000140540)
/home/lorenz/dev/ebpf/asm/opcode_test.go:49 +0x3e9
testing.tRunner(0xc000140540, 0x731870)
/usr/local/go/src/testing/testing.go:1792 +0xf4
created by testing.(*T).Run in goroutine 1
/usr/local/go/src/testing/testing.go:1851 +0x413
FAIL github.com/cilium/ebpf/asm 38.394s
(on 6.1.129 from ci-kernels)
The weird part is that the runtime seems to be stuck in a write to stdout. I've managed to reproduce this in a VM which mimics GH Actions and on my local machine, running different host kernels and qemu versions.
It does seem like this is an issue with the guest kernel, since I can't trigger the hang on older versions like v5.4. I've started bisecting the problem but it's been very slow going due to compilation of old kernels on new toolchains being pretty broken.
I did get one money shot from one of the crashes of asm
:
panic: test timed out after 1m0s
running tests:
TestGetSetJumpOp (1m0s)
goroutine 51 [running]:
testing.(*M).startAlarm.func1()
/usr/local/go/src/testing/testing.go:2484 +0x394
created by time.goFunc
/usr/local/go/src/time/sleep.go:215 +0x2d
goroutine 1 [chan receive]:
testing.(*T).Run(0xc000003a40, {0x71e894?, 0xc0000efb30?}, 0x731870)
/usr/local/go/src/testing/testing.go:1859 +0x431
testing.runTests.func1(0xc000003a40)
/usr/local/go/src/testing/testing.go:2279 +0x37
testing.tRunner(0xc000003a40, 0xc0000efc70)
/usr/local/go/src/testing/testing.go:1792 +0xf4
testing.runTests(0xc00000e0c0, {0x964680, 0x10, 0x10}, {0x96b100?, 0x7?, 0x969ec0?})
/usr/local/go/src/testing/testing.go:2277 +0x4b4
testing.(*M).Run(0xc00008a500)
/usr/local/go/src/testing/testing.go:2142 +0x64a
main.main()
_testmain.go:83 +0x9b
goroutine 23 [runnable]:
syscall.Syscall(0x1, 0x1, 0xc00001c540, 0x2b)
/usr/local/go/src/syscall/syscall_linux.go:73 +0x25
syscall.write(0xc000012180?, {0xc00001c540?, 0xc000086c00?, 0x45ea9e?})
/usr/local/go/src/syscall/zsyscall_linux_amd64.go:964 +0x3b
syscall.Write(...)
/usr/local/go/src/syscall/syscall_unix.go:211
internal/poll.ignoringEINTRIO(...)
/usr/local/go/src/internal/poll/fd_unix.go:745
internal/poll.(*FD).Write(0x0, {0x487faa, 0x95fd00, 0x40})
/usr/local/go/src/internal/poll/fd_unix.go:381 +0x394
os.(*File).write(...)
/usr/local/go/src/os/file_posix.go:46
os.(*File).Write(0xc000086d48, {0xc000086d48?, 0x419d5b, 0x96b100?})
/usr/local/go/src/os/file.go:195 +0x4e
runtime: g 23: unexpected return pc for os.(*File).Write called from 0x4
stack: frame={sp:0xc000086cb0, fp:0xc000086d10} stack=[0xc000086000,0xc000087000)
0x000000c000086bb0: 0x0000000000012100 0x0000000000000000
0x000000c000086bc0: 0x000000000047dbd2 <runtime.systemstack+0x0000000000000032> 0x000000c000086c08
0x000000c000086bd0: 0x000000000041c34b <runtime.(*mcache).refill+0x00000000000001ab> 0x000000000098b4a0
0x000000c000086be0: 0x0000000000000000 0x0000000000000190
0x000000c000086bf0: 0x000000c000086c10 0x00007f920c3cb1e8
0x000000c000086c00: 0x0000000000540010 <go/ast.(*CommentGroup).Pos+0x0000000000000010> 0x000000c000086c40
0x000000c000086c10: 0x000000000041887e <runtime.(*mcache).nextFree+0x000000000000009e> 0x00007f920c3cb1e8
0x000000c000086c20: 0x0000000000000018 0x000000000000dbd2
0x000000c000086c30: 0x00007f920c3cb1e8 0x0000000000000010
0x000000c000086c40: 0x000000c000086ca0 0x00000000004192c5 <runtime.mallocgcSmallScanNoHeader+0x00000000000002e5>
0x000000c000086c50: 0x00007f9252ffc5c0 0x000000000041ca10 <runtime.(*mcentral).cacheSpan+0x0000000000000170>
0x000000c000086c60: 0x000000c000006ca8 0x00007f920c3cb328
0x000000c000086c70: 0x080101000000000a 0x00007f920c3cb1e8
0x000000c000086c80: 0x00007f9252ffc5c0 0x0000000000000060
0x000000c000086c90: 0x000000000096b100 0x000000c000188000
0x000000c000086ca0: 0x000000c000086cd0 0x0000000000475c19 <runtime.mallocgc+0x0000000000000099>
0x000000c000086cb0: <0x0000000000000058 0x000000000070ede0
0x000000c000086cc0: 0x0000000000000030 0x000000000096b100
0x000000c000086cd0: 0x000000c000086cf8 0x0000000000419ca5 <runtime.newobject+0x0000000000000025>
0x000000c000086ce0: 0x0000000000000058 0x000000000070ede0
0x000000c000086cf0: 0x000000c000003d01 0x000000c000086d60
0x000000c000086d00: 0x0000000000443e29 <runtime.acquireSudog+0x0000000000000229> 0x000000c000086d30
0x000000c000086d10: >0x0000000000475159 <sync/atomic.StorePointer+0x0000000000000039> 0x0000000000000030
0x000000c000086d20: 0x00000000006fc7a0 0x000000c000184000
0x000000c000086d30: 0x000000c000086e00 0x000000000048a977 <internal/sync.(*HashTrieMap[go.shape.interface {},go.shape.interface {}]).Swap+0x00000000000003d7>
0x000000c000086d40: 0x000000c000086d60 0x000000000047810e <runtime.gopark+0x00000000000000ce>
0x000000c000086d50: 0x0000000000419214 <runtime.mallocgcSmallScanNoHeader+0x0000000000000234> 0x000000c000034508
0x000000c000086d60: 0x000000c000086dd8 0x0000000000411b45 <runtime.chanrecv+0x0000000000000445>
0x000000c000086d70: 0x0301010000788d00 0x030000000c3cb3c8
0x000000c000086d80: 0x00007f920c3cb3c8 0x00007f9252ffc5c0
0x000000c000086d90: 0x0000000000000018 0x000000000096b100
0x000000c000086da0: 0x0000000000000000 0x000000c000086dd8
0x000000c000086db0: 0x000000c000188000 0x000000c00012aa70
0x000000c000086dc0: 0x000000c000003dc0 0x000000c000086e00
0x000000c000086dd0: 0x000000000044d61f <runtime.newproc+0x000000000000003f> 0x000000c000086e00
0x000000c000086de0: 0x00000000004116d2 <runtime.chanrecv1+0x0000000000000012> 0x000000c00012aa10
0x000000c000086df0: 0x000000c000086e57 0x0000000000532501 <testing.(*T).Run+0x0000000000000361>
0x000000c000086e00: 0x000000c000086ed8 0x00000000005325d1 <testing.(*T).Run+0x0000000000000431>
created by testing.(*T).Run in goroutine 1
/usr/local/go/src/testing/testing.go:1851 +0x413
FAIL github.com/cilium/ebpf/asm 64.861s
This tells me we are looking at memory corruption. Since the crash is in the asm package test I'm fairly certain that the bug isn't in our (unsafe) code, since asm doesn't import anything meaningfully unsafe.
Since we also see a lot of crashes where the stack indicates a write to stdout, I'm guessing that the bug is somewhere in the virtio-console code because that is what stdout for the test program is mapped to.