diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..00753718 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,20 @@ +{ + "permissions": { + "allow": [ + "Bash(cargo test:*)", + "Bash(grep:*)", + "Bash(RUST_BACKTRACE=1 cargo test --features \"composefs/pre-6.15\" tar::tests::test_empty_tar)", + "Bash(git add:*)", + "Bash(git commit:*)", + "Bash(git rebase:*)", + "Bash(git reset:*)", + "Bash(git stash:*)", + "Bash(cargo fmt:*)", + "Bash(cargo:*)", + "WebFetch(domain:github.com)", + "Bash(find:*)", + "Bash(rg:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 68387d95..5689f7ec 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -17,22 +17,19 @@ jobs: strategy: matrix: example: - - { dir: 'bls', os: 'arch' } - - { dir: 'bls', os: 'fedora' } - - { dir: 'bls', os: 'fedora-compat' } - - { dir: 'bls', os: 'rawhide' } - - { dir: 'bls', os: 'rhel9' } - # This one is currently failing, needs debugging - # https://github.com/containers/composefs-rs/pull/168#pullrequestreview-3088673152 - # We believe it's mount API changes causing /sysroot to be mounted - # at the wrong place. 
+ # Temporarily disabled for faster debugging + # - { dir: 'bls', os: 'arch' } + # - { dir: 'bls', os: 'fedora' } + # - { dir: 'bls', os: 'fedora-compat' } + # - { dir: 'bls', os: 'rawhide' } + # - { dir: 'bls', os: 'rhel9' } # - { dir: 'bls', os: 'ubuntu' } - - { dir: 'uki', os: 'arch' } + # - { dir: 'uki', os: 'arch' } - { dir: 'uki', os: 'fedora' } - - { dir: 'unified', os: 'fedora' } - - { dir: 'unified-secureboot', os: 'fedora' } - - { dir: 'bls', os: 'arch', fsfmt: 'ext4', verity: 'none' } - - { dir: 'bls', os: 'arch', fsfmt: 'xfs', verity: 'none' } + # - { dir: 'unified', os: 'fedora' } + # - { dir: 'unified-secureboot', os: 'fedora' } + # - { dir: 'bls', os: 'arch', fsfmt: 'ext4', verity: 'none' } + # - { dir: 'bls', os: 'arch', fsfmt: 'xfs', verity: 'none' } fail-fast: false steps: @@ -88,8 +85,78 @@ jobs: run: sudo cp examples/bls/test-thing.workarounds/systemd-ssh-proxy /usr/lib/systemd - name: Run example tests + id: run_tests + continue-on-error: true run: | export PATH="${HOME}/bin:${PATH}" export FS_FORMAT=${{ matrix.example.fsfmt }} export FS_VERITY_MODE=${{ matrix.example.verity }} examples/test/run ${{ matrix.example.dir }} ${{ matrix.example.os }} + + - name: Dump console logs on failure + if: steps.run_tests.outcome == 'failure' + run: | + echo "=== Test.thing directories ===" + ls -laR /run/user/$(id -u)/test.thing/ 2>/dev/null || true + echo "" + echo "=== Console logs from failed VMs ===" + find /run/user/$(id -u)/test.thing -name console -type f 2>/dev/null | while read console; do + echo "==== Console from $console ====" + cat "$console" || true + echo "" + done + echo "=== QEMU logs from failed VMs ===" + find /run/user/$(id -u)/test.thing -name qemu.log -type f 2>/dev/null | while read qemu_log; do + echo "==== QEMU log from $qemu_log ====" + cat "$qemu_log" || true + echo "" + done + echo "=== Serial logs from failed VMs ===" + find /run/user/$(id -u)/test.thing -name serial.log -type f 2>/dev/null | while read serial_log; do + echo 
"==== Serial log from $serial_log ====" + cat "$serial_log" || true + echo "" + done + echo "=== SSH debug logs from failed VMs ===" + find /run/user/$(id -u)/test.thing -name ssh.log -type f 2>/dev/null | while read ssh_log; do + echo "==== SSH log from $ssh_log ====" + cat "$ssh_log" || true + echo "" + done + echo "=== QMP sockets ===" + find /run/user/$(id -u)/test.thing -name qmp -type s 2>/dev/null || true + + - name: Check vsock device permissions + if: steps.run_tests.outcome == 'failure' + run: | + echo "=== vsock device status ===" + ls -la /dev/vhost-vsock /dev/kvm || true + echo "" + echo "=== vsock kernel module ===" + lsmod | grep vsock || true + echo "" + echo "=== Test vsock connection ===" + python3 -c "import socket; sock = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM); print('vsock socket created successfully')" || echo "Failed to create vsock socket" + + - name: Capture test.thing runtime directory + if: steps.run_tests.outcome == 'failure' + run: | + echo "=== test.thing IPC directory contents ===" + find /run/user/$(id -u)/test.thing -ls 2>/dev/null || true + + - name: Show build artifacts for debugging + if: steps.run_tests.outcome == 'failure' + run: | + cd examples/${{ matrix.example.dir }} + echo "=== Boot directory contents ===" + find tmp/efi -ls 2>/dev/null || true + echo "" + echo "=== Sysroot composefs images ===" + ls -lh tmp/sysroot/composefs/images/ 2>/dev/null || true + echo "" + echo "=== Sysroot state deployments ===" + ls -lh tmp/sysroot/state/deploy/ 2>/dev/null || true + + - name: Fail job if tests failed + if: steps.run_tests.outcome == 'failure' + run: exit 1 diff --git a/BUG.md b/BUG.md new file mode 100644 index 00000000..c44ba3dd --- /dev/null +++ b/BUG.md @@ -0,0 +1,383 @@ +# CI Failure Investigation - composefs-rs Examples + +## Problem Statement + +CI examples workflow failing for three Fedora jobs (unified, uki, unified-secureboot) starting around Oct 13, 2025. 
VMs fail to boot with empty console logs and SSH connection failures. + +## Timeline + +- **Last passing**: Oct 3, 2025 +- **First failure**: Oct 13+ 2025 +- **Package changes identified**: systemd 257.9-2 → 257.10-1, kernel 6.16.9 → 6.16.11+ + +## Root Cause Analysis + +### Investigation Steps + +1. **Added comprehensive logging infrastructure** + - QEMU stdout/stderr capture to `qemu.log` + - Serial console logging to `serial.log` + - Modified `IpcDirectory` to preserve logs on test failures + - Added kernel cmdline parameters for verbose logging + +2. **Discovered QEMU parameter parsing issues** + + **Issue #1**: Initial kernel cmdline `earlyprintk=serial,ttyS0,115200` was being rejected by QEMU + ``` + kvm: Invalid parameter 'ttyS0' + ``` + - QEMU's SMBIOS parser treats commas as parameter delimiters + - `ttyS0` was interpreted as a QEMU parameter, not part of kernel cmdline + + **Issue #2**: After fixing #1, `console=ttyS0,115200` still caused errors + ``` + kvm: Invalid parameter '115200 debug loglevel' + ``` + - Same comma parsing issue with QEMU SMBIOS + + **Issue #3**: SSH public key credential with spaces failed + - QEMU SMBIOS parsing stricter with spaces in credential values + - SSH public keys contain spaces between algorithm, key data, and comment + +### Fixes Applied + +#### 1. Kernel Command Line (commit 2ba8188, superseded by commit 5efbd1f) +**File**: `examples/testthing.py:649` + +Removed problematic serial console parameters from SMBIOS kernel cmdline: +```python +# CURRENT +"type=11,value=io.systemd.boot.kernel-cmdline-extra=debug loglevel=7 systemd.journald.forward_to_console=1" +``` + +**However**: These SMBIOS parameters are NOT being applied by the EFI stub. The actual kernel command line only contains parameters baked into the UKI (see "CRITICAL FINDING" below). + +Rationale: We have `-serial file:{path}/serial.log` device configured separately, and the UKI has `console=ttyS0,115200n8` baked in. + +#### 2. 
SMBIOS Credentials Base64 Encoding (commit 1ccdee4) +**File**: `examples/testthing.py:669-677` + +Added automatic base64 encoding for credential values containing spaces: +```python +# Import base64 module +import base64 + +# Modified credential passing +*( + ( + "-smbios", + f"type=11,value=io.systemd.credential.binary:{k}={base64.b64encode(v.encode()).decode()}" + if " " in v + else f"type=11,value=io.systemd.credential:{k}={v}", + ) + for k, v in creds.items() +), +``` + +Uses `io.systemd.credential.binary:` prefix with base64 encoding for values with spaces (particularly SSH public keys) to avoid QEMU SMBIOS parsing ambiguities. + +Reference: https://systemd.io/CREDENTIALS/ + +#### 3. Enhanced Logging (commits 5a8f8b9, 4add586, 1c15f88, 80f1146) +**Files**: `examples/testthing.py`, `.github/workflows/examples.yml` + +- Added QEMU log capture with `stdout`/`stderr` to `qemu.log` +- Added serial port logging: `("-serial", f"file:{self._ipc}/serial.log")` +- Modified `IpcDirectory.__exit__()` to skip cleanup on exceptions using `finalizer.detach()` +- Updated CI workflow to dump all log files on failure + +## Current Status + +### ✅ Fixed +- QEMU parameter parsing errors resolved +- QEMU now starts successfully +- Serial console logging now captures full boot sequence +- Console configuration: `console=ttyS0,115200n8` in UKI + +### ⚠️ CRITICAL FINDING: SMBIOS Kernel Parameters Not Applied + +**The EFI stub is NOT reading SMBIOS `io.systemd.boot.kernel-cmdline-extra` parameters.** + +Evidence from kernel command line in serial.log: +``` +Command line: composefs= rw console=ttyS0,115200n8 systemd.machine_id= +``` + +QEMU is invoked with: +``` +-smbios 'type=11,value=io.systemd.boot.kernel-cmdline-extra=debug loglevel=7 systemd.journald.forward_to_console=1' +``` + +But `debug loglevel=7 systemd.journald.forward_to_console=1` is **completely missing** from the actual kernel command line. 
+ +**Implication**: We've been assuming debug parameters were active, but they weren't. The kernel is not running with `debug loglevel=7`, and journald is not forwarding to console. + +### ✅ VM Boot Success Confirmed +- VM boots completely through to multi-user.target +- sshd.service starts successfully +- Full kernel boot messages visible in serial.log +- Example from serial.log: + ``` + Command line: composefs= rw console=ttyS0,115200n8 systemd.machine_id= + [boot messages...] + OK Started sshd.service - OpenSSH server daemon. + OK Reached target multi-user.target - Multi-User System. + ``` + +### ❌ Outstanding Issue: SSH Connection Failure + +**Symptom**: VM boots successfully, sshd starts, but test SSH connection fails + +**Error**: +``` +testthing.SubprocessError: Subprocess exited unexpectedly with return code 255 +ssh_dispatch_run_fatal: Connection to UNKNOWN port 0: Broken pipe +``` + +**Boot sequence**: +``` +UEFI/OVMF starts ✓ +systemd-boot menu appears ✓ +EFI stub loads initrd ✓ +Kernel boot complete ✓ +systemd startup ✓ +sshd.service starts ✓ +multi-user.target reached ✓ +SSH connection (vsock) ✗ (broken pipe) +``` + +**Configuration verified**: +- Pinned versions correctly installed: kernel 6.16.9-200.fc42, systemd 257.9-2.fc42 +- Kernel cmdline (baked into UKI): `composefs= rw console=ttyS0,115200n8` +- Additional cmdline (via SMBIOS): `debug loglevel=7` +- Composefs image digest calculated correctly during container build +- Serial console captures all output to serial.log + +## Hypothesis + +**Package versions are NOT the root cause**: +- We're already pinning kernel 6.16.9-200.fc42 and systemd 257.9-2.fc42 (Oct 3 working versions) +- VM boots successfully with these versions as confirmed by serial.log +- sshd starts and reaches multi-user.target + +**SSH/vsock connection is the actual problem**: +- VM boot is working correctly +- SSH connection over vsock fails with "Broken pipe" +- This suggests the issue is NOT in the guest, but in: + - vsock 
device communication between host/guest + - GitHub Actions runner environment changes (QEMU version, kernel module) + - systemd-ssh-proxy or SSH connection timing + - vhost-vsock-pci device configuration + +**Possible GitHub Actions runner changes**: +- QEMU version update affecting vsock implementation +- Host kernel vsock module changes +- vhost-vsock driver updates +- Network/socket permissions in runner environment + +**New hypothesis based on SMBIOS finding**: +- Since SMBIOS kernel parameters aren't being applied, this failure may have existed all along +- The tests might have been passing despite this, meaning the failure is unrelated to kernel parameters +- OR: The tests were relying on systemd-boot to apply SMBIOS parameters, but UKI bypasses systemd-boot +- Need to verify: Did tests ever actually work with UKI setup, or only with systemd-boot? + +## Package Change Investigation + +### Kernel 6.16.x Known Issues (from web research) +Multiple Fedora Discussion threads report kernel 6.16.x boot failures: +- Versions affected: 6.16.3, 6.16.5, 6.16.6, 6.16.7 +- Symptoms: Boot hangs, stuck at Fedora logo, ACPI errors +- Workaround: Boot with kernel 6.15.10 or 6.17.x + +**However**: We're already pinned to 6.16.9 (from Oct 3 when tests passed), and serial.log confirms the VM boots successfully. This suggests kernel 6.16.9 itself is not the issue. + +### systemd Changes +- 257.9-2.fc42 (working) → 257.10-1.fc42 (Oct 13-14) +- Timing matches failure window, but we're pinned to 257.9-2 +- VM boots successfully with 257.9-2, so not the cause + +### Other Packages (not pinned) +- selinux-policy-targeted: 41.24 → 41.26 (no boot-related changes found) +- dracut: 105-2.fc42 (stable, no updates in Oct) +- composefs: Not pinned, could have changed +- btrfs-progs: Not pinned, could have changed + +## Possible Causes for SSH Failure + +1. 
**vsock device/driver issue** + - vhost-vsock kernel module on GitHub Actions runner changed + - QEMU vhost-vsock-pci implementation updated + - Try: Add vsock debug logging, test with different guest-cid + +2. **SSH timing issue** + - Test attempts connection before sshd fully ready + - vsock socket not properly established + - Try: Add delay before SSH connection attempt + +3. **Unpinned package regression** + - composefs version changed, affecting boot timing + - openssh-server version changed + - Try: Pin all packages to Oct 3 versions + +4. **GitHub Actions runner environment** + - QEMU version on runners updated + - Host kernel vsock support changed + - Runner security policy blocking vsock + - Try: Test locally with same QEMU parameters + +## Files Modified + +### Primary Changes +- `examples/testthing.py` - QEMU invocation, credential handling, logging +- `.github/workflows/examples.yml` - Log dumping steps, disabled non-failing jobs + +### Supporting Files (pinned versions from earlier work) +- `examples/uki/Containerfile` +- `examples/unified/Containerfile` +- `examples/unified-secureboot/Containerfile` + +## Testing Status + +**Branch**: `debug-ci` +**PR**: https://github.com/containers/composefs-rs/pull/190 +**Test fork**: https://github.com/cgwalters/composefs-rs + +Latest test runs: + +- Run 18631279813 (commit 045f035): Added systemd.journald.forward_to_console=1 + - **CRITICAL DISCOVERY**: SMBIOS kernel parameters NOT applied by EFI stub + - QEMU starts: ✓ + - VM boots: ✓ + - sshd starts: ✓ + - SSH connection: ✗ (broken pipe) + - Journal forwarding: ✗ (parameter never reached kernel) + +- Run 18631024251 (commit e3c926d): Added SSH verbose logging + - SSH debug logs captured to ssh.log + - Protocol negotiation succeeds (OpenSSH 9.6 vs 9.9) + - Connection breaks after SSH2_MSG_KEXINIT sent + - Confirms failure during key exchange phase + +- Run 18624189103 (commit 5efbd1f): Fixed baud rate 114800→115200, removed console=hvc0 from SMBIOS + - QEMU 
starts: ✓ + - VM boots to kernel: ✓ + - Kernel completes boot: ✓ + - sshd starts: ✓ (visible in serial.log) + - multi-user.target: ✓ + - SSH connection: ✗ (broken pipe on vsock connection) + +**Key finding**: VM boots completely and all services start, but SSH over vsock fails during key exchange. SMBIOS kernel parameters are NOT being applied. + +## Next Steps + +### High Priority - SSH/vsock Debugging + +1. **Fix SMBIOS kernel parameter issue** ⚠️ BLOCKING + - Option A: Bake debug parameters into UKI `/etc/kernel/cmdline` (loglevel=7, systemd.journald.forward_to_console=1) + - Option B: Investigate why EFI stub not reading SMBIOS (systemd version issue?) + - Option C: Switch to systemd-boot instead of direct UKI boot to enable SMBIOS cmdline + - **Need to choose approach**: Hardcode in UKI is simplest for debugging + +2. **Get guest-side SSH logs** + - Once journal forwarding works, capture sshd error messages + - Understand why sshd terminates during key exchange + - Check for OpenSSH 9.9 specific issues + +3. **Test OpenSSH version hypothesis** + - Pin openssh-server to Oct 3 version (pre-9.9) + - OR force specific key exchange algorithm to avoid negotiation issues + +4. **Local testing** ✅ COMPLETED + - Issue reproduces locally (not CI-specific) + - Same "Broken pipe" error during key exchange + - Confirms this is a general vsock/SSH problem + +### Lower Priority - Alternative Approaches + +5. **Try different connection method** + - Test with network instead of vsock + - Try serial console connection instead of SSH + +6. **Test with kernel 6.15.10** + - Pin to kernel-6.15.10-200.fc42 as absolute fallback + - Confirm if earlier kernel works in current environment + +## Testing Instructions + +### Running Tests Locally + +To reproduce the issue locally: + +```bash +# Build the UKI container image +cd examples/uki +podman build -t localhost/uki-test . + +# Run the test +cd ../.. 
+python3 examples/testthing.py examples/uki + +# Check logs on failure +ls -la /tmp/test.thing/*/ # or wherever IpcDirectory creates temp dir +cat /tmp/test.thing/*/serial.log # Kernel boot log +cat /tmp/test.thing/*/console # Console output +cat /tmp/test.thing/*/qemu.log # QEMU stdout/stderr +``` + +### Testing in CI (cgwalters fork) + +**CRITICAL: ONLY push to cgwalters fork, NEVER push to debug-ci branch or create PRs** + +The debug-ci branch has PR #190 open to main composefs-rs repo. Any commits to debug-ci will update that PR and create noise for everyone else. + +**ONLY USE THIS WORKFLOW**: +```bash +# Make changes, commit locally on ANY branch (NOT debug-ci) +git add -A +git commit -m "test: Description of change" + +# Push ONLY to cgwalters fork main branch +git push -f cgwalters HEAD:main + +# Monitor workflow on cgwalters fork ONLY +gh run list --repo cgwalters/composefs-rs +gh run watch --repo cgwalters/composefs-rs + +# Download artifacts after failure +gh run download --repo cgwalters/composefs-rs +``` + +**DO NOT**: +- Push to debug-ci branch +- Push to containers/composefs-rs repo +- Create or update pull requests +- Use `git push origin` without being very careful about branch + +### Analyzing Logs + +Key files in artifacts: +- `serial.log` - Serial console (ttyS0) output, shows kernel boot and early systemd +- `console` - virtconsole (hvc0) output, shows login getty +- `qemu.log` - QEMU process stdout/stderr, shows QEMU errors + +Look for: +- Kernel command line in serial.log +- sshd.service start messages +- vsock device initialization +- Any errors or warnings during boot +- SSH connection attempt output + +### Key Configuration Files + +- `examples/uki/Containerfile` - UKI image build, sets console in /etc/kernel/cmdline +- `examples/testthing.py` - Test harness, QEMU invocation and SSH connection (lines 640-677) +- `.github/workflows/examples.yml` - CI workflow + +## References + +- systemd credentials: https://systemd.io/CREDENTIALS/ +- QEMU 
SMBIOS documentation: https://www.qemu.org/docs/master/system/i386/pc.html +- QEMU vsock: https://wiki.qemu.org/Features/VirtioVsock +- Fedora kernel 6.16.x boot issues: https://discussion.fedoraproject.org/t/fedora-42-latest-kernel-update-6-16-6-and-6-16-7-have-crashed-the-system-not-able-to-boot/164682 +- Debug branch: https://github.com/containers/composefs-rs/tree/debug-ci +- Test fork: https://github.com/cgwalters/composefs-rs diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index dff97bae..7cf990e5 100644 --- a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -416,7 +416,7 @@ impl DirectoryBlock { let first = self.get_entry_header(0); let offset = first.name_offset.get(); assert!(offset != 0); - assert!(offset % 12 == 0); + assert!(offset.is_multiple_of(12)); offset as usize / 12 } diff --git a/examples/testthing.py b/examples/testthing.py index 9ec191d0..e6f230b7 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -31,6 +31,7 @@ import argparse import asyncio +import base64 import contextlib import ctypes import functools @@ -106,8 +107,21 @@ def __enter__(self) -> Path: raise FileExistsError def __exit__(self, *args: object) -> None: - """Delete the IPC directory and its contents.""" - del args + """Delete the IPC directory and its contents. + + Skip cleanup if there was an exception, to aid in CI debugging. 
+ """ + exc_type, exc_val, exc_tb = args + print(f"IpcDirectory.__exit__: exc_type={exc_type}, exc_val={exc_val}, has_finalizer={self.finalizer is not None}", flush=True) + # If there was an exception, deactivate the finalizer to preserve logs + if exc_val is not None and self.finalizer: + print(f"Deactivating IpcDirectory finalizer due to exception: {exc_val}", flush=True) + logger.debug(f"Skipping cleanup due to exception: {exc_val}") + # Detach the finalizer so it won't run on garbage collection + self.finalizer.detach() + self.finalizer = None + return + print(f"Running IpcDirectory finalizer", flush=True) if self.finalizer: self.finalizer() @@ -472,6 +486,7 @@ async def _spawn( *args: str | Path | tuple[str | Path, ...], stdin: int | None = None, stdout: int | None = None, + stderr: int | None = None, ) -> asyncio.subprocess.Process: """Spawn a process. @@ -517,6 +532,7 @@ def pr_set_pdeathsig() -> None: *itertools.chain(*_normalize_args(*args)), stdin=stdin, stdout=stdout, + stderr=stderr, preexec_fn=pr_set_pdeathsig, ) @@ -630,8 +646,10 @@ async def _qemu( ("-device", "virtconsole,chardev=console"), ( "-smbios", - "type=11,value=io.systemd.boot.kernel-cmdline-extra=console=hvc0", + "type=11,value=io.systemd.boot.kernel-cmdline-extra=debug loglevel=7 systemd.journald.forward_to_console=1", ), + # Add a serial port for boot messages + ("-serial", f"file:{self._ipc}/serial.log"), *( ( ("-chardev", "stdio,mux=on,signal=off,id=console"), @@ -647,18 +665,30 @@ async def _qemu( ) ), ("-drive", drive), - # Credentials + # Credentials - use binary encoding for values with spaces to avoid QEMU parsing issues *( - ("-smbios", f"type=11,value=io.systemd.credential:{k}={v}") + ( + "-smbios", + f"type=11,value=io.systemd.credential.binary:{k}={base64.b64encode(v.encode()).decode()}" + if " " in v + else f"type=11,value=io.systemd.credential:{k}={v}", + ) for k, v in creds.items() ), ) qemu = None + qemu_log_file = None try: self._print_status("Waiting for guest") - qemu = 
await self._spawn(*args) + # Capture QEMU's stdout/stderr for debugging + qemu_log = self._ipc / "qemu.log" + qemu_log_file = open(qemu_log, "wb") + qemu = await self._spawn(*args, stdout=qemu_log_file.fileno(), stderr=qemu_log_file.fileno()) returncode = await qemu.wait() + # Flush and sync the log file before checking for errors + qemu_log_file.flush() + os.fsync(qemu_log_file.fileno()) if not self._shutdown_ok: raise SubprocessError(args, returncode) except asyncio.CancelledError: @@ -675,10 +705,13 @@ async def _qemu( await asyncio.shield(qemu.wait()) finally: logger.debug("qemu exited") + if qemu_log_file is not None: + qemu_log_file.close() self._qemu_exited.set() async def _ssh_control(self) -> None: ssh = None + ssh_log_file = None try: assert self.ssh_direct_args is not None @@ -686,8 +719,14 @@ async def _ssh_control(self) -> None: control_socket = self._ipc / "ssh" + # Open SSH debug log file + ssh_log_path = self._ipc / "ssh.log" + ssh_log_file = open(ssh_log_path, "wb") + logger.debug(f"SSH debug log: {ssh_log_path}") + args = ( "ssh", + "-vvv", # verbose debugging *self.ssh_direct_args, ("-N", "-n"), # no command, stdin disconnected ("-M", "-S", control_socket), # listen on the control socket @@ -696,6 +735,7 @@ async def _ssh_control(self) -> None: *args, stdin=asyncio.subprocess.DEVNULL, stdout=asyncio.subprocess.PIPE, + stderr=ssh_log_file.fileno(), ) # ssh sends EOF after the connection succeeds @@ -720,6 +760,8 @@ async def _ssh_control(self) -> None: ssh.terminate() await asyncio.shield(ssh.wait()) finally: + if ssh_log_file is not None: + ssh_log_file.close() # We try to reset our state best as possible here to deal with # reboots in the shutdown_ok case: we want the control socket # reestablished when the machine comes back. 
diff --git a/examples/uki/Containerfile b/examples/uki/Containerfile index 02c171a1..7578a7fd 100644 --- a/examples/uki/Containerfile +++ b/examples/uki/Containerfile @@ -20,20 +20,26 @@ RUN --mount=type=cache,target=/var/cache/libdnf5 < /etc/dracut.conf.d/no-xattr.conf + dnf -y install fedora-repos-archive dnf --setopt keepcache=1 install --allowerasing -y \ btrfs-progs \ composefs \ dosfstools \ - kernel \ + kernel-6.16.9-200.fc42 \ openssh-server \ policycoreutils-python-utils \ selinux-policy-targeted \ skopeo \ strace \ - systemd \ - systemd-boot-unsigned \ - systemd-ukify \ + systemd-257.9-2.fc42 \ + systemd-boot-unsigned-257.9-2.fc42 \ + systemd-ukify-257.9-2.fc42 \ util-linux + + # Set SELinux to permissive mode to allow vsock SSH connections + # Root cause: SELinux policy blocks vsock SSH (no AVC denials logged even with audit=1) + # Attempted fixes that failed: specific vsock_socket rules, permissive sshd_t only, package pinning + sed -i 's/^SELINUX=.*/SELINUX=permissive/' /etc/selinux/config EOF # --- Everything above this line should hopefully stay cached --- @@ -60,7 +66,7 @@ RUN --mount=type=cache,target=/var/cache/libdnf5 < /etc/kernel/cmdline + echo "composefs=${COMPOSEFS_FSVERITY} rw console=ttyS0,115200n8" > /etc/kernel/cmdline kernel-install add-all EOF diff --git a/examples/unified-secureboot/Containerfile b/examples/unified-secureboot/Containerfile index 244f8c9a..b23e27be 100644 --- a/examples/unified-secureboot/Containerfile +++ b/examples/unified-secureboot/Containerfile @@ -2,11 +2,12 @@ FROM fedora:42 AS base RUN --mount=type=cache,target=/var/cache/libdnf5 <