diff --git a/docs/configuration.md b/docs/configuration.md index 0384b594..113c1b00 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -297,6 +297,10 @@ Importing this module will run `btrfs device scan` and pull btrfs modules. When enabled, attempts to resume after hibernation if resume= is passed on the kernel command line. +> Please use the following option with **CAUTION** as it can be unstable in certain configurations! Any writes to disks that occur pre-resume run the risk of causing system instability! For more information have a read of the warnings in the [kernel docs](https://www.kernel.org/doc/html/latest/power/swsusp.html). + +* `late_resume` (true) When enabled will attempt to resume from hibernation after decryption and device mapping, allowing resume from encrypted or otherwise hidden swap devices. + ### Cryptographic modules Several cryptographic modules are provided, mostly to assist in mounting encrypted volumes and handling keyfiles. diff --git a/src/ugrd/base/test.py b/src/ugrd/base/test.py index a1868106..4818a695 100644 --- a/src/ugrd/base/test.py +++ b/src/ugrd/base/test.py @@ -14,7 +14,7 @@ def find_kernel_path(self): if not (self["_kmod_dir"] / "vmlinuz").exists(): for search_dir in ["/boot", "/efi"]: for prefix in ["vmlinuz", "kernel", "linux", "bzImage"]: - kernel_path = Path(search_dir) / f'{prefix}-{self["kernel_version"]}' + kernel_path = Path(search_dir) / f"{prefix}-{self['kernel_version']}" if kernel_path.exists(): break if kernel_path.exists(): diff --git a/src/ugrd/base/test.toml b/src/ugrd/base/test.toml index 2fac5a51..859bd1ae 100644 --- a/src/ugrd/base/test.toml +++ b/src/ugrd/base/test.toml @@ -2,14 +2,16 @@ test_rootfs_name = 'ugrd-test-rootfs' test_rootfs_build_dir = 'initramfs_test_rootfs' test_image_size = 16 -test_copy_config = ["_mounts", "mounts", "out_dir", "tmpdir", "clean", "test_image_size", "test_flag", "cryptsetup"] +test_copy_config = ["mounts", "out_dir", "tmpdir", "clean", "test_image_size", "test_flag", "cryptsetup"] +kmod_init = ["ata_piix", "sd_mod"] test_memory = '256M' test_cpu = 'host' test_arch = 'x86_64' test_timeout = 15 test_cmdline = 'console=ttyS0,115200 panic=1' -qemu_bool_args = ['nographic', 'no-reboot', 'enable-kvm'] +#qemu_bool_args = ['nographic', 'no-reboot', 'enable-kvm'] +qemu_bool_args = ['nographic', 'enable-kvm' ] [imports.build_pre] "ugrd.base.test" = [ "init_test_vars" ] diff --git a/src/ugrd/fs/mdraid.py b/src/ugrd/fs/mdraid.py index 93bf33aa..bc1da46f 100644 --- a/src/ugrd/fs/mdraid.py +++ b/src/ugrd/fs/mdraid.py @@ -1,5 +1,8 @@ -__version__ = '0.1.2' +__version__ = '0.2.0' def md_init(self): - return 'einfo "Assembling MD devices: $(mdadm --assemble --scan 2>&1)"' + return """ + export MDADM_NO_UDEV=1 + einfo "Assembling MD devices: $(mdadm --assemble --scan 2>&1)" + """ diff --git a/src/ugrd/fs/resume.py b/src/ugrd/fs/resume.py index dba8b639..b521874f 100644 --- a/src/ugrd/fs/resume.py +++ b/src/ugrd/fs/resume.py @@ -1,10 +1,13 @@ __version__ = "0.4.2" +from zenlib.util import contains -def handle_resume(self) -> None: - """Returns a shell script handling resume from hibernation. + +def resume(self) -> None: + """Returns a bash script handling resume from hibernation. Checks that /sys/power/resume is writable, resume= is set, and noresume is not set, if so, - checks if PARTUUID= is in the resume var, and tries to use blkid to find the resume device. + checks if UUID= or PARTUUID= or LABEL= is in the resume var, + and tries to use blkid to find the resume device. If the specified device exists, writes resume device to /sys/power/resume. In the event of failure, it prints an error message, a list of block devuices, then runs rd_fail. @@ -14,24 +17,65 @@ def handle_resume(self) -> None: If the system is freshly booted, it will not be able to resume, as there is no hibernation image. Distinguising between a fresh boot and missing/borked hibernation image is not possible at run time. """ + return [ + # Check resume support + '[ -n "$1" ] || (ewarn "No device?" ; return 1)', + '[ -w /sys/power/resume ] || (ewarn "Kernel does not support resume!" ; return 1)', + '[[ ! "$(cat /sys/power/resume)" == "0:0" ]] || ewarn "/sys/power/resume not empty, resume has already been attempted!"', + # Safety checks + "if ! [ -z $(lsblk -Q MOUNTPOINT)] ; then", + r' eerror "Cannot safely resume with mounted block devices:\n$(lsblk -Q MOUNTPOINT -no PATH)"', + " return 1", + "fi", + '[ -b "$1" ] || (ewarn "\'$1\' is not a valid block device!" ; return 1)', + 'einfo "Attempting resume from: $1"', + 'echo -n "$1" > /sys/power/resume', + 'einfo "No image on $resume"', + "return 0", + ] + + +def handle_early_resume(self) -> None: return [ "resumeval=$(readvar resume)", # read the cmdline resume var 'if ! check_var noresume && [ -n "$resumeval" ] && [ -w /sys/power/resume ]; then', - ' if echo "$resumeval" | grep -q "PARTUUID="; then', # resolve partuuid to device + ' if echo "$resumeval" | grep -q "UUID=" ||', # resolve uuid to device + ' echo "$resumeval" | grep -q "PARTUUID=" ||', # or resolve partuuid to device + ' echo "$resumeval" | grep -q "LABEL=" ; then', # or resolve label to device ' resume=$(blkid -t "$resumeval" -o device)', " else", - " resume=$resumeval", + ' resume="$resumeval"', " fi", - ' if [ -e "$resume" ]; then', # Check if the resume device exists - ' einfo "Resuming from: $resume"', - ' printf "%s" "$resume" > /sys/power/resume', # Attempt to resume - ' ewarn "Failed to resume from: $resume"', + " if ! [ -z $resume ] ; then", + ' if ! resume "$resume" ; then', + ' eerror "If you wish to continue booting, remove the resume= kernel parameter."', + ''' eerror " or run 'setvar noresume 1' from the recovery shell to skip resuming."''', + ' rd_fail "Failed to resume from $(readvar resume)."', + " fi", " else", - ' ewarn "Resume device not found: $resume)"', # Warn if the resume device does not exist - r' eerror "Block devices:\n$(blkid)"', - ' eerror "If you wish to continue booting, remove the resume= kernel parameter."', - ''' eerror " or run 'setvar noresume 1' from the recovery shell to skip resuming."''', - ' rd_fail "Failed to resume from $(readvar resume)."', + " einfo \"Resume device '$resumeval' not found\"", " fi", "fi", ] + + +@contains("late_resume") +def handle_late_resume(self) -> None: + self.logger.warning( + "[late_resume] enabled, this can result in data loss if filesystems are modified before resuming. Read the docs for more info." + ) + return handle_early_resume( + self + ) # At the moment it's the same code but delayed, will change when more features are added + + +@contains("test_resume") +def test_init_swap_uuid(self): + if "test_cpu" in self: + from uuid import uuid4 + + self["test_swap_uuid"] = swap_uuid = uuid4() + + # append to test kernel cmdline and adjust planned image size to allow enough space + self["test_cmdline"] = f"{self.get('test_cmdline')} resume=UUID={swap_uuid}" + self["test_image_size"] = 256 + self.get("test_image_size") diff --git a/src/ugrd/fs/resume.toml b/src/ugrd/fs/resume.toml index 18ba3066..6a0e33d1 100644 --- a/src/ugrd/fs/resume.toml +++ b/src/ugrd/fs/resume.toml @@ -1,7 +1,23 @@ cmdline_strings = [ "resume" ] +binaries = [ 'lsblk' ] +test_copy_config = [ "test_resume", "test_swap_uuid" ] -[imports.init_main] -"ugrd.fs.resume" = [ "handle_resume" ] +[imports.build_pre] +"ugrd.fs.resume" = [ "test_init_swap_uuid" ] + +[imports.init_main] +"ugrd.fs.resume" = [ "handle_early_resume" ] + +[imports.init_premount] +"ugrd.fs.resume" = [ "handle_late_resume"] + +[imports.functions] +"ugrd.fs.resume" = ["resume"] + +[custom_parameters] +late_resume = "bool" +test_resume = "bool" +test_swap_uuid = "str" [import_order.before] handle_resume = "mount_fstab" diff --git a/src/ugrd/fs/test_image.py b/src/ugrd/fs/test_image.py index 4a527899..0459fd13 100644 --- a/src/ugrd/fs/test_image.py +++ b/src/ugrd/fs/test_image.py @@ -2,8 +2,8 @@ from tempfile import TemporaryDirectory -from zenlib.util import colorize as c_ -from zenlib.util import contains +from zenlib.util import colorize, contains +from time import sleep @contains("test_flag", "A test flag must be set to create a test image", raise_exception=True) @@ -12,18 +12,45 @@ def init_banner(self): self["banner"] = f"echo {self['test_flag']}" +@contains("test_resume") +def resume_tests(self): + return [ + 'if [ "$( /proc/sysrq-trigger)', + # Set correct resume parameters + " echo reboot > /sys/power/disk", + # trigger resume + " echo disk > /sys/power/state", + ' [ -e "/resume" ] || echo c > /proc/sysrq-trigger', + # if we reach this point, resume was successful + # reset environment in case resume needs to be rerun + " rm /resumed", + ' echo "Resume completed without error.', + "else", + ' echo "No resume device found! Resume test not possible!', + "fi", + ] + + +def complete_tests(self): + return [ + "echo s > /proc/sysrq-trigger", + "echo o > /proc/sysrq-trigger", + ] + + def _allocate_image(self, image_path, padding=0): """Allocate the test image size""" self._mkdir(image_path.parent, resolve_build=False) # Make sure the parent directory exists if image_path.exists(): if self.clean: - self.logger.warning("Removing existing filesystem image file: %s" % c_(image_path, "red")) + self.logger.warning("Removing existing filesystem image file: %s" % colorize(image_path, "red")) image_path.unlink() else: - raise Exception("File already exists and 'clean' is off: %s" % c_(image_path, "red", bold=True)) + raise Exception("File already exists and 'clean' is off: %s" % colorize(image_path, "red", bold=True)) with open(image_path, "wb") as f: - self.logger.info("Allocating test image file: %s" % c_(f.name, "green")) + self.logger.info("Allocating test image file: %s" % colorize(f.name, "green")) f.write(b"\0" * (self.test_image_size + padding) * 2**20) @@ -61,8 +88,8 @@ def make_test_luks_image(self, image_path): pass _allocate_image(self, image_path, padding=32) # First allocate the image file, adding padding for the LUKS header keyfile_path = _get_luks_keyfile(self) - self.logger.info("Using LUKS keyfile: %s" % c_(keyfile_path, "green")) - self.logger.info("Creating LUKS image: %s" % c_(image_path, "green")) + self.logger.info("Using LUKS keyfile: %s" % colorize(keyfile_path, "green")) + self.logger.info("Creating LUKS image: %s" % colorize(image_path, "green")) self._run( [ "cryptsetup", @@ -75,14 +102,14 @@ def make_test_luks_image(self, image_path): keyfile_path, ] ) - self.logger.info("Opening LUKS image: %s" % c_(image_path, "magenta")) + self.logger.info("Opening LUKS image: %s" % colorize(image_path, "magenta")) self._run(["cryptsetup", "luksOpen", image_path, "test_image", "--key-file", keyfile_path]) def make_test_image(self): """Creates a test image from the build dir""" build_dir = self._get_build_path("/").resolve() - self.logger.info("Creating test image from: %s" % c_(build_dir, "blue", bold=True)) + self.logger.info("Creating test image from: %s" % colorize(build_dir, "blue", bold=True)) rootfs_type = self["mounts"]["root"]["type"] try: @@ -99,6 +126,33 @@ def make_test_image(self): else: _allocate_image(self, image_path) + loopback = None + if self.get("test_resume"): + try: + self._run(["sgdisk", "-og", image_path]) + self._run(["sgdisk", "-n", "1:0:+256", image_path]) + self._run(["sgdisk", "-n", "2:0", image_path]) + except RuntimeError as e: + raise RuntimeError("Failed to partition test disk: %s", e) + + try: + out = self._run(["losetup", "--show", "-fP", image_path]) + loopback = out.stdout.decode("utf-8").strip() + + image_path = f"{loopback}p2" + except RuntimeError as e: + raise RuntimeError("Failed to allocate loopback device for disk creation: %s", e) + + # sleep for 100ms, to give the loopback device time to scan for partitions + # usually fast, but losetup doesn't wait for this to complete before returning. + # TODO: replace with an proper check/wait loop + sleep(0.100) + + try: + self._run(["mkswap", "-U", self["test_swap_uuid"], f"{loopback}p1"]) + except RuntimeError as e: + raise RuntimeError("Failed to create swap partition on test disk: %s", e) + if rootfs_type == "ext4": self._run(["mkfs", "-t", rootfs_type, "-d", build_dir, "-U", rootfs_uuid, "-F", image_path]) elif rootfs_type == "btrfs": @@ -117,7 +171,7 @@ def make_test_image(self): squashfs_image = self._get_out_path(f"squash/{self['squashfs_image']}") if squashfs_image.exists(): if self.clean: - self.logger.warning("Removing existing squashfs image file: %s" % c_(squashfs_image, "red")) + self.logger.warning("Removing existing squashfs image file: %s" % colorize(squashfs_image, "red")) squashfs_image.unlink() else: raise Exception("File already exists and 'clean' is off: %s" % squashfs_image) @@ -129,6 +183,11 @@ def make_test_image(self): else: raise NotImplementedError("Unsupported test rootfs type: %s" % rootfs_type) + # Clean up loopback device used to access test image partitions + if loopback: + self.logger.info("Closing test image loopback device: %s", colorize(loopback, "magenta")) + self._run(["losetup", "-d", loopback]) + if self.get("cryptsetup"): # Leave it open in the event of failure, close it before executing tests - self.logger.info("Closing LUKS image: %s" % c_(image_path, "magenta")) + self.logger.info("Closing LUKS image: %s" % colorize(image_path, "magenta")) self._run(["cryptsetup", "luksClose", "test_image"]) diff --git a/src/ugrd/fs/test_image.toml b/src/ugrd/fs/test_image.toml index cd4dc69a..22ba4d80 100644 --- a/src/ugrd/fs/test_image.toml +++ b/src/ugrd/fs/test_image.toml @@ -9,6 +9,12 @@ _cryptsetup_root = "root" [imports.build_pre] "ugrd.fs.test_image" = ["init_banner"] +[imports.init_main] +"ugrd.fs.test_image" = ["resume_tests"] + +[imports.init_final] +"ugrd.fs.test_image" = ["complete_tests"] + [imports.pack] "ugrd.fs.test_image" = ["make_test_image"] @@ -20,4 +26,4 @@ cryptsetup = "dict" # Same as above _cryptsetup_root = "str" # Define the root device for cryptsetup test_image_size = "int" # Define the size of the test image in MiB test_flag = "str" # Define the success flag used to determine if the test was successful - +test_resume = "bool" # Enable code to test the suspend/resume pathways diff --git a/src/ugrd/main.py b/src/ugrd/main.py index 424156b3..1ca4d609 100755 --- a/src/ugrd/main.py +++ b/src/ugrd/main.py @@ -183,7 +183,7 @@ def main(): logger.critical(e) exit(1) except Exception as e: - logger.criical("An unhandled exception occurred while building:") + logger.critical("An unhandled exception occurred while building:") print(generator.config_dict) logger.critical(e, exc_info=True) exit(1)