From d12e23a58c42cf7dc18d33fcabda6074df351d37 Mon Sep 17 00:00:00 2001 From: Molly Miller Date: Wed, 16 Oct 2024 15:20:57 +0200 Subject: [PATCH 1/2] agent: trigger cold reboots when KVM environment parameters change Add support for a mechanism in fc.qemu similar to the existing qemu binary generation handling which additionally seeds various parameters related to the KVM environment at boot time and which writes these parameters into /run periodically at runtime. Request a cold reboot when the KVM environment parameters change at runtime to allow the hypervisor to perform changes which require the qemu process to be offline or restarted. PL-131857 --- pkgs/fc/agent/fc/maintenance/cli.py | 4 +- pkgs/fc/agent/fc/maintenance/maintenance.py | 170 +++++++++++++++----- 2 files changed, 135 insertions(+), 39 deletions(-) diff --git a/pkgs/fc/agent/fc/maintenance/cli.py b/pkgs/fc/agent/fc/maintenance/cli.py index 741534ee4..022ea0412 100644 --- a/pkgs/fc/agent/fc/maintenance/cli.py +++ b/pkgs/fc/agent/fc/maintenance/cli.py @@ -11,8 +11,8 @@ from fc.maintenance.maintenance import ( request_reboot_for_cpu, request_reboot_for_kernel, + request_reboot_for_kvm_environment, request_reboot_for_memory, - request_reboot_for_qemu, request_update, ) from fc.maintenance.reqmanager import DEFAULT_SPOOLDIR, ReqManager @@ -299,7 +299,7 @@ def system_properties(): if enc["parameters"]["machine"] == "virtual": rm.add(request_reboot_for_memory(log, enc)) rm.add(request_reboot_for_cpu(log, enc)) - rm.add(request_reboot_for_qemu(log)) + rm.add(request_reboot_for_kvm_environment(log)) rm.add(request_reboot_for_kernel(log, current_requests)) log.info("fc-maintenance-system-properties-finished") diff --git a/pkgs/fc/agent/fc/maintenance/maintenance.py b/pkgs/fc/agent/fc/maintenance/maintenance.py index f0473fd69..727119ca4 100644 --- a/pkgs/fc/agent/fc/maintenance/maintenance.py +++ b/pkgs/fc/agent/fc/maintenance/maintenance.py @@ -5,6 +5,8 @@ TODO: better logging for created activites. """ + +import json import os.path import shutil import typing @@ -23,6 +25,10 @@ from fc.maintenance.state import State from fc.util import nixos +QEMU_STATE_DIR = "/var/lib/qemu" +KVM_SEED_DIR = "/tmp/fc-data" +RUNTIME_DIR = "/run" + def request_reboot_for_memory(log, enc) -> Optional[Request]: """Schedules reboot if the memory size has changed.""" @@ -56,53 +62,143 @@ def request_reboot_for_cpu(log, enc) -> Optional[Request]: return Request(activity) -def request_reboot_for_qemu(log) -> Optional[Request]: - """Schedules a reboot if the Qemu binary environment has changed.""" - # Update the -booted marker if necessary. We need to store the marker - # in a place where it does not get removed after _internal_ reboots - # of the virtual machine. However, if we got rebooted with a fresh - # Qemu instance, we need to update it from the marker on the tmp - # partition. - log.debug("request-reboot-for-qemu-start") - if not os.path.isdir("/var/lib/qemu"): - os.makedirs("/var/lib/qemu") - if os.path.exists("/tmp/fc-data/qemu-binary-generation-booted"): +def request_reboot_for_kvm_environment(log) -> Optional[Request]: + """Schedules a reboot if the hypervisor environment has changed.""" + # Update the -booted marker if necessary. We need to store the + # marker in a place where it does not get removed after warm + # reboots of the virtual machine (i.e. same Qemu process on the + # KVM host). However, if we get a cold reboot (i.e. fresh Qemu + # process) then we need to update it from the marker seeded on the + # tmp partition. + log.debug("request-reboot-for-kvm-environment-start") + + qemu_state_dir = Path(QEMU_STATE_DIR) + + cold_state_file = lambda stem: Path(KVM_SEED_DIR) / f"qemu-{stem}-booted" + warm_state_file = lambda stem: qemu_state_dir / f"qemu-{stem}-booted" + running_state_file = ( + lambda stem: Path(RUNTIME_DIR) / f"qemu-{stem}-current" + ) + + if not qemu_state_dir.is_dir(): + qemu_state_dir.mkdir(parents=True) + + # Newer versions of fc.qemu can signal multiple relevant guest + # properties in a single file. Older versions will only signal the + # boot-time qemu binary generation. + if cold_state_file("guest-properties").exists(): shutil.move( - "/tmp/fc-data/qemu-binary-generation-booted", - "/var/lib/qemu/qemu-binary-generation-booted", + cold_state_file("guest-properties"), + warm_state_file("guest-properties"), + ) + cold_state_file("binary-generation").unlink(missing_ok=True) + elif cold_state_file("binary-generation").exists(): + shutil.move( + cold_state_file("binary-generation"), + warm_state_file("binary-generation"), ) - # Schedule maintenance if the current marker differs from booted - # marker. - if not os.path.exists("/run/qemu-binary-generation-current"): - return + # Optimistically load combined guest properties files try: - with open( - "/run/qemu-binary-generation-current", encoding="ascii" - ) as f: - current_generation = int(f.read().strip()) + with open(running_state_file("guest-properties")) as f: + current_properties = json.load(f) + except FileNotFoundError: + current_properties = None except Exception: - # Do not perform maintenance if no current marker is there. + current_properties = {} + + # If the combined properties do not exist then fall back to the + # old binary generation file. + if current_properties is None: + try: + with open( + running_state_file("binary-generation"), encoding="ascii" + ) as f: + current_generation = int(f.read().strip()) + except Exception: + current_generation = None + + try: + with open( + warm_state_file("binary-generation"), encoding="ascii" + ) as f: + booted_generation = int(f.read().strip()) + except Exception: + # Assume 0 as the generation marker as that is our upgrade + # path: VMs started with an earlier version of fc.qemu + # will not have this marker at all + booted_generation = 0 + + if ( + current_generation is not None + and current_generation > booted_generation + ): + msg = ( + "Cold restart because the Qemu binary environment has changed" + ) + return Request(RebootActivity("poweroff"), comment=msg) + + return + + # If the combined properties file exists but could not be decoded, + # then ignore it. Note that we always expect at least the binary + # generation to be provided in the combined file. + if not current_properties: return try: - with open( - "/var/lib/qemu/qemu-binary-generation-booted", encoding="ascii" - ) as f: - booted_generation = int(f.read().strip()) + with open(warm_state_file("guest-properties")) as f: + booted_properties = json.load(f) except Exception: - # Assume 0 as the generation marker as that is our upgrade path: - # VMs started with an earlier version of fc.qemu will not have - # this marker at all. - booted_generation = 0 - - if booted_generation >= current_generation: - # We do not automatically downgrade. If we ever want that then I - # want us to reconsider the side effects. - return + booted_properties = None + + # If the boot-time properties file does not exist, or does not + # have valid content, then we should reboot and re-run the seeding + # process from the start. This covers the case we were booted on a + # hypervisor which did not provide guest properties which has + # since been upgraded to start providing them, and it also handles + # the case where a bug on the hypervisor has delivered us invalid + # data, in which case we reboot optimistically in the hopes that + # the hypervisor has since been fixed. + if not booted_properties: + msg = "Cold restart because the KVM environment has been updated" + return Request(RebootActivity("poweroff"), comment=msg) + + # Special handling for the binary generation, which should ignore + # downgrades. + current_generation = current_properties.pop("binary_generation", None) + booted_generation = booted_properties.pop("binary_generation", 0) - msg = "Cold restart because the Qemu binary environment has changed" - return Request(RebootActivity("poweroff"), comment=msg) + if ( + current_generation is not None + and current_generation > booted_generation + ): + msg = "Cold restart because the Qemu binary environment has changed" + return Request(RebootActivity("poweroff"), comment=msg) + + fragments = [] + # Keys which are present at boot time but no longer at runtime are + # implicitly ignored here. + for key in current_properties.keys(): + # New keys which were not present at boot but were introduced + # at runtime should cause a reboot. + if key not in booted_properties: + fragments.append(f"{key} (new parameter)") + + # Changes in values between boot and runtime should cause a + # reboot. + elif booted_properties[key] != current_properties[key]: + fragments.append( + "{}: {} -> {}".format( + key, booted_properties[key], current_properties[key] + ) + ) + + if fragments: + msg = "Cold restart because KVM parameters have changed: {}".format( + ", ".join(fragments) + ) + return Request(RebootActivity("poweroff"), comment=msg) def request_reboot_for_kernel( From 94cd11f8692fab117c5d598f9d203f2913eacd34 Mon Sep 17 00:00:00 2001 From: Molly Miller Date: Wed, 16 Oct 2024 15:21:04 +0200 Subject: [PATCH 2/2] agent: tests for KVM environment parameter change handling PL-131857 --- .../fc/agent/fc/maintenance/tests/test_cli.py | 6 +- .../fc/maintenance/tests/test_maintenance.py | 262 +++++++++++++++--- 2 files changed, 223 insertions(+), 45 deletions(-) diff --git a/pkgs/fc/agent/fc/maintenance/tests/test_cli.py b/pkgs/fc/agent/fc/maintenance/tests/test_cli.py index 9cea33ca5..a8ec0cca5 100644 --- a/pkgs/fc/agent/fc/maintenance/tests/test_cli.py +++ b/pkgs/fc/agent/fc/maintenance/tests/test_cli.py @@ -155,7 +155,7 @@ def test_invoke_request_cold_reboot(activity, invoke_app_as_root): @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kernel") -@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_qemu") +@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kvm_environment") @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_cpu") @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_memory") def test_invoke_request_system_properties_virtual( @@ -169,10 +169,10 @@ def test_invoke_request_system_properties_virtual( @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kernel") -@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_qemu") +@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kvm_environment") @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_cpu") @unittest.mock.patch("fc.maintenance.cli.request_reboot_for_memory") -def test_invoke_request_system_properties_virtual( +def test_invoke_request_system_properties_physical( memory, cpu, qemu, kernel, tmpdir, invoke_app_as_root ): enc_file = tmpdir / "enc.json" diff --git a/pkgs/fc/agent/fc/maintenance/tests/test_maintenance.py b/pkgs/fc/agent/fc/maintenance/tests/test_maintenance.py index 3f079c5b6..c2cc81ecd 100644 --- a/pkgs/fc/agent/fc/maintenance/tests/test_maintenance.py +++ b/pkgs/fc/agent/fc/maintenance/tests/test_maintenance.py @@ -1,4 +1,5 @@ import contextlib +import json import unittest.mock from unittest.mock import MagicMock @@ -8,6 +9,7 @@ from fc.maintenance.activity.reboot import RebootActivity from fc.maintenance.activity.update import UpdateActivity from fc.maintenance.activity.vm_change import VMChangeActivity +from pytest import fixture ENC = { "parameters": { @@ -17,6 +19,31 @@ } +@fixture +def guest_properties(tmp_path): + qemu_state_dir = tmp_path / "qemu" + kvm_seed_dir = tmp_path / "fc-data" + runtime_dir = tmp_path / "run" + + qemu_state_dir.mkdir(parents=True, exist_ok=True) + kvm_seed_dir.mkdir(parents=True, exist_ok=True) + runtime_dir.mkdir(parents=True, exist_ok=True) + + fc.maintenance.maintenance.QEMU_STATE_DIR = str(qemu_state_dir) + fc.maintenance.maintenance.KVM_SEED_DIR = str(kvm_seed_dir) + fc.maintenance.maintenance.RUNTIME_DIR = str(runtime_dir) + + return (qemu_state_dir, runtime_dir) + + +def write_properties_file(directory, filename, content): + with open(directory / filename, "w") as f: + if isinstance(content, dict): + json.dump(content, f) + else: + f.write(content) + + @unittest.mock.patch("fc.util.dmi_memory.main") @unittest.mock.patch("fc.util.vm.count_cores") def test_request_reboot_for_memory(count_cores, get_memory, logger): @@ -63,64 +90,215 @@ def test_do_not_request_reboot_for_unchanged_cpu( assert request is None -@unittest.mock.patch("os.path.isdir") -@unittest.mock.patch("os.path.exists") -@unittest.mock.patch("shutil.move") -def test_request_reboot_for_qemu_change( - shutil_move, - path_exists, - path_isdir, - logger, +def test_request_reboot_for_qemu_binary_generation_change( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file(qemu, "qemu-binary-generation-booted", "1") + write_properties_file(run, "qemu-binary-generation-current", "2") + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert "Qemu binary environment has changed" in request.comment + activity = request.activity + assert isinstance(activity, RebootActivity) + assert activity.reboot_needed == RebootType.COLD + + +def test_do_not_request_reboot_for_unchanged_qemu_binary_generation( + logger, guest_properties ): - path_isdir.return_value = True - path_exists.return_value = True + qemu, run = guest_properties + + write_properties_file(qemu, "qemu-binary-generation-booted", "2") + write_properties_file(run, "qemu-binary-generation-current", "2") - def fake_open_qemu_files(filename, encoding): - mock = unittest.mock.Mock() - if filename == "/run/qemu-binary-generation-current": - mock.read.return_value = "2" - elif filename == "/var/lib/qemu/qemu-binary-generation-booted": - mock.read.return_value = "1" + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert request is None - yield mock - with unittest.mock.patch( - "builtins.open", contextlib.contextmanager(fake_open_qemu_files) - ): - request = fc.maintenance.maintenance.request_reboot_for_qemu(logger) +def test_do_not_request_reboot_for_downgraded_qemu_binary_generation( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file(qemu, "qemu-binary-generation-booted", "2") + write_properties_file(run, "qemu-binary-generation-current", "1") + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert request is None + + +def test_request_reboot_for_guest_properties_upgrade(logger, guest_properties): + qemu, run = guest_properties + + write_properties_file(qemu, "qemu-binary-generation-booted", "2") + write_properties_file(run, "qemu-binary-generation-current", "2") + write_properties_file( + run, "qemu-guest-properties-current", {"binary_generation": 2} + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert "KVM environment has been updated" in request.comment + activity = request.activity + assert isinstance(activity, RebootActivity) + assert activity.reboot_needed == RebootType.COLD + + +def test_request_reboot_for_guest_properties_qemu_change( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, "qemu-guest-properties-booted", {"binary_generation": 1} + ) + write_properties_file( + run, "qemu-guest-properties-current", {"binary_generation": 2} + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) assert "Qemu binary environment has changed" in request.comment activity = request.activity assert isinstance(activity, RebootActivity) assert activity.reboot_needed == RebootType.COLD -@unittest.mock.patch("os.path.isdir") -@unittest.mock.patch("os.path.exists") -@unittest.mock.patch("shutil.move") -def test_do_not_request_reboot_for_unchanged_qemu( - shutil_move, - path_exists, - path_isdir, - logger, +def test_do_not_request_reboot_for_unchanged_guest_properties_qemu( + logger, guest_properties ): - path_isdir.return_value = True - path_exists.return_value = True + qemu, run = guest_properties - def fake_open_qemu_files(filename, encoding): - mock = unittest.mock.Mock() - if filename == "/run/qemu-binary-generation-current": - mock.read.return_value = "1" - elif filename == "/var/lib/qemu/qemu-binary-generation-booted": - mock.read.return_value = "1" + write_properties_file( + qemu, "qemu-guest-properties-booted", {"binary_generation": 2} + ) + write_properties_file( + run, "qemu-guest-properties-current", {"binary_generation": 2} + ) - yield mock + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert request is None - with unittest.mock.patch( - "builtins.open", contextlib.contextmanager(fake_open_qemu_files) - ): - request = fc.maintenance.maintenance.request_reboot_for_qemu(logger) +def test_do_not_request_reboot_for_downgraded_guest_properties_qemu( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, "qemu-guest-properties-booted", {"binary_generation": 2} + ) + write_properties_file( + run, "qemu-guest-properties-current", {"binary_generation": 1} + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert request is None + + +def test_request_reboot_for_guest_property_new_property( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, "qemu-guest-properties-booted", {"binary_generation": 2} + ) + write_properties_file( + run, + "qemu-guest-properties-current", + {"binary_generation": 2, "cpu_model": "qemu64-v1"}, + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert "KVM parameters have changed" in request.comment + assert "cpu_model (new parameter)" in request.comment + activity = request.activity + assert isinstance(activity, RebootActivity) + assert activity.reboot_needed == RebootType.COLD + + +def test_do_not_request_reboot_for_guest_property_removed_property( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, + "qemu-guest-properties-booted", + {"binary_generation": 2, "cpu_model": "qemu64-v1"}, + ) + write_properties_file( + run, "qemu-guest-properties-current", {"binary_generation": 2} + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert request is None + + +def test_request_reboot_for_guest_property_value_change( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, + "qemu-guest-properties-booted", + {"binary_generation": 2, "cpu_model": "qemu64-v1"}, + ) + write_properties_file( + run, + "qemu-guest-properties-current", + {"binary_generation": 2, "cpu_model": "Haswell-v4"}, + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) + assert "KVM parameters have changed" in request.comment + assert "cpu_model: qemu64-v1 -> Haswell-v4" in request.comment + activity = request.activity + assert isinstance(activity, RebootActivity) + assert activity.reboot_needed == RebootType.COLD + + +def test_do_not_request_for_unchanged_guest_properties( + logger, guest_properties +): + qemu, run = guest_properties + + write_properties_file( + qemu, + "qemu-guest-properties-booted", + {"binary_generation": 2, "cpu_model": "qemu64-v1"}, + ) + write_properties_file( + run, + "qemu-guest-properties-current", + {"binary_generation": 2, "cpu_model": "qemu64-v1"}, + ) + + request = fc.maintenance.maintenance.request_reboot_for_kvm_environment( + logger + ) assert request is None