Skip to content

Commit

Permalink
Merge pull request flyingcircusio#1124 from flyingcircusio/PL-131857-…
Browse files Browse the repository at this point in the history
…kvm-cold-reboot-signalling

Trigger cold reboots upon changes in KVM environment parameters
  • Loading branch information
ctheune authored Oct 18, 2024
2 parents 73d2d52 + 94cd11f commit 07f85a9
Show file tree
Hide file tree
Showing 4 changed files with 358 additions and 84 deletions.
4 changes: 2 additions & 2 deletions pkgs/fc/agent/fc/maintenance/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from fc.maintenance.maintenance import (
request_reboot_for_cpu,
request_reboot_for_kernel,
request_reboot_for_kvm_environment,
request_reboot_for_memory,
request_reboot_for_qemu,
request_update,
)
from fc.maintenance.reqmanager import DEFAULT_SPOOLDIR, ReqManager
Expand Down Expand Up @@ -299,7 +299,7 @@ def system_properties():
if enc["parameters"]["machine"] == "virtual":
rm.add(request_reboot_for_memory(log, enc))
rm.add(request_reboot_for_cpu(log, enc))
rm.add(request_reboot_for_qemu(log))
rm.add(request_reboot_for_kvm_environment(log))

rm.add(request_reboot_for_kernel(log, current_requests))
log.info("fc-maintenance-system-properties-finished")
Expand Down
170 changes: 133 additions & 37 deletions pkgs/fc/agent/fc/maintenance/maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
TODO: better logging for created activites.
"""

import json
import os.path
import shutil
import typing
Expand All @@ -23,6 +25,10 @@
from fc.maintenance.state import State
from fc.util import nixos

QEMU_STATE_DIR = "/var/lib/qemu"
KVM_SEED_DIR = "/tmp/fc-data"
RUNTIME_DIR = "/run"


def request_reboot_for_memory(log, enc) -> Optional[Request]:
"""Schedules reboot if the memory size has changed."""
Expand Down Expand Up @@ -56,53 +62,143 @@ def request_reboot_for_cpu(log, enc) -> Optional[Request]:
return Request(activity)


def request_reboot_for_qemu(log) -> Optional[Request]:
"""Schedules a reboot if the Qemu binary environment has changed."""
# Update the -booted marker if necessary. We need to store the marker
# in a place where it does not get removed after _internal_ reboots
# of the virtual machine. However, if we got rebooted with a fresh
# Qemu instance, we need to update it from the marker on the tmp
# partition.
log.debug("request-reboot-for-qemu-start")
if not os.path.isdir("/var/lib/qemu"):
os.makedirs("/var/lib/qemu")
if os.path.exists("/tmp/fc-data/qemu-binary-generation-booted"):
def request_reboot_for_kvm_environment(log) -> Optional[Request]:
"""Schedules a reboot if the hypervisor environment has changed."""
# Update the -booted marker if necessary. We need to store the
# marker in a place where it does not get removed after warm
# reboots of the virtual machine (i.e. same Qemu process on the
# KVM host). However, if we get a cold reboot (i.e. fresh Qemu
# process) then we need to update it from the marker seeded on the
# tmp partition.
log.debug("request-reboot-for-kvm-environment-start")

qemu_state_dir = Path(QEMU_STATE_DIR)

cold_state_file = lambda stem: Path(KVM_SEED_DIR) / f"qemu-{stem}-booted"
warm_state_file = lambda stem: qemu_state_dir / f"qemu-{stem}-booted"
running_state_file = (
lambda stem: Path(RUNTIME_DIR) / f"qemu-{stem}-current"
)

if not qemu_state_dir.is_dir():
qemu_state_dir.mkdir(parents=True)

# Newer versions of fc.qemu can signal multiple relevant guest
# properties in a single file. Older versions will only signal the
# boot-time qemu binary generation.
if cold_state_file("guest-properties").exists():
shutil.move(
"/tmp/fc-data/qemu-binary-generation-booted",
"/var/lib/qemu/qemu-binary-generation-booted",
cold_state_file("guest-properties"),
warm_state_file("guest-properties"),
)
cold_state_file("binary-generation").unlink(missing_ok=True)
elif cold_state_file("binary-generation").exists():
shutil.move(
cold_state_file("binary-generation"),
warm_state_file("binary-generation"),
)
# Schedule maintenance if the current marker differs from booted
# marker.
if not os.path.exists("/run/qemu-binary-generation-current"):
return

# Optimistically load combined guest properties files
try:
with open(
"/run/qemu-binary-generation-current", encoding="ascii"
) as f:
current_generation = int(f.read().strip())
with open(running_state_file("guest-properties")) as f:
current_properties = json.load(f)
except FileNotFoundError:
current_properties = None
except Exception:
# Do not perform maintenance if no current marker is there.
current_properties = {}

# If the combined properties do not exist then fall back to the
# old binary generation file.
if current_properties is None:
try:
with open(
running_state_file("binary-generation"), encoding="ascii"
) as f:
current_generation = int(f.read().strip())
except Exception:
current_generation = None

try:
with open(
warm_state_file("binary-generation"), encoding="ascii"
) as f:
booted_generation = int(f.read().strip())
except Exception:
# Assume 0 as the generation marker as that is our upgrade
# path: VMs started with an earlier version of fc.qemu
# will not have this marker at all
booted_generation = 0

if (
current_generation is not None
and current_generation > booted_generation
):
msg = (
"Cold restart because the Qemu binary environment has changed"
)
return Request(RebootActivity("poweroff"), comment=msg)

return

# If the combined properties file exists but could not be decoded,
# then ignore it. Note that we always expect at least the binary
# generation to be provided in the combined file.
if not current_properties:
return

try:
with open(
"/var/lib/qemu/qemu-binary-generation-booted", encoding="ascii"
) as f:
booted_generation = int(f.read().strip())
with open(warm_state_file("guest-properties")) as f:
booted_properties = json.load(f)
except Exception:
# Assume 0 as the generation marker as that is our upgrade path:
# VMs started with an earlier version of fc.qemu will not have
# this marker at all.
booted_generation = 0

if booted_generation >= current_generation:
# We do not automatically downgrade. If we ever want that then I
# want us to reconsider the side effects.
return
booted_properties = None

# If the boot-time properties file does not exist, or does not
# have valid content, then we should reboot and re-run the seeding
# process from the start. This covers the case we were booted on a
# hypervisor which did not provide guest properties which has
# since been upgraded to start providing them, and it also handles
# the case where a bug on the hypervisor has delivered us invalid
# data, in which case we reboot optimistically in the hopes that
# the hypervisor has since been fixed.
if not booted_properties:
msg = "Cold restart because the KVM environment has been updated"
return Request(RebootActivity("poweroff"), comment=msg)

# Special handling for the binary generation, which should ignore
# downgrades.
current_generation = current_properties.pop("binary_generation", None)
booted_generation = booted_properties.pop("binary_generation", 0)

msg = "Cold restart because the Qemu binary environment has changed"
return Request(RebootActivity("poweroff"), comment=msg)
if (
current_generation is not None
and current_generation > booted_generation
):
msg = "Cold restart because the Qemu binary environment has changed"
return Request(RebootActivity("poweroff"), comment=msg)

fragments = []
# Keys which are present at boot time but no longer at runtime are
# implicitly ignored here.
for key in current_properties.keys():
# New keys which were not present at boot but were introduced
# at runtime should cause a reboot.
if key not in booted_properties:
fragments.append(f"{key} (new parameter)")

# Changes in values between boot and runtime should cause a
# reboot.
elif booted_properties[key] != current_properties[key]:
fragments.append(
"{}: {} -> {}".format(
key, booted_properties[key], current_properties[key]
)
)

if fragments:
msg = "Cold restart because KVM parameters have changed: {}".format(
", ".join(fragments)
)
return Request(RebootActivity("poweroff"), comment=msg)


def request_reboot_for_kernel(
Expand Down
6 changes: 3 additions & 3 deletions pkgs/fc/agent/fc/maintenance/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def test_invoke_request_cold_reboot(activity, invoke_app_as_root):


@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kernel")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_qemu")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kvm_environment")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_cpu")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_memory")
def test_invoke_request_system_properties_virtual(
Expand All @@ -169,10 +169,10 @@ def test_invoke_request_system_properties_virtual(


@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kernel")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_qemu")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_kvm_environment")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_cpu")
@unittest.mock.patch("fc.maintenance.cli.request_reboot_for_memory")
def test_invoke_request_system_properties_virtual(
def test_invoke_request_system_properties_physical(
memory, cpu, qemu, kernel, tmpdir, invoke_app_as_root
):
enc_file = tmpdir / "enc.json"
Expand Down
Loading

0 comments on commit 07f85a9

Please sign in to comment.