Skip to content

Commit be5ce44

Browse files
authored
[FC] Allow skipping the resave of action snapshots with a platform property (#8808)
This only applies to action executions which were already created from a snapshot. My hypothesis is that saving an updated snapshot doesn't provide future executions any benefit in these cases. The experiment part of this will be in a separate PR.
1 parent de97c26 commit be5ce44

File tree

3 files changed

+142
-38
lines changed

3 files changed

+142
-38
lines changed

enterprise/server/remote_execution/containers/firecracker/firecracker.go

+21-13
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ type FirecrackerContainer struct {
546546
createFromSnapshot bool
547547
supportsRemoteSnapshots bool
548548
recyclingEnabled bool
549+
shouldSaveSnapshot bool
549550

550551
// If set, the snapshot used to load the VM
551552
snapshot *snaploader.Snapshot
@@ -718,6 +719,7 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution
718719
// breaking change is made to the vmexec API, the executor should not
719720
// attempt to connect to snapshots that were created before the change.
720721
}
722+
c.shouldSaveSnapshot = c.supportsRemoteSnapshots || !c.createFromSnapshot || !platform.IsTrue(platform.FindEffectiveValue(task, platform.SkipResavingActionSnapshotsPropertyName))
721723

722724
return c, nil
723725
}
@@ -2584,31 +2586,35 @@ func (c *FirecrackerContainer) pause(ctx context.Context) error {
25842586

25852587
log.CtxInfof(ctx, "Pausing VM")
25862588

2587-
if c.isBalloonEnabled() && c.machineHasBalloon(ctx) {
2589+
if c.shouldSaveSnapshot && c.isBalloonEnabled() && c.machineHasBalloon(ctx) {
25882590
if err := c.reclaimMemoryWithBalloon(ctx); err != nil {
25892591
log.CtxErrorf(ctx, "Reclaiming memory with the balloon failed with: %s", err)
25902592
}
25912593
}
25922594

2593-
snapDetails := c.snapshotDetails(ctx)
2595+
snapDetails := c.snapshotDetails()
25942596

25952597
// Before taking a snapshot, set the workspace drive to point to an empty
25962598
// file, so that we don't have to persist the workspace contents.
2597-
if err := c.updateWorkspaceDriveToEmptyFile(ctx); err != nil {
2598-
return status.WrapError(err, "update workspace drive to empty file")
2599+
if c.shouldSaveSnapshot {
2600+
if err := c.updateWorkspaceDriveToEmptyFile(ctx); err != nil {
2601+
return status.WrapError(err, "update workspace drive to empty file")
2602+
}
25992603
}
26002604

26012605
if err := c.pauseVM(ctx); err != nil {
26022606
return err
26032607
}
26042608

2605-
// If an older snapshot is present -- nuke it since we're writing a new one.
2606-
if err := c.cleanupOldSnapshots(ctx, snapDetails); err != nil {
2607-
return err
2608-
}
2609+
if c.shouldSaveSnapshot {
2610+
// If an older snapshot is present -- nuke it since we're writing a new one.
2611+
if err := c.cleanupOldSnapshots(ctx, snapDetails); err != nil {
2612+
return err
2613+
}
26092614

2610-
if err := c.createSnapshot(ctx, snapDetails); err != nil {
2611-
return err
2615+
if err := c.createSnapshot(ctx, snapDetails); err != nil {
2616+
return err
2617+
}
26122618
}
26132619

26142620
// Stop the VM, UFFD page fault handler, and VBD servers to ensure nothing
@@ -2628,8 +2634,10 @@ func (c *FirecrackerContainer) pause(ctx context.Context) error {
26282634
return status.WrapError(err, "unmount vbds")
26292635
}
26302636

2631-
if err := c.saveSnapshot(ctx, snapDetails); err != nil {
2632-
return err
2637+
if c.shouldSaveSnapshot {
2638+
if err := c.saveSnapshot(ctx, snapDetails); err != nil {
2639+
return err
2640+
}
26332641
}
26342642

26352643
// Finish cleaning up VM resources
@@ -2747,7 +2755,7 @@ type snapshotDetails struct {
27472755
vmStateSnapshotName string
27482756
}
27492757

2750-
func (c *FirecrackerContainer) snapshotDetails(ctx context.Context) *snapshotDetails {
2758+
func (c *FirecrackerContainer) snapshotDetails() *snapshotDetails {
27512759
if c.recycled {
27522760
return &snapshotDetails{
27532761
snapshotType: diffSnapshotType,

enterprise/server/remote_execution/containers/firecracker/firecracker_test.go

+95
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,101 @@ func TestFirecracker_LocalSnapshotSharing(t *testing.T) {
679679
err = c.Pause(ctx)
680680
require.NoError(t, err)
681681
}
682+
func TestFirecracker_LocalSnapshotSharing_DontResave(t *testing.T) {
683+
ctx := context.Background()
684+
env := getTestEnv(ctx, t, envOpts{})
685+
env.SetAuthenticator(testauth.NewTestAuthenticator(testauth.TestUsers("US1", "GR1")))
686+
rootDir := testfs.MakeTempDir(t)
687+
cfg := getExecutorConfig(t)
688+
689+
var containersToCleanup []*firecracker.FirecrackerContainer
690+
t.Cleanup(func() {
691+
for _, vm := range containersToCleanup {
692+
err := vm.Remove(ctx)
693+
assert.NoError(t, err)
694+
}
695+
})
696+
697+
workDir := testfs.MakeDirAll(t, rootDir, "work")
698+
opts := firecracker.ContainerOpts{
699+
ContainerImage: busyboxImage,
700+
ActionWorkingDirectory: workDir,
701+
VMConfiguration: &fcpb.VMConfiguration{
702+
NumCpus: 1,
703+
MemSizeMb: minMemSizeMB, // small to make snapshotting faster.
704+
EnableNetworking: false,
705+
ScratchDiskSizeMb: 100,
706+
},
707+
ExecutorConfig: cfg,
708+
}
709+
task := &repb.ExecutionTask{
710+
Command: &repb.Command{
711+
// Note: platform must match in order to share snapshots
712+
Platform: &repb.Platform{Properties: []*repb.Platform_Property{
713+
{Name: "recycle-runner", Value: "true"},
714+
{Name: platform.SkipResavingActionSnapshotsPropertyName, Value: "true"},
715+
}},
716+
},
717+
}
718+
baseVM, err := firecracker.NewContainer(ctx, env, task, opts)
719+
require.NoError(t, err)
720+
containersToCleanup = append(containersToCleanup, baseVM)
721+
err = container.PullImageIfNecessary(ctx, env, baseVM, oci.Credentials{}, opts.ContainerImage)
722+
require.NoError(t, err)
723+
err = baseVM.Create(ctx, opts.ActionWorkingDirectory)
724+
require.NoError(t, err)
725+
726+
// Create a snapshot. Data written to this snapshot should persist
727+
// when other VMs reuse the snapshot
728+
cmd := appendToLog("Base")
729+
res := baseVM.Exec(ctx, cmd, nil /*=stdio*/)
730+
require.NoError(t, res.Error)
731+
require.Equal(t, "Base\n", string(res.Stdout))
732+
err = baseVM.Pause(ctx)
733+
require.NoError(t, err)
734+
735+
// Load the same base snapshot from multiple VMs - there should be no
736+
// corruption or data transfer from snapshot sharing
737+
for i := 0; i < 4; i++ {
738+
workDir = testfs.MakeDirAll(t, rootDir, fmt.Sprintf("work-%d", i))
739+
opts = firecracker.ContainerOpts{
740+
ContainerImage: busyboxImage,
741+
ActionWorkingDirectory: workDir,
742+
VMConfiguration: &fcpb.VMConfiguration{
743+
NumCpus: 1,
744+
MemSizeMb: minMemSizeMB, // small to make snapshotting faster.
745+
EnableNetworking: false,
746+
ScratchDiskSizeMb: 100,
747+
},
748+
ExecutorConfig: cfg,
749+
}
750+
forkedVM, err := firecracker.NewContainer(ctx, env, task, opts)
751+
require.NoError(t, err)
752+
containersToCleanup = append(containersToCleanup, forkedVM)
753+
754+
// The new VM should reuse the Base VM's snapshot, whether we call
755+
// Create() or Unpause()
756+
if i%2 == 0 {
757+
err = forkedVM.Unpause(ctx)
758+
require.NoError(t, err)
759+
} else {
760+
err = forkedVM.Create(ctx, workDir)
761+
require.NoError(t, err)
762+
}
763+
764+
// Write VM-specific data to the log
765+
cmd = appendToLog(fmt.Sprintf("Fork-%d", i))
766+
res = forkedVM.Exec(ctx, cmd, nil /*=stdio*/)
767+
require.NoError(t, res.Error)
768+
// The log should contain data written to the original snapshot
769+
// and the current VM, but not from any of the other VMs sharing
770+
// the same original snapshot
771+
require.Equal(t, fmt.Sprintf("Base\nFork-%d\n", i), string(res.Stdout))
772+
// This shouldn't save a snapshot
773+
err = forkedVM.Pause(ctx)
774+
require.NoError(t, err)
775+
}
776+
}
682777

683778
func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
684779
ctx := context.Background()

enterprise/server/remote_execution/platform/platform.go

+26-25
Original file line numberDiff line numberDiff line change
@@ -73,31 +73,32 @@ const (
7373
// empty or unset.
7474
unsetContainerImageVal = "none"
7575

76-
RecycleRunnerPropertyName = "recycle-runner"
77-
RunnerRecyclingMaxWaitPropertyName = "runner-recycling-max-wait"
78-
PreserveWorkspacePropertyName = "preserve-workspace"
79-
overlayfsWorkspacePropertyName = "overlayfs-workspace"
80-
cleanWorkspaceInputsPropertyName = "clean-workspace-inputs"
81-
persistentWorkerPropertyName = "persistent-workers"
82-
persistentWorkerKeyPropertyName = "persistentWorkerKey"
83-
persistentWorkerProtocolPropertyName = "persistentWorkerProtocol"
84-
WorkflowIDPropertyName = "workflow-id"
85-
workloadIsolationPropertyName = "workload-isolation-type"
86-
initDockerdPropertyName = "init-dockerd"
87-
enableDockerdTCPPropertyName = "enable-dockerd-tcp"
88-
enableVFSPropertyName = "enable-vfs"
89-
HostedBazelAffinityKeyPropertyName = "hosted-bazel-affinity-key"
90-
useSelfHostedExecutorsPropertyName = "use-self-hosted-executors"
91-
disableMeasuredTaskSizePropertyName = "debug-disable-measured-task-size"
92-
disablePredictedTaskSizePropertyName = "debug-disable-predicted-task-size"
93-
extraArgsPropertyName = "extra-args"
94-
EnvOverridesPropertyName = "env-overrides"
95-
EnvOverridesBase64PropertyName = "env-overrides-base64"
96-
IncludeSecretsPropertyName = "include-secrets"
97-
DefaultTimeoutPropertyName = "default-timeout"
98-
TerminationGracePeriodPropertyName = "termination-grace-period"
99-
SnapshotKeyOverridePropertyName = "snapshot-key-override"
100-
RetryPropertyName = "retry"
76+
RecycleRunnerPropertyName = "recycle-runner"
77+
RunnerRecyclingMaxWaitPropertyName = "runner-recycling-max-wait"
78+
PreserveWorkspacePropertyName = "preserve-workspace"
79+
overlayfsWorkspacePropertyName = "overlayfs-workspace"
80+
cleanWorkspaceInputsPropertyName = "clean-workspace-inputs"
81+
persistentWorkerPropertyName = "persistent-workers"
82+
persistentWorkerKeyPropertyName = "persistentWorkerKey"
83+
persistentWorkerProtocolPropertyName = "persistentWorkerProtocol"
84+
WorkflowIDPropertyName = "workflow-id"
85+
workloadIsolationPropertyName = "workload-isolation-type"
86+
initDockerdPropertyName = "init-dockerd"
87+
enableDockerdTCPPropertyName = "enable-dockerd-tcp"
88+
enableVFSPropertyName = "enable-vfs"
89+
HostedBazelAffinityKeyPropertyName = "hosted-bazel-affinity-key"
90+
useSelfHostedExecutorsPropertyName = "use-self-hosted-executors"
91+
disableMeasuredTaskSizePropertyName = "debug-disable-measured-task-size"
92+
disablePredictedTaskSizePropertyName = "debug-disable-predicted-task-size"
93+
extraArgsPropertyName = "extra-args"
94+
EnvOverridesPropertyName = "env-overrides"
95+
EnvOverridesBase64PropertyName = "env-overrides-base64"
96+
IncludeSecretsPropertyName = "include-secrets"
97+
DefaultTimeoutPropertyName = "default-timeout"
98+
TerminationGracePeriodPropertyName = "termination-grace-period"
99+
SnapshotKeyOverridePropertyName = "snapshot-key-override"
100+
RetryPropertyName = "retry"
101+
SkipResavingActionSnapshotsPropertyName = "skip-resaving-action-snapshots"
101102

102103
OperatingSystemPropertyName = "OSFamily"
103104
LinuxOperatingSystemName = "linux"

0 commit comments

Comments
 (0)