Skip to content

Commit f46af40

Browse files
adrianM27 authored and meecash committed
Improve error handling for checkpoint and restore
* Log what failed and for which VM
* Kill PID on error
* Verify restored VM size
1 parent e3e0aa2 commit f46af40

File tree

2 files changed

+115
-28
lines changed

2 files changed

+115
-28
lines changed

compress.c

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,26 @@ static int lz4_write(const char *src, const size_t len, int (*_write)(int fd, co
6565
dbg("%s(%p, %zu, %p, %d)\n", __func__, src, len, _write, fd);
6666

6767
ret = LZ4_compress_default(src, dst, len, MAX_DATA_SIZE_LZ4);
68-
if (ret <= 0)
68+
if (ret <= 0) {
69+
err("%s() compression error: %d\n", __func__, ret);
6970
return -1;
71+
}
7072

7173
dst_len = ret;
7274

7375
dbg("%s() compressed %zu -> %d bytes\n", __func__, len, dst_len);
7476

7577
ret = _write(fd, &dst_len, sizeof(dst_len));
76-
if (ret != sizeof(dst_len))
78+
if (ret != sizeof(dst_len)) {
79+
err("%s() write dst_len failed: %d\n", __func__, ret);
7780
return -1;
81+
}
7882

7983
ret = _write(fd, dst, dst_len);
80-
if (ret != dst_len)
84+
if (ret != dst_len) {
85+
err("%s() write compressed data failed: %d\n", __func__, ret);
8186
return -1;
87+
}
8288

8389
return len;
8490
}
@@ -92,19 +98,27 @@ static int lz4_read(char *dst, const size_t len, int (*_read)(int fd, void *buf,
9298
dbg("%s(%p, %zu, %p, %d)\n", __func__, dst, len, _read, fd);
9399

94100
ret = _read(fd, &src_len, sizeof(src_len));
95-
if (ret != sizeof(src_len))
101+
if (ret != sizeof(src_len)) {
102+
err("%s() read src_len failed: %d\n", __func__, ret);
96103
return -1;
104+
}
97105

98-
if (src_len > sizeof(src))
106+
if (src_len > sizeof(src)) {
107+
err("%s() src_len %u exceeds buffer size %zu\n", __func__, src_len, sizeof(src));
99108
return -1;
109+
}
100110

101111
ret = _read(fd, src, src_len);
102-
if (ret != src_len)
112+
if (ret != src_len) {
113+
err("%s() read compressed data failed: %d\n", __func__, ret);
103114
return -1;
115+
}
104116

105117
ret = LZ4_decompress_safe(src, dst, src_len, len);
106-
if (ret <= 0)
118+
if (ret <= 0) {
119+
err("%s() decompression error: %d\n", __func__, ret);
107120
return -1;
121+
}
108122

109123
return len;
110124
}
@@ -122,7 +136,7 @@ static int zstd_write(const char *src, const size_t len, int (*_write)(int fd, c
122136

123137
size = ZSTD_compress(dst, MAX_DATA_SIZE_ZSTD, src, len, ZSTD_LEVEL);
124138
if (ZSTD_isError(size)) {
125-
fprintf(stderr, "compression error: %s\n", ZSTD_getErrorName(size));
139+
err("%s() compression error: %s\n", __func__, ZSTD_getErrorName(size));
126140
return -1;
127141
}
128142

@@ -131,12 +145,16 @@ static int zstd_write(const char *src, const size_t len, int (*_write)(int fd, c
131145
dbg("%s() compressed %zu -> %d bytes\n", __func__, len, dst_len);
132146

133147
ret = _write(fd, &dst_len, sizeof(dst_len));
134-
if (ret != sizeof(dst_len))
148+
if (ret != sizeof(dst_len)) {
149+
err("%s() write dst_len failed: %d\n", __func__, ret);
135150
return -1;
151+
}
136152

137153
ret = _write(fd, dst, dst_len);
138-
if (ret != dst_len)
154+
if (ret != dst_len) {
155+
err("%s() write compressed data failed: %d\n", __func__, ret);
139156
return -1;
157+
}
140158

141159
return len;
142160
}
@@ -151,19 +169,25 @@ static int zstd_read(char *dst, const size_t len, int (*_read)(int fd, void *buf
151169
dbg("%s(%p, %zu, %p, %d)\n", __func__, dst, len, _read, fd);
152170

153171
ret = _read(fd, &src_len, sizeof(src_len));
154-
if (ret != sizeof(src_len))
172+
if (ret != sizeof(src_len)) {
173+
err("%s() read src_len failed: %d\n", __func__, ret);
155174
return -1;
175+
}
156176

157-
if (src_len > sizeof(src))
177+
if (src_len > sizeof(src)) {
178+
err("%s() src_len %u exceeds buffer size %zu\n", __func__, src_len, sizeof(src));
158179
return -1;
180+
}
159181

160182
ret = _read(fd, src, src_len);
161-
if (ret != src_len)
183+
if (ret != src_len) {
184+
err("%s() read compressed data failed: %d\n", __func__, ret);
162185
return -1;
186+
}
163187

164188
size = ZSTD_decompress(dst, len, src, src_len);
165189
if (ZSTD_isError(size)) {
166-
fprintf(stderr, "decompression error: %s\n", ZSTD_getErrorName(size));
190+
err("decompression error: %s\n", ZSTD_getErrorName(size));
167191
return -1;
168192
}
169193

@@ -178,8 +202,10 @@ static int plain_write(const char *src, const size_t len, int (*_write)(int fd,
178202
dbg("%s(%p, %zu, %p, %d)\n", __func__, src, len, _write, fd);
179203

180204
ret = _write(fd, src, len);
181-
if (ret != len)
205+
if (ret != len) {
206+
err("%s() write data failed: %d\n", __func__, ret);
182207
return -1;
208+
}
183209

184210
return len;
185211
}
@@ -191,8 +217,10 @@ static int plain_read(char *dst, const size_t len, int (*_read)(int fd, void *bu
191217
dbg("%s(%p, %zu, %p, %d)\n", __func__, dst, len, _read, fd);
192218

193219
ret = _read(fd, dst, len);
194-
if (ret != len)
220+
if (ret != len) {
221+
err("%s() read data failed: %d\n", __func__, ret);
195222
return -1;
223+
}
196224

197225
return ret;
198226
}

memcr.c

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ static struct {
174174
.cond = PTHREAD_COND_INITIALIZER,
175175
};
176176

177+
/*
 * Running byte count of VM region payloads held in the dump.
 * write_vm_region() adds vmr->len after each successful compress_write();
 * read_vm_region() rejects a region whose len exceeds the remaining count
 * and otherwise subtracts vmr->len before reading the region data.
 */
static size_t dumped_vm_size;
178+
177179
/*
178180
* man sigaction: For a ptrace(2) event, si_code will contain SIGTRAP and have the ptrace event in the high byte:
179181
* (SIGTRAP | PTRACE_EVENT_foo << 8).
@@ -1029,13 +1031,31 @@ static int read_vm_region(int fd, struct vm_region *vmr, char *buf)
10291031
int ret;
10301032

10311033
ret = dump_read(fd, vmr, sizeof(struct vm_region));
1032-
if (ret != sizeof(struct vm_region))
1034+
if (ret != sizeof(struct vm_region)) {
1035+
if (ret < 0)
1036+
fprintf(stderr, "[-] dump_read() failed: %d\n", ret);
1037+
else if (ret > 0)
1038+
fprintf(stderr, "[-] dump_read() returned %d bytes, expected %zu\n", ret, sizeof(struct vm_region));
1039+
/* ret == 0 : EOF */
10331040
return ret;
1041+
}
1042+
1043+
if (vmr->len > dumped_vm_size) {
1044+
fprintf(stderr, "[-] read_vm_region() VM region %lx len %ld exceeds dumped size %zu\n", vmr->addr, vmr->len, dumped_vm_size);
1045+
return -1;
1046+
}
1047+
1048+
dumped_vm_size -= vmr->len;
10341049

10351050
if (!vm_region_valid(vmr))
10361051
return -1;
10371052

10381053
ret = compress_read(buf, vmr->len, dump_read, fd);
1054+
if (ret < 0) {
1055+
fprintf(stderr, "[-] compress_read() failed: %d for VM region %lx len %ld\n", ret, vmr->addr, vmr->len);
1056+
return ret;
1057+
}
1058+
10391059
#ifdef CHECKSUM_MD5
10401060
if (checksum && ret > 0) {
10411061
md5_update(&md5_restore_ctx, vmr, sizeof(struct vm_region));
@@ -1053,10 +1073,22 @@ static int write_vm_region(int fd, const struct vm_region *vmr, const void *buf)
10531073
return -1;
10541074

10551075
ret = dump_write(fd, vmr, sizeof(struct vm_region));
1056-
if (ret != sizeof(struct vm_region))
1076+
if (ret != sizeof(struct vm_region)) {
1077+
if (ret < 0)
1078+
fprintf(stderr, "[-] dump_write() failed: %d\n", ret);
1079+
else if (ret >= 0)
1080+
fprintf(stderr, "[-] dump_write() returned %d bytes, expected %zu\n", ret, sizeof(struct vm_region));
10571081
return -1;
1082+
}
10581083

10591084
ret = compress_write(buf, vmr->len, dump_write, fd);
1085+
if (ret < 0) {
1086+
fprintf(stderr, "[-] compress_write() failed: %d for VM region %lx len %ld\n", ret, vmr->addr, vmr->len);
1087+
return ret;
1088+
}
1089+
1090+
dumped_vm_size += vmr->len;
1091+
10601092
#ifdef CHECKSUM_MD5
10611093
if (checksum && ret > 0) {
10621094
md5_update(&md5_checkpoint_ctx, vmr, sizeof(struct vm_region));
@@ -1850,6 +1882,7 @@ static int cmd_checkpoint(pid_t pid)
18501882
static int cmd_restore(pid_t pid)
18511883
{
18521884
struct timespec ts;
1885+
int ret = 1;
18531886

18541887
if (!parasite_status_ok()) {
18551888
return 1;
@@ -1862,9 +1895,14 @@ static int cmd_restore(pid_t pid)
18621895

18631896
fprintf(stdout, "[+] uploading pages\n");
18641897
clock_gettime(CLOCK_MONOTONIC, &ts);
1865-
target_set_pages(pid);
1898+
ret = target_set_pages(pid);
18661899
fprintf(stdout, "[i] upload took %lu ms\n", diff_ms(&ts));
18671900

1901+
if (ret) {
1902+
fprintf(stderr, "target_set_pages() failed\n");
1903+
return ret;
1904+
}
1905+
18681906
#ifdef CHECKSUM_MD5
18691907
if (checksum) {
18701908
md5_final(md5_restore_digest, &md5_restore_digest_len, &md5_restore_ctx);
@@ -1886,10 +1924,7 @@ static int cmd_restore(pid_t pid)
18861924
|| (md5_restore_digest_len == 0)
18871925
|| (md5_checkpoint_digest_len != md5_restore_digest_len)
18881926
|| (memcmp(md5_checkpoint_digest, md5_restore_digest, md5_restore_digest_len) != 0) ) {
1889-
printf("[-] dump checksum do not match!\n");
1890-
1891-
fprintf(stderr, "[%d] Restore failed! Killing the target app...\n", getpid());
1892-
kill(pid, SIGKILL);
1927+
fprintf(stderr, "[-] Checksum mismatch! Checkpoint and restore digests differ!\n");
18931928
return 1;
18941929
}
18951930
}
@@ -2256,8 +2291,13 @@ static int execute_parasite_restore(pid_t pid)
22562291
{
22572292
unsigned long ret;
22582293
int status;
2294+
int err;
22592295

2260-
cmd_restore(pid);
2296+
err = cmd_restore(pid);
2297+
if (err) {
2298+
fprintf(stderr, "[-] cmd_restore() failed: %d\n", err);
2299+
return err;
2300+
}
22612301

22622302
parasite_status_wait(&status);
22632303

@@ -2457,11 +2497,13 @@ static int checkpoint_worker(pid_t pid)
24572497

24582498
ret = seize_target(pid);
24592499
if (ret)
2460-
return ret;
2500+
goto out;
24612501

24622502
ret = execute_parasite_checkpoint(pid);
2503+
2504+
out:
24632505
if (ret) {
2464-
fprintf(stderr, "[%d] Parasite checkpoint failed! Killing the target app...\n", getpid());
2506+
fprintf(stderr, "[%d] %s() Checkpoint failed! Killing the target PID %d...\n", getpid(), __func__, pid);
24652507
kill(pid, SIGKILL);
24662508
cleanup_pid(pid);
24672509
return ret;
@@ -2480,13 +2522,19 @@ static int restore_worker(int rd)
24802522

24812523
if (ret < 0 || MEMCR_RESTORE != post_checkpoint_cmd.cmd) {
24822524
fprintf(stdout, "[%d] Error reading restore command!\n", getpid());
2483-
return -1;
2525+
goto out;
24842526
}
24852527

24862528
fprintf(stdout, "[%d] Worker received RESTORE command for %d.\n", getpid(), post_checkpoint_cmd.pid);
24872529

24882530
signal(SIGCHLD, SIG_DFL);
24892531
ret = execute_parasite_restore(post_checkpoint_cmd.pid);
2532+
2533+
out:
2534+
if (ret) {
2535+
fprintf(stderr, "[%d] %s() Restore failed! Killing the target PID %d...\n", getpid(), __func__, post_checkpoint_cmd.pid);
2536+
kill(post_checkpoint_cmd.pid, SIGKILL);
2537+
}
24902538
unseize_target();
24912539
cleanup_pid(post_checkpoint_cmd.pid);
24922540

@@ -2884,8 +2932,10 @@ static int user_interactive_mode(pid_t pid)
28842932
return ret;
28852933

28862934
ret = execute_parasite_checkpoint(pid);
2887-
if (ret)
2935+
if (ret) {
2936+
fprintf(stderr, "[!] Parasite checkpoint failed: %d!\n", ret);
28882937
goto out;
2938+
}
28892939

28902940
if (!no_wait && !interrupted) {
28912941
long dms;
@@ -2906,8 +2956,17 @@ static int user_interactive_mode(pid_t pid)
29062956
}
29072957

29082958
ret = execute_parasite_restore(pid);
2959+
if (ret) {
2960+
fprintf(stderr, "[!] Parasite restore failed: %d!\n", ret);
2961+
goto out;
2962+
}
29092963

29102964
out:
2965+
if (ret) {
2966+
fprintf(stderr, "[!] Killing the target PID %d...\n", pid);
2967+
kill(pid, SIGKILL);
2968+
}
2969+
29112970
unseize_target();
29122971
cleanup_pid(pid);
29132972

0 commit comments

Comments
 (0)