Skip to content

Commit

Permalink
Bugfix/socket err (#315)
Browse files Browse the repository at this point in the history
* Add log.

* Fix the bug where SpawnProcessInInstance_ closes uninitialized socket file descriptors.

Signed-off-by: RileyW <[email protected]>

---------

Signed-off-by: RileyW <[email protected]>
  • Loading branch information
RileyWen authored Aug 21, 2024
1 parent 2e9592f commit 1440e75
Showing 1 changed file with 36 additions and 13 deletions.
49 changes: 36 additions & 13 deletions src/Craned/TaskManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,16 +641,18 @@ CraneErr TaskManager::SpawnProcessInInstance_(
// Move the ownership of ProcessInstance into the TaskInstance.
instance->processes.emplace(child_pid, std::move(process));

close(io_in_sock_pair[1]);
close(io_out_sock_pair[1]);
close(ctrl_sock_pair[1]);
int ctrl_fd = ctrl_sock_pair[0];
bool ok;
close(ctrl_sock_pair[1]);
if (instance->IsCrun()) {
close(io_in_sock_pair[1]);
close(io_out_sock_pair[1]);
}

setegid(saved_priv.gid);
seteuid(saved_priv.uid);
setgroups(0, nullptr);

bool ok;
FileInputStream istream(ctrl_fd);
FileOutputStream ostream(ctrl_fd);
CanStartMessage msg;
Expand Down Expand Up @@ -691,10 +693,16 @@ CraneErr TaskManager::SpawnProcessInInstance_(
// subprocess should continue to exec().
msg.set_ok(true);
ok = SerializeDelimitedToZeroCopyStream(msg, &ostream);
ok &= ostream.Flush();
if (!ok) {
CRANE_ERROR("Failed to send ok=true to subprocess {} for task #{}",
child_pid, instance->task.task_id());
CRANE_ERROR("Failed to serialize msg to ostream: {}",
strerror(ostream.GetErrno()));
}

if (ok) ok &= ostream.Flush();
if (!ok) {
CRANE_ERROR("Failed to send ok=true to subprocess {} for task #{}: {}",
child_pid, instance->task.task_id(),
strerror(ostream.GetErrno()));
close(ctrl_fd);

// Communication failure caused by process crash or grpc error.
Expand All @@ -707,10 +715,15 @@ CraneErr TaskManager::SpawnProcessInInstance_(
return CraneErr::kOk;
}

ParseDelimitedFromZeroCopyStream(&child_process_ready, &istream, nullptr);
if (!msg.ok()) {
CRANE_ERROR("Failed to read protobuf from subprocess {} of task #{}",
child_pid, instance->task.task_id());
ok = ParseDelimitedFromZeroCopyStream(&child_process_ready, &istream,
nullptr);
if (!ok || !msg.ok()) {
if (!ok)
CRANE_ERROR("Socket child endpoint failed: {}",
strerror(istream.GetErrno()));
if (!msg.ok())
CRANE_ERROR("False from subprocess {} of task #{}", child_pid,
instance->task.task_id());
close(ctrl_fd);

// See comments above.
Expand Down Expand Up @@ -768,8 +781,18 @@ CraneErr TaskManager::SpawnProcessInInstance_(
ChildProcessReady child_process_ready;
bool ok;

ParseDelimitedFromZeroCopyStream(&msg, &istream, nullptr);
if (!msg.ok()) std::abort();
ok = ParseDelimitedFromZeroCopyStream(&msg, &istream, nullptr);
if (!ok || !msg.ok()) {
if (!ok) {
int err = istream.GetErrno();
CRANE_ERROR("Failed to read socket from parent: {}", strerror(err));
}

if (!msg.ok())
CRANE_ERROR("Parent process ask not to start the subprocess.");

std::abort();
}

if (instance->task.type() == crane::grpc::Batch) {
int stdout_fd, stderr_fd;
Expand Down

0 comments on commit 1440e75

Please sign in to comment.