From 9ee70b9a1eba4a242de4cd16179e23c9d3331440 Mon Sep 17 00:00:00 2001 From: Taoran Wang <123521007+wtr0504@users.noreply.github.com> Date: Wed, 13 Nov 2024 21:14:49 +0800 Subject: [PATCH] Feat: Support cgroupv2 (#327) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 完成CgroupV2除devices的支持 * 完成对CgroupV2的支持(devices除外) * fix bug * check name * recover config.yaml * get master config.yaml * fix undefined func * solve comments * complete devices support in cgroupV2 by using bpf complete devices support in cgroupV2 by using bpf complete devices support in cgroupV2 by using bpf * rm cmake order * fix bug in bpf dev_map * resolve comment and test multitask resolve comment * refactor Cgroup and reslove comment reslove comment reslove comment reslove comment reslove commit reslove comment reslove comment reslove comment reslove comment reslove comment modify bpf cmake * check and set compile for project * reslove comment of checking compiler version and handling device run_time error in V2 format * resolve comment and modify device access logic on V1 and V2 resolve comment resolve commit * check name * add dynamic load log level in bpf.o,modify bpf_obj load logic(one or more cgroups load bpf_obj once) reslove comment reslove comment reslove comment fix bug delete BPF_DEBUG compile option resolve comment comment * comment * resolve comment * 完成CgroupV2除devices的支持 * 完成对CgroupV2的支持(devices除外) * fix bug * recover config.yaml * get master config.yaml * fix undefined func * solve comments * complete devices support in cgroupV2 by using bpf complete devices support in cgroupV2 by using bpf complete devices support in cgroupV2 by using bpf * rm cmake order * fix bug in bpf dev_map * resolve comment and test multitask resolve comment * refactor Cgroup and reslove comment reslove comment reslove comment reslove comment reslove commit reslove comment reslove comment reslove comment reslove comment reslove comment modify bpf cmake * check and 
set compile for project * reslove comment of checking compiler version and handling device run_time error in V2 format * resolve comment and modify device access logic on V1 and V2 resolve comment resolve commit * check name * add dynamic load log level in bpf.o,modify bpf_obj load logic(one or more cgroups load bpf_obj once) reslove comment reslove comment reslove comment fix bug delete BPF_DEBUG compile option resolve comment comment * comment * resolve comment * add v2 option CRANE_ENABLE_CGROUP_V2 and communicate with CRANE_USE_SYSTEM_LIBCGROUP * comment commit comment comment * rm pkg check libcgroup * comment * comment * add memory.move_charge_at_immigrate todo --- CMakeLists.txt | 47 + dependencies/cmake/libcgroup/CMakeLists.txt | 3 + src/Craned/CMakeLists.txt | 9 + src/Craned/CgroupManager.cpp | 1186 +++++++++++++------ src/Craned/CgroupManager.h | 290 ++++- src/Craned/Craned.cpp | 19 +- src/Craned/DeviceManager.h | 2 + src/Craned/TaskManager.cpp | 2 +- src/Craned/TaskManager.h | 2 +- src/Misc/BPF/CMakeLists.txt | 39 + src/Misc/BPF/cgroup_dev_bpf.c | 107 ++ src/Misc/CMakeLists.txt | 8 +- 12 files changed, 1296 insertions(+), 418 deletions(-) create mode 100644 src/Misc/BPF/CMakeLists.txt create mode 100644 src/Misc/BPF/cgroup_dev_bpf.c diff --git a/CMakeLists.txt b/CMakeLists.txt index e1a8f350..3044f8ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,47 @@ if(VERSION_CONTENT STREQUAL "") endif() project(Crane VERSION ${CMAKE_PROJECT_VERSION} LANGUAGES C CXX) +# check and set compiler +set(REQUIRED_GNU_VERSION 13.0.0) +set(REQUIRED_CLANG_VERSION 19.0.0) +set(REQUIRED_BPF_CLANG_VERSION 17.0.0) + +set(CLANG_VERSION "0.0.0") + +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL REQUIRED_GNU_VERSION) + if(CRANE_ENABLE_CGROUP_V2) + message(STATUS "Enabling Cgroup V2 will build cgroup_dev_bpf_object, which requires Clang ${REQUIRED_BPF_CLANG_VERSION}+. Use GNU ${CMAKE_CXX_COMPILER_VERSION} for other modules. 
") + find_program(CLANG_EXECUTABLE NAMES clang) + if(CLANG_EXECUTABLE) + execute_process( + COMMAND ${CLANG_EXECUTABLE} --version + OUTPUT_VARIABLE CLANG_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" CLANG_VERSION ${CLANG_VERSION_OUTPUT}) + if(CLANG_VERSION VERSION_GREATER_EQUAL REQUIRED_BPF_CLANG_VERSION) + set(ENABLE_BPF ON) + message(STATUS "Found Clang at ${CLANG_EXECUTABLE} with version ${CLANG_VERSION}; using this version for the cgroup_dev_bpf_object module.") + else() + message(FATAL_ERROR "Clang found at ${CLANG_EXECUTABLE} is version ${CLANG_VERSION}, but version ${REQUIRED_BPF_CLANG_VERSION} or higher is required for device management on Cgroup V2. You may use Cgroup V1 instead.") + endif() + else() + message(FATAL_ERROR "Clang ${REQUIRED_BPF_CLANG_VERSION} or higher is required for device management on Cgroup V2. You can use Cgroup V1.") + endif() + endif() +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL REQUIRED_CLANG_VERSION) + if(CRANE_ENABLE_CGROUP_V2) + set(ENABLE_BPF ON) + endif() + message(STATUS "Using Clang for all module.") + else() + message(FATAL_ERROR "Clang ${REQUIRED_CLANG_VERSION} or higher is required.") + endif() +else() + message(FATAL_ERROR "Neither GNU ${REQUIRED_GNU_VERSION}+ nor Clang ${REQUIRED_CLANG_VERSION}+ found. 
Stop compiling crane") +endif() + # Options start here ---------------------------------------------------------------------------- @@ -61,6 +102,8 @@ option(CRANE_USE_GITEE_SOURCE "Enable the Gitee repository as the download sourc option(CRANE_USE_SYSTEM_LIBCGROUP "Use libcgroup from system instead of building from source" OFF) +option(CRANE_ENABLE_CGROUP_V2 "Enable Cgroup V2 support" OFF) + option(CRANE_FULL_DYNAMIC "Enable dynamic libs" OFF) option(CRANE_ADDRESS_SANITIZER "Enable address sanitizer" OFF) @@ -289,6 +332,10 @@ set(_PROTOBUF_PROTOC $) set(_GRPC_GRPCPP grpc++) set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +if(ENABLE_BPF) + find_package(PkgConfig REQUIRED) + pkg_check_modules(libbpf REQUIRED IMPORTED_TARGET libbpf>=1.4.6) +endif() # @formatter:off add_definitions(-DCRANE_BUILD_DIRECTORY=\("${CMAKE_BINARY_DIR}"\)) # @formatter:on diff --git a/dependencies/cmake/libcgroup/CMakeLists.txt b/dependencies/cmake/libcgroup/CMakeLists.txt index 1332c806..ac7f7af1 100644 --- a/dependencies/cmake/libcgroup/CMakeLists.txt +++ b/dependencies/cmake/libcgroup/CMakeLists.txt @@ -6,6 +6,9 @@ if (${CRANE_USE_SYSTEM_LIBCGROUP}) if (libcgroup_FOUND) message(STATUS "Found libcgroup ${libcgroup_VERSION} in system using pkg-config.") + if(CRANE_ENABLE_CGROUP_V2 AND libcgroup_VERSION VERSION_LESS "3.1.0") + message(FATAL_ERROR "libcgroup version must be higher than 3.1.0 when CRANE_ENABLE_CGROUP_V2 is set.") + endif() else() message(FATAL_ERROR "libcgroup in system is not found using pkg-config.") endif() diff --git a/src/Craned/CMakeLists.txt b/src/Craned/CMakeLists.txt index 473fee8a..532749b8 100644 --- a/src/Craned/CMakeLists.txt +++ b/src/Craned/CMakeLists.txt @@ -21,6 +21,13 @@ add_dependencies(craned libcgroup) target_include_directories( craned PRIVATE ${LIBCGROUP_PATH}/include/ ) + + + +if(ENABLE_BPF) + add_dependencies(craned cgroup_dev_bpf_object) + target_compile_definitions(craned PRIVATE CRANE_ENABLE_BPF) +endif() target_link_libraries(craned concurrentqueue @@ -49,6 
+56,8 @@ target_link_libraries(craned result Backward::Interface + + $<$:bpf> ) # Linker flag for c++ 17 filesystem library diff --git a/src/Craned/CgroupManager.cpp b/src/Craned/CgroupManager.cpp index 36ef2a85..a3927ff0 100644 --- a/src/Craned/CgroupManager.cpp +++ b/src/Craned/CgroupManager.cpp @@ -23,6 +23,14 @@ #include "CgroupManager.h" +#ifdef CRANE_ENABLE_BPF +# include +# include +# include +#endif + +#include + #include "CranedPublicDefs.h" #include "DeviceManager.h" #include "crane/PluginClient.h" @@ -30,6 +38,10 @@ namespace Craned { +#ifdef CRANE_ENABLE_BPF +BpfRuntimeInfo CgroupV2::bpf_runtime_info_{}; +#endif + /* * Initialize libcgroup and mount the controllers Condor will use (if possible) * @@ -42,84 +54,149 @@ int CgroupManager::Init() { // cgroup_set_loglevel(CGROUP_LOG_DEBUG); - void *handle = nullptr; - controller_data info{}; + enum cg_setup_mode_t setup_mode; + setup_mode = cgroup_setup_mode(); + switch (setup_mode) { + case CGROUP_MODE_LEGACY: + //("cgroup mode: Legacy\n"); + cg_version_ = CgroupConstant::CgroupVersion::CGROUP_V1; + break; + case CGROUP_MODE_HYBRID: + //("cgroup mode: Hybrid\n"); + cg_version_ = CgroupConstant::CgroupVersion::UNDEFINED; + break; + case CGROUP_MODE_UNIFIED: + //("cgroup mode: Unified\n"); + cg_version_ = CgroupConstant::CgroupVersion::CGROUP_V2; + break; + default: + //("cgroup mode: Unknown\n"); + break; + } using CgroupConstant::Controller; using CgroupConstant::GetControllerStringView; ControllerFlags NO_CONTROLLERS; - int ret = cgroup_get_all_controller_begin(&handle, &info); - while (ret == 0) { - if (info.name == GetControllerStringView(Controller::MEMORY_CONTROLLER)) { - m_mounted_controllers_ |= - (info.hierarchy != 0) ? ControllerFlags{Controller::MEMORY_CONTROLLER} - : NO_CONTROLLERS; - - } else if (info.name == - GetControllerStringView(Controller::CPUACCT_CONTROLLER)) { - m_mounted_controllers_ |= - (info.hierarchy != 0) - ? 
ControllerFlags{Controller::CPUACCT_CONTROLLER} - : NO_CONTROLLERS; - - } else if (info.name == - GetControllerStringView(Controller::FREEZE_CONTROLLER)) { - m_mounted_controllers_ |= - (info.hierarchy != 0) ? ControllerFlags{Controller::FREEZE_CONTROLLER} - : NO_CONTROLLERS; + if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V1) { + void *handle = nullptr; + controller_data info{}; + int ret = cgroup_get_all_controller_begin(&handle, &info); + while (ret == 0) { + if (info.name == GetControllerStringView(Controller::MEMORY_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) + ? ControllerFlags{Controller::MEMORY_CONTROLLER} + : NO_CONTROLLERS; + + } else if (info.name == + GetControllerStringView(Controller::CPUACCT_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) + ? ControllerFlags{Controller::CPUACCT_CONTROLLER} + : NO_CONTROLLERS; + + } else if (info.name == + GetControllerStringView(Controller::FREEZE_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) + ? ControllerFlags{Controller::FREEZE_CONTROLLER} + : NO_CONTROLLERS; + + } else if (info.name == + GetControllerStringView(Controller::BLOCK_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) + ? ControllerFlags{Controller::BLOCK_CONTROLLER} + : NO_CONTROLLERS; + + } else if (info.name == + GetControllerStringView(Controller::CPU_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) ? ControllerFlags{Controller::CPU_CONTROLLER} + : NO_CONTROLLERS; + } else if (info.name == + GetControllerStringView(Controller::DEVICES_CONTROLLER)) { + m_mounted_controllers_ |= + (info.hierarchy != 0) + ? ControllerFlags{Controller::DEVICES_CONTROLLER} + : NO_CONTROLLERS; + } + ret = cgroup_get_all_controller_next(&handle, &info); + } + if (handle) { + cgroup_get_all_controller_end(&handle); + } - } else if (info.name == - GetControllerStringView(Controller::BLOCK_CONTROLLER)) { - m_mounted_controllers_ |= - (info.hierarchy != 0) ? 
ControllerFlags{Controller::BLOCK_CONTROLLER} - : NO_CONTROLLERS; + ControllersMounted(); + if (ret != ECGEOF) { + CRANE_WARN("Error iterating through cgroups mount information: {}\n", + cgroup_strerror(ret)); + return -1; + } + } + // cgroup don't use /proc/cgroups to manage controller + else if ((GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V2)) { + struct cgroup *root = nullptr; + int ret; + if ((root = cgroup_new_cgroup("/")) == nullptr) { + CRANE_WARN("Unable to construct new root cgroup object."); + return -1; + } + if ((ret = cgroup_get_cgroup(root)) != 0) { + CRANE_WARN("Error : root cgroup not exist."); + return -1; + } - } else if (info.name == - GetControllerStringView(Controller::CPU_CONTROLLER)) { + if ((cgroup_get_controller( + root, + GetControllerStringView(Controller::CPU_CONTROLLER_V2).data())) != + nullptr) { + m_mounted_controllers_ |= ControllerFlags{Controller::CPU_CONTROLLER_V2}; + } + if ((cgroup_get_controller( + root, GetControllerStringView(Controller::MEMORY_CONTORLLER_V2) + .data())) != nullptr) { m_mounted_controllers_ |= - (info.hierarchy != 0) ? ControllerFlags{Controller::CPU_CONTROLLER} - : NO_CONTROLLERS; - } else if (info.name == - GetControllerStringView(Controller::DEVICES_CONTROLLER)) { + ControllerFlags{Controller::MEMORY_CONTORLLER_V2}; + } + if ((cgroup_get_controller( + root, GetControllerStringView(Controller::CPUSET_CONTROLLER_V2) + .data())) != nullptr) { m_mounted_controllers_ |= - (info.hierarchy != 0) - ? 
ControllerFlags{Controller::DEVICES_CONTROLLER} - : NO_CONTROLLERS; + ControllerFlags{Controller::CPUSET_CONTROLLER_V2}; + } + if ((cgroup_get_controller( + root, + GetControllerStringView(Controller::IO_CONTROLLER_V2).data())) != + nullptr) { + m_mounted_controllers_ |= ControllerFlags{Controller::IO_CONTROLLER_V2}; + } + if ((cgroup_get_controller( + root, + GetControllerStringView(Controller::PIDS_CONTROLLER_V2).data())) != + nullptr) { + m_mounted_controllers_ |= ControllerFlags{Controller::PIDS_CONTROLLER_V2}; } - ret = cgroup_get_all_controller_next(&handle, &info); - } - if (handle) { - cgroup_get_all_controller_end(&handle); - } - if (!Mounted(Controller::BLOCK_CONTROLLER)) { - CRANE_WARN("Cgroup controller for I/O statistics is not available.\n"); - } - if (!Mounted(Controller::FREEZE_CONTROLLER)) { - CRANE_WARN("Cgroup controller for process management is not available.\n"); - } - if (!Mounted(Controller::CPUACCT_CONTROLLER)) { - CRANE_WARN("Cgroup controller for CPU accounting is not available.\n"); - } - if (!Mounted(Controller::MEMORY_CONTROLLER)) { - CRANE_WARN("Cgroup controller for memory accounting is not available.\n"); - } - if (!Mounted(Controller::CPU_CONTROLLER)) { - CRANE_WARN("Cgroup controller for CPU is not available.\n"); - } - if (!Mounted(Controller::DEVICES_CONTROLLER)) { - CRANE_WARN("Cgroup controller for DEVICES is not available.\n"); - } - if (ret != ECGEOF) { - CRANE_WARN("Error iterating through cgroups mount information: {}\n", - cgroup_strerror(ret)); + ControllersMounted(); + // root cgroup controller can't be change or created + + } else { + CRANE_WARN("Error Cgroup version is not supported"); return -1; } - - RmAllTaskCgroups_(); - + if (cg_version_ == CgroupConstant::CgroupVersion::CGROUP_V1) { + RmAllTaskCgroups_(); + } else if (cg_version_ == CgroupConstant::CgroupVersion::CGROUP_V2) { +#ifdef CRANE_ENABLE_BPF + RmBpfDevMap(); +#endif + RmAllTaskCgroupsV2_(); + } else { + CRANE_WARN("Error Cgroup version is not 
supported"); + } return 0; } @@ -131,6 +208,47 @@ void CgroupManager::RmAllTaskCgroups_() { CgroupConstant::Controller::DEVICES_CONTROLLER); } +void CgroupManager::ControllersMounted() { + using namespace CgroupConstant; + if (cg_version_ == CgroupVersion::CGROUP_V1) { + if (!Mounted(Controller::BLOCK_CONTROLLER)) { + CRANE_WARN("Cgroup controller for I/O statistics is not available."); + } + if (!Mounted(Controller::FREEZE_CONTROLLER)) { + CRANE_WARN( + "Cgroup controller for process management is not available."); + } + if (!Mounted(Controller::CPUACCT_CONTROLLER)) { + CRANE_WARN("Cgroup controller for CPU accounting is not available."); + } + if (!Mounted(Controller::MEMORY_CONTROLLER)) { + CRANE_WARN("Cgroup controller for memory accounting is not available."); + } + if (!Mounted(Controller::CPU_CONTROLLER)) { + CRANE_WARN("Cgroup controller for CPU is not available."); + } + if (!Mounted(Controller::DEVICES_CONTROLLER)) { + CRANE_WARN("Cgroup controller for DEVICES is not available."); + } + } else if (cg_version_ == CgroupVersion::CGROUP_V2) { + if (!Mounted(Controller::CPU_CONTROLLER_V2)) { + CRANE_WARN("Cgroup controller for CPU is not available."); + } + if (!Mounted(Controller::MEMORY_CONTORLLER_V2)) { + CRANE_WARN("Cgroup controller for memory is not available."); + } + if (!Mounted(Controller::CPUSET_CONTROLLER_V2)) { + CRANE_WARN("Cgroup controller for cpuset is not available."); + } + if (!Mounted(Controller::IO_CONTROLLER_V2)) { + CRANE_WARN("Cgroup controller for I/O statistics is not available."); + } + if (!Mounted(Controller::PIDS_CONTROLLER_V2)) { + CRANE_WARN("Cgroup controller for pids is not available."); + } + } +} + /* * Initialize a controller for a given cgroup. * @@ -168,7 +286,7 @@ int CgroupManager::InitializeController_(struct cgroup &cgroup, controller_str); return required ? 1 : 0; } else { - // Try to turn on hierarchical memory accounting. + // Try to turn on hierarchical memory accounting in V1. 
if (controller == CgroupConstant::Controller::MEMORY_CONTROLLER) { if ((err = cgroup_add_value_bool(p_raw_controller, "memory.use_hierarchy", true))) { @@ -197,7 +315,7 @@ std::string CgroupManager::CgroupStrByTaskId_(task_id_t task_id) { * - -1 on error * On failure, the state of cgroup is undefined. */ -std::unique_ptr CgroupManager::CreateOrOpen_( +std::unique_ptr CgroupManager::CreateOrOpen_( const std::string &cgroup_string, ControllerFlags preferred_controllers, ControllerFlags required_controllers, bool retrieve) { using CgroupConstant::Controller; @@ -218,48 +336,86 @@ std::unique_ptr CgroupManager::CreateOrOpen_( if (retrieve && (ECGROUPNOTEXIST == cgroup_get_cgroup(native_cgroup))) { has_cgroup = false; } - // Work through the various controllers. - // if ((preferred_controllers & Controller::CPUACCT_CONTROLLER) && - // initialize_controller( - // *native_cgroup, Controller::CPUACCT_CONTROLLER, - // required_controllers & Controller::CPUACCT_CONTROLLER, has_cgroup, - // changed_cgroup)) { - // return nullptr; - // } - if ((preferred_controllers & Controller::MEMORY_CONTROLLER) && - InitializeController_( - *native_cgroup, Controller::MEMORY_CONTROLLER, - required_controllers & Controller::MEMORY_CONTROLLER, has_cgroup, - changed_cgroup)) { - return nullptr; - } - if ((preferred_controllers & Controller::FREEZE_CONTROLLER) && - InitializeController_( - *native_cgroup, Controller::FREEZE_CONTROLLER, - required_controllers & Controller::FREEZE_CONTROLLER, has_cgroup, - changed_cgroup)) { - return nullptr; - } - // if ((preferred_controllers & Controller::BLOCK_CONTROLLER) && - // initialize_controller(*native_cgroup, Controller::BLOCK_CONTROLLER, - // required_controllers & - // Controller::BLOCK_CONTROLLER, has_cgroup, - // changed_cgroup)) { - // return nullptr; - // } - if ((preferred_controllers & Controller::CPU_CONTROLLER) && - InitializeController_(*native_cgroup, Controller::CPU_CONTROLLER, - required_controllers & Controller::CPU_CONTROLLER, - 
has_cgroup, changed_cgroup)) { - return nullptr; - } - if ((preferred_controllers & Controller::DEVICES_CONTROLLER) && - InitializeController_( - *native_cgroup, Controller::DEVICES_CONTROLLER, - required_controllers & Controller::DEVICES_CONTROLLER, has_cgroup, - changed_cgroup)) { - return nullptr; + + if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V1) { + // if ((preferred_controllers & Controller::CPUACCT_CONTROLLER) && + // initialize_controller( + // *native_cgroup, Controller::CPUACCT_CONTROLLER, + // required_controllers & Controller::CPUACCT_CONTROLLER, + // has_cgroup, changed_cgroup)) { + // return nullptr; + // } + if ((preferred_controllers & Controller::MEMORY_CONTROLLER) && + InitializeController_( + *native_cgroup, Controller::MEMORY_CONTROLLER, + required_controllers & Controller::MEMORY_CONTROLLER, has_cgroup, + changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::FREEZE_CONTROLLER) && + InitializeController_( + *native_cgroup, Controller::FREEZE_CONTROLLER, + required_controllers & Controller::FREEZE_CONTROLLER, has_cgroup, + changed_cgroup)) { + return nullptr; + } + // if ((preferred_controllers & Controller::BLOCK_CONTROLLER) && + // initialize_controller(*native_cgroup, Controller::BLOCK_CONTROLLER, + // required_controllers & + // Controller::BLOCK_CONTROLLER, has_cgroup, + // changed_cgroup)) { + // return nullptr; + // } + if ((preferred_controllers & Controller::CPU_CONTROLLER) && + InitializeController_(*native_cgroup, Controller::CPU_CONTROLLER, + required_controllers & Controller::CPU_CONTROLLER, + has_cgroup, changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::DEVICES_CONTROLLER) && + InitializeController_( + *native_cgroup, Controller::DEVICES_CONTROLLER, + required_controllers & Controller::DEVICES_CONTROLLER, has_cgroup, + changed_cgroup)) { + return nullptr; + } + } else if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V2) { + if 
((preferred_controllers & Controller::CPU_CONTROLLER_V2) && + InitializeController_( + *native_cgroup, Controller::CPU_CONTROLLER_V2, + required_controllers & Controller::CPU_CONTROLLER_V2, has_cgroup, + changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::MEMORY_CONTORLLER_V2) && + InitializeController_( + *native_cgroup, Controller::MEMORY_CONTORLLER_V2, + required_controllers & Controller::MEMORY_CONTORLLER_V2, has_cgroup, + changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::IO_CONTROLLER_V2) && + InitializeController_( + *native_cgroup, Controller::IO_CONTROLLER_V2, + required_controllers & Controller::IO_CONTROLLER_V2, has_cgroup, + changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::CPUSET_CONTROLLER_V2) && + InitializeController_( + *native_cgroup, Controller::CPUSET_CONTROLLER_V2, + required_controllers & Controller::CPUSET_CONTROLLER_V2, has_cgroup, + changed_cgroup)) { + return nullptr; + } + if ((preferred_controllers & Controller::PIDS_CONTROLLER_V2) && + InitializeController_( + *native_cgroup, Controller::PIDS_CONTROLLER_V2, + required_controllers & Controller::PIDS_CONTROLLER_V2, has_cgroup, + changed_cgroup)) { + return nullptr; + } } int err; @@ -267,8 +423,9 @@ std::unique_ptr CgroupManager::CreateOrOpen_( if ((err = cgroup_create_cgroup(native_cgroup, 0))) { // Only record at D_ALWAYS if any cgroup mounts are available. CRANE_WARN( - "Unable to create cgroup {}. Cgroup functionality will not work: {}", - cgroup_string.c_str(), cgroup_strerror(err)); + "Unable to create cgroup {}. 
Cgroup functionality will not work:" + "{} {}", + cgroup_string.c_str(), err, cgroup_strerror(err)); return nullptr; } } else if (changed_cgroup && (err = cgroup_modify_cgroup(native_cgroup))) { @@ -278,16 +435,35 @@ std::unique_ptr CgroupManager::CreateOrOpen_( cgroup_string.c_str(), err, cgroup_strerror(err)); } - return std::make_unique(cgroup_string, native_cgroup); + if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V1) { + return std::make_unique(cgroup_string, native_cgroup); + } else if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V2) { + struct stat cgroup_stat; + std::string slash = "/"; + std::string cgroup_full_path = + CgroupConstant::RootCgroupFullPath + slash + cgroup_string; + if (stat(cgroup_full_path.c_str(), &cgroup_stat)) { + CRANE_ERROR("Failed to get cgroup {} stat", cgroup_string); + return nullptr; + } + return std::make_unique( + cgroup_string, native_cgroup, + static_cast(cgroup_stat.st_ino)); + } else { + CRANE_WARN("Unable to create cgroup {}. 
Cgroup version is not supported", + cgroup_string.c_str()); + return nullptr; + } } bool CgroupManager::CheckIfCgroupForTasksExists(task_id_t task_id) { return m_task_id_to_cg_map_.Contains(task_id); } -bool CgroupManager::AllocateAndGetCgroup(task_id_t task_id, Cgroup **cg) { +bool CgroupManager::AllocateAndGetCgroup(task_id_t task_id, + CgroupInterface **cg) { crane::grpc::ResourceInNode res; - Cgroup *pcg; + CgroupInterface *pcg; { auto cg_spec_it = m_task_id_to_cg_spec_map_[task_id]; @@ -298,13 +474,25 @@ bool CgroupManager::AllocateAndGetCgroup(task_id_t task_id, Cgroup **cg) { { auto cg_it = m_task_id_to_cg_map_[task_id]; auto &cg_unique_ptr = *cg_it; - if (!cg_unique_ptr) - cg_unique_ptr = CgroupManager::CreateOrOpen_( - CgroupStrByTaskId_(task_id), - NO_CONTROLLER_FLAG | CgroupConstant::Controller::CPU_CONTROLLER | - CgroupConstant::Controller::MEMORY_CONTROLLER | - CgroupConstant::Controller::DEVICES_CONTROLLER, - NO_CONTROLLER_FLAG, false); + if (!cg_unique_ptr) { + if (GetCgroupVersion() == CgroupConstant::CgroupVersion::CGROUP_V1) { + cg_unique_ptr = CgroupManager::CreateOrOpen_( + CgroupStrByTaskId_(task_id), + NO_CONTROLLER_FLAG | CgroupConstant::Controller::CPU_CONTROLLER | + CgroupConstant::Controller::MEMORY_CONTROLLER | + CgroupConstant::Controller::DEVICES_CONTROLLER, + NO_CONTROLLER_FLAG, false); + } else if (GetCgroupVersion() == + CgroupConstant::CgroupVersion::CGROUP_V2) { + cg_unique_ptr = CgroupManager::CreateOrOpen_( + CgroupStrByTaskId_(task_id), + NO_CONTROLLER_FLAG | CgroupConstant::Controller::CPU_CONTROLLER_V2 | + CgroupConstant::Controller::MEMORY_CONTORLLER_V2, + NO_CONTROLLER_FLAG, false); + } else { + CRANE_WARN("cgroup version is not supported."); + } + } if (!cg_unique_ptr) return false; @@ -415,7 +603,7 @@ bool CgroupManager::ReleaseCgroup(uint32_t task_id, uid_t uid) { // Kind of async behavior. 
// avoid deadlock by Erase at next line - Cgroup *cgroup = this->m_task_id_to_cg_map_[task_id]->release(); + CgroupInterface *cgroup = this->m_task_id_to_cg_map_[task_id]->release(); this->m_task_id_to_cg_map_.Erase(task_id); if (cgroup != nullptr) { @@ -474,6 +662,61 @@ void CgroupManager::RmAllTaskCgroupsUnderController_( if (handle) cgroup_walk_tree_end(&handle); } +void CgroupManager::RmAllTaskCgroupsV2_() { + RmCgroupsV2_(CgroupConstant::RootCgroupFullPath, + CgroupConstant::kTaskCgPathPrefix); +} + +void CgroupManager::RmCgroupsV2_(const std::string &root_cgroup_path, + const std::string &match_str) { + DIR *dir = nullptr; + if ((dir = opendir(root_cgroup_path.c_str())) == nullptr) { + CRANE_ERROR("Failed to open cgroup dir {}", root_cgroup_path); + } + struct dirent *entry; + std::vector cgroup_full_path_to_delete; + while ((entry = readdir(dir)) != nullptr) { + // Skip "." and ".." directories + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) { + continue; + } + + std::string dir_name = entry->d_name; + std::string full_path = root_cgroup_path + "/" + dir_name; + + // Check if it's a directory and if it contains the match_str + struct stat info; + if (stat(full_path.c_str(), &info) == 0 && S_ISDIR(info.st_mode)) { + if (dir_name.find(match_str) != std::string::npos) { + cgroup_full_path_to_delete.push_back(full_path); + } + } + } + closedir(dir); + for (const auto &tf : cgroup_full_path_to_delete) { + int err = rmdir(tf.c_str()); + if (err != 0) { + CRANE_ERROR("Failed to remove cgroup {}: {}", tf.c_str(), + strerror(errno)); + } + } +} + +#ifdef CRANE_ENABLE_BPF +void CgroupManager::RmBpfDevMap() { + try { + if (std::filesystem::exists(CgroupConstant::BpfDeviceMapFile)) { + std::filesystem::remove(CgroupConstant::BpfDeviceMapFile); + CRANE_TRACE("Successfully removed: {}", CgroupConstant::BpfDeviceMapFile); + } else { + CRANE_TRACE("File does not exist: {}", CgroupConstant::BpfDeviceMapFile); + } + } catch (const 
std::filesystem::filesystem_error &e) { + CRANE_ERROR("Error: {}", e.what()); + } +} +#endif + bool CgroupManager::QueryTaskInfoOfUidAsync(uid_t uid, TaskInfoOfUid *info) { CRANE_DEBUG("Query task info for uid {}", uid); @@ -489,7 +732,7 @@ bool CgroupManager::QueryTaskInfoOfUidAsync(uid_t uid, TaskInfoOfUid *info) { } bool CgroupManager::MigrateProcToCgroupOfTask(pid_t pid, task_id_t task_id) { - Cgroup *cg; + CgroupInterface *cg; bool ok = AllocateAndGetCgroup(task_id, &cg); if (!ok) return false; @@ -542,185 +785,6 @@ std::vector CgroupManager::GetResourceEnvListOfTask( return res; } -bool Cgroup::MigrateProcIn(pid_t pid) { - using CgroupConstant::Controller; - using CgroupConstant::GetControllerStringView; - - // We want to make sure task migration is turned on for the - // associated memory controller. So, we get to look up the original cgroup. - // - // If there is no memory controller present, we skip all this and just attempt - // a migrate - int err; -// u_int64_t orig_migrate; -// bool changed_orig = false; -// char *orig_cgroup_path = nullptr; -// struct cgroup *orig_cgroup; -// struct cgroup_controller *memory_controller; -// if (Mounted(Controller::MEMORY_CONTROLLER) && -// (err = cgroup_get_current_controller_path( -// pid, GetControllerStringView(Controller::MEMORY_CONTROLLER).data(), -// &orig_cgroup_path))) { -// CRANE_WARN( -// "Unable to determine current memory cgroup for PID {}. Error {}: -// {}\n", pid, err, cgroup_strerror(err)); -// return false; -// } -// // We will migrate the PID to the new cgroup even if it is in the proper -// // memory controller cgroup It is possible for the task to be in multiple -// // cgroups. -// if (Mounted(Controller::MEMORY_CONTROLLER) && (orig_cgroup_path != NULL) && -// (cgroup_path == orig_cgroup_path)) { -// // Yes, there are race conditions here - can't really avoid this. -// // Throughout this block, we can assume memory controller exists. -// // Get original value of migrate. 
-// orig_cgroup = cgroup_new_cgroup(orig_cgroup_path); -// assert(orig_cgroup != nullptr); -// if ((err = cgroup_get_cgroup(orig_cgroup))) { -// CRANE_WARN("Unable to read original cgroup {}. Error {}: {}\n", -// orig_cgroup_path, err, cgroup_strerror(err)); -// cgroup_free(&orig_cgroup); -// goto after_migrate; -// } -// if ((memory_controller = cgroup_get_controller( -// orig_cgroup, -// GetControllerStringView(Controller::MEMORY_CONTROLLER).data())) -// == -// nullptr) { -// CRANE_WARN( -// "Unable to get memory controller of cgroup {}. Error {}: {}\n", -// orig_cgroup_path, err, cgroup_strerror(err)); -// cgroup_free(&orig_cgroup); -// goto after_migrate; -// } -// if ((err = cgroup_get_value_uint64(memory_controller, -// "memory.move_charge_at_immigrate", -// &orig_migrate))) { -// if (err == ECGROUPVALUENOTEXIST) { -// // Older kernels don't have the ability to migrate memory accounting -// // to the new cgroup. -// CRANE_WARN( -// "This kernel does not support memory usage migration; cgroup " -// "{} memory statistics" -// " will be slightly incorrect.\n", -// cgroup_path.c_str()); -// } else { -// CRANE_WARN( -// "Unable to read cgroup {} memory controller settings for " -// "migration: {} {}\n", -// orig_cgroup_path, err, cgroup_strerror(err)); -// } -// cgroup_free(&orig_cgroup); -// goto after_migrate; -// } -// if (orig_migrate != 3) { -// cgroup_free(&orig_cgroup); -// orig_cgroup = cgroup_new_cgroup(orig_cgroup_path); -// memory_controller = cgroup_add_controller( -// orig_cgroup, -// GetControllerStringView(Controller::MEMORY_CONTROLLER).data()); -// assert(memory_controller != -// NULL); // Memory controller must already exist -// cgroup_add_value_uint64(memory_controller, -// "memory.move_charge_at_immigrate", 3); -// if ((err = cgroup_modify_cgroup(orig_cgroup))) { -// // Not allowed to change settings -// CRANE_WARN( -// "Unable to change cgroup {} memory controller settings for " -// "migration. 
" -// "Some memory accounting will be inaccurate: {} " -// "{}\n", -// orig_cgroup_path, err, cgroup_strerror(err)); -// } else { -// changed_orig = true; -// } -// } -// cgroup_free(&orig_cgroup); -// } -// -after_migrate: - - // orig_cgroup = NULL; - err = cgroup_attach_task_pid(m_cgroup_, pid); - if (err != 0) { - CRANE_WARN("Cannot attach pid {} to cgroup {}: {} {}", pid, - m_cgroup_path_.c_str(), err, cgroup_strerror(err)); - } - -// std::string cpu_cg_path = -// fmt::format("/sys/fs/cgroup/cpu,cpuacct/{}/cgroup.procs", cgroup_path); -// -// std::ifstream cpu_cg_content(cpu_cg_path); -// std::string line; -// -// FILE *cpu_cg_f = fopen(cpu_cg_path.c_str(), "ae"); -// if (cpu_cg_f == nullptr) { -// CRANE_ERROR("fopen failed: {}", strerror(errno)); -// err = 1; -// goto end; -// } else { -// CRANE_TRACE("Open {} succeeded.", cpu_cg_path); -// } -// -// err = fprintf(cpu_cg_f, "%d", pid); -// if (err < 0) { -// CRANE_ERROR("fprintf failed: {}", strerror(errno)); -// goto end; -// } else { -// CRANE_TRACE("fprintf {} bytes succeeded.", err); -// } -// -// err = fflush(cpu_cg_f); -// if (err < 0) { -// CRANE_ERROR("fflush failed: {}", strerror(errno)); -// goto end; -// } else { -// CRANE_TRACE("fflush succeeded."); -// } -// -// fclose(cpu_cg_f); -// -// if (cpu_cg_content.is_open()) { -// while (std::getline(cpu_cg_content, line)) { -// CRANE_TRACE("Pid in {}: {}", cgroup_path, line); -// } -// cpu_cg_content.close(); -// } - -// if (changed_orig) { -// if ((orig_cgroup = cgroup_new_cgroup(orig_cgroup_path)) == NULL) { -// goto after_restore; -// } -// if (((memory_controller = cgroup_add_controller( -// orig_cgroup, -// GetControllerStringView(Controller::MEMORY_CONTROLLER).data())) -// != -// nullptr) && -// (!cgroup_add_value_uint64(memory_controller, -// "memory.move_charge_at_immigrate", -// orig_migrate))) { -// if ((err = cgroup_modify_cgroup(orig_cgroup))) { -// CRANE_WARN( -// "Unable to change cgroup {} memory controller settings for " -// "migration. 
" -// "Some memory accounting will be inaccurate: {} " -// "{}\n", -// orig_cgroup_path, err, cgroup_strerror(err)); -// } else { -// changed_orig = true; -// } -// } -// cgroup_free(&orig_cgroup); -// } -// -// after_restore: -// if (orig_cgroup_path != nullptr) { -// free(orig_cgroup_path); -// } -end: - return err == 0; -} - /* * Cleanup cgroup. * If the cgroup was created by us in the OS, remove it.. @@ -740,59 +804,6 @@ Cgroup::~Cgroup() { } } -bool Cgroup::SetMemorySoftLimitBytes(uint64_t memory_bytes) { - return SetControllerValue( - CgroupConstant::Controller::MEMORY_CONTROLLER, - CgroupConstant::ControllerFile::MEMORY_SOFT_LIMIT_BYTES, memory_bytes); -} - -bool Cgroup::SetMemorySwLimitBytes(uint64_t mem_bytes) { - return SetControllerValue( - CgroupConstant::Controller::MEMORY_CONTROLLER, - CgroupConstant::ControllerFile::MEMORY_MEMSW_LIMIT_IN_BYTES, mem_bytes); -} - -bool Cgroup::SetMemoryLimitBytes(uint64_t memory_bytes) { - return SetControllerValue(CgroupConstant::Controller::MEMORY_CONTROLLER, - CgroupConstant::ControllerFile::MEMORY_LIMIT_BYTES, - memory_bytes); -} - -bool Cgroup::SetCpuShares(uint64_t share) { - return SetControllerValue(CgroupConstant::Controller::CPU_CONTROLLER, - CgroupConstant::ControllerFile::CPU_SHARES, share); -} - -/* - * CPU_CFS_PERIOD_US is the period of time in microseconds for how long a - * cgroup's access to CPU resources is measured. - * CPU_CFS_QUOTA_US is the maximum amount of time in microseconds for which a - * cgroup's tasks are allowed to run during one period. - * CPU_CFS_PERIOD_US should be set to between 1ms(1000) and 1s(1000'000). - * CPU_CFS_QUOTA_US should be set to -1 for unlimited, or larger than 1ms(1000). 
- * See - * https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu - */ -bool Cgroup::SetCpuCoreLimit(double core_num) { - constexpr uint32_t base = 1 << 16; - - bool ret; - ret = SetControllerValue(CgroupConstant::Controller::CPU_CONTROLLER, - CgroupConstant::ControllerFile::CPU_CFS_QUOTA_US, - uint64_t(std::round(base * core_num))); - ret &= SetControllerValue(CgroupConstant::Controller::CPU_CONTROLLER, - CgroupConstant::ControllerFile::CPU_CFS_PERIOD_US, - base); - - return ret; -} - -bool Cgroup::SetBlockioWeight(uint64_t weight) { - return SetControllerValue(CgroupConstant::Controller::BLOCK_CONTROLLER, - CgroupConstant::ControllerFile::BLOCKIO_WEIGHT, - weight); -} - bool Cgroup::SetControllerValue(CgroupConstant::Controller controller, CgroupConstant::ControllerFile controller_file, uint64_t value) { @@ -948,14 +959,90 @@ bool Cgroup::SetControllerStrs(CgroupConstant::Controller controller, return true; } -bool Cgroup::KillAllProcesses() { +bool CgroupV1::MigrateProcIn(pid_t pid) { + using CgroupConstant::Controller; + using CgroupConstant::GetControllerStringView; + + // We want to make sure task migration is turned on for the + // associated memory controller. So, we get to look up the original cgroup. 
+ // + // If there is no memory controller present, we skip all this and just attempt + // a migrate + int err; + // TODO handle memory.move_charge_at_immigrate + // https://github.com/PKUHPC/CraneSched/pull/327/files/eaa0d04dcc4c12a1773ac9a3fd42aa9f898741aa..9dc93a50528c1b22dbf50d0bf40a11a98bbed36d#r1838007422 + err = cgroup_attach_task_pid(m_cgroup_info_.m_cgroup_, pid); + if (err != 0) { + CRANE_WARN("Cannot attach pid {} to cgroup {}: {} {}", pid, + m_cgroup_info_.m_cgroup_path_.c_str(), err, + cgroup_strerror(err)); + } + return err == 0; +} + +bool CgroupV1::SetMemorySoftLimitBytes(uint64_t memory_bytes) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::MEMORY_CONTROLLER, + CgroupConstant::ControllerFile::MEMORY_SOFT_LIMIT_BYTES, memory_bytes); +} + +bool CgroupV1::SetMemorySwLimitBytes(uint64_t mem_bytes) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::MEMORY_CONTROLLER, + CgroupConstant::ControllerFile::MEMORY_MEMSW_LIMIT_IN_BYTES, mem_bytes); +} + +bool CgroupV1::SetMemoryLimitBytes(uint64_t memory_bytes) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::MEMORY_CONTROLLER, + CgroupConstant::ControllerFile::MEMORY_LIMIT_BYTES, memory_bytes); +} + +bool CgroupV1::SetCpuShares(uint64_t share) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::CPU_CONTROLLER, + CgroupConstant::ControllerFile::CPU_SHARES, share); +} + +/* + * CPU_CFS_PERIOD_US is the period of time in microseconds for how long a + * cgroup's access to CPU resources is measured. + * CPU_CFS_QUOTA_US is the maximum amount of time in microseconds for which a + * cgroup's tasks are allowed to run during one period. + * CPU_CFS_PERIOD_US should be set to between 1ms(1000) and 1s(1000'000). + * CPU_CFS_QUOTA_US should be set to -1 for unlimited, or larger than 1ms(1000). 
+ * See + * https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu + */ +bool CgroupV1::SetCpuCoreLimit(double core_num) { + constexpr uint32_t base = 1 << 16; + + bool ret; + ret = m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::CPU_CONTROLLER, + CgroupConstant::ControllerFile::CPU_CFS_QUOTA_US, + uint64_t(std::round(base * core_num))); + ret &= m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::CPU_CONTROLLER, + CgroupConstant::ControllerFile::CPU_CFS_PERIOD_US, base); + + return ret; +} + +bool CgroupV1::SetBlockioWeight(uint64_t weight) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::BLOCK_CONTROLLER, + CgroupConstant::ControllerFile::BLOCKIO_WEIGHT, weight); +} + +bool CgroupV1::KillAllProcesses() { using namespace CgroupConstant::Internal; const char *controller = CgroupConstant::GetControllerStringView( CgroupConstant::Controller::CPU_CONTROLLER) .data(); - const char *cg_name = m_cgroup_path_.c_str(); + const char *cg_name = m_cgroup_info_.m_cgroup_path_.c_str(); int size, rc; pid_t *pids; @@ -976,14 +1063,14 @@ bool Cgroup::KillAllProcesses() { } } -bool Cgroup::Empty() { +bool CgroupV1::Empty() { using namespace CgroupConstant::Internal; const char *controller = CgroupConstant::GetControllerStringView( CgroupConstant::Controller::CPU_CONTROLLER) .data(); - const char *cg_name = m_cgroup_path_.c_str(); + const char *cg_name = m_cgroup_info_.m_cgroup_path_.c_str(); int size, rc; pid_t *pids; @@ -999,22 +1086,15 @@ bool Cgroup::Empty() { return false; } } -bool Cgroup::SetDeviceAccess(const std::unordered_set &devices, - bool set_read, bool set_write, bool set_mknod) { +bool CgroupV1::SetDeviceAccess(const std::unordered_set &devices, + bool set_read, bool set_write, bool set_mknod) { std::string op; if (set_read) op += "r"; if (set_write) op += "w"; if (set_mknod) op += "m"; - std::vector allow_limits; std::vector deny_limits; for (const auto 
&[_, this_device] : Craned::g_this_node_device) { - if (devices.contains(this_device->dev_id)) { - for (const auto &dev_meta : this_device->device_metas) { - allow_limits.emplace_back(fmt::format("{} {}:{} {}", dev_meta.op_type, - dev_meta.major, dev_meta.minor, - op)); - } - } else { + if (!devices.contains(this_device->dev_id)) { for (const auto &dev_meta : this_device->device_metas) { deny_limits.emplace_back(fmt::format("{} {}:{} {}", dev_meta.op_type, dev_meta.major, dev_meta.minor, @@ -1023,19 +1103,357 @@ bool Cgroup::SetDeviceAccess(const std::unordered_set &devices, } } auto ok = true; - if (!allow_limits.empty()) - ok = SetControllerStrs(CgroupConstant::Controller::DEVICES_CONTROLLER, - CgroupConstant::ControllerFile::DEVICES_ALLOW, - allow_limits); - if (ok && !deny_limits.empty()) - ok &= SetControllerStrs(CgroupConstant::Controller::DEVICES_CONTROLLER, - CgroupConstant::ControllerFile::DEVICES_DENY, - deny_limits); + if (!deny_limits.empty()) + ok &= m_cgroup_info_.SetControllerStrs( + CgroupConstant::Controller::DEVICES_CONTROLLER, + CgroupConstant::ControllerFile::DEVICES_DENY, deny_limits); return ok; } +#ifdef CRANE_ENABLE_BPF + +BpfRuntimeInfo::BpfRuntimeInfo() { + bpf_obj_ = nullptr; + bpf_prog_ = nullptr; + dev_map_ = nullptr; + bpf_debug_log_level_ = 0; + bpf_mtx_ = new std::mutex; + bpf_prog_fd_ = -1; + cgroup_count_ = 0; +} + +BpfRuntimeInfo::~BpfRuntimeInfo() { + bpf_obj_ = nullptr; + bpf_prog_ = nullptr; + dev_map_ = nullptr; + delete bpf_mtx_; + bpf_prog_fd_ = -1; + cgroup_count_ = 0; +} + +bool BpfRuntimeInfo::InitializeBpfObj() { + std::unique_lock lk(*bpf_mtx_); + + if (cgroup_count_ == 0) { + bpf_obj_ = bpf_object__open_file(CgroupConstant::BpfObjectFile, NULL); + if (!bpf_obj_) { + CRANE_ERROR("Failed to open BPF object file {}", + CgroupConstant::BpfObjectFile); + bpf_object__close(bpf_obj_); + return false; + } + + // ban libbpf log + libbpf_print_fn_t fn = libbpf_set_print(NULL); + + if (bpf_object__load(bpf_obj_)) { + 
CRANE_ERROR("Failed to load BPF object {}", + CgroupConstant::BpfObjectFile); + bpf_object__close(bpf_obj_); + return false; + } + + bpf_prog_ = bpf_object__find_program_by_name( + bpf_obj_, CgroupConstant::BpfProgramName); + if (!bpf_prog_) { + CRANE_ERROR("Failed to find BPF program {}", + CgroupConstant::BpfProgramName); + bpf_object__close(bpf_obj_); + return false; + } + + bpf_prog_fd_ = bpf_program__fd(bpf_prog_); + if (bpf_prog_fd_ < 0) { + CRANE_ERROR("Failed to get BPF program file descriptor {}", + CgroupConstant::BpfObjectFile); + bpf_object__close(bpf_obj_); + return false; + } + + dev_map_ = + bpf_object__find_map_by_name(bpf_obj_, CgroupConstant::BpfMapName); + if (!dev_map_) { + CRANE_ERROR("Failed to find BPF map {}", CgroupConstant::BpfMapName); + close(bpf_prog_fd_); + bpf_object__close(bpf_obj_); + return false; + } + + struct BpfKey key = {static_cast(0), static_cast(0), + static_cast(0)}; + struct BpfDeviceMeta meta = {static_cast(bpf_debug_log_level_), + static_cast(0), static_cast(0), + static_cast(0), static_cast(0)}; + if (bpf_map__update_elem(dev_map_, &key, sizeof(BpfKey), &meta, + sizeof(BpfDeviceMeta), BPF_ANY)) { + CRANE_ERROR("Failed to set debug log level in BPF"); + return false; + } + } + return ++cgroup_count_ >= 1; +} + +void BpfRuntimeInfo::CloseBpfObj() { + std::unique_lock lk(*bpf_mtx_); + if (BpfInvalid() && --cgroup_count_ == 0) { + close(bpf_prog_fd_); + bpf_object__close(bpf_obj_); + bpf_prog_fd_ = -1; + bpf_obj_ = nullptr; + bpf_prog_ = nullptr; + dev_map_ = nullptr; + RmBpfDeviceMap(); + } +} + +void BpfRuntimeInfo::RmBpfDeviceMap() { + try { + if (std::filesystem::exists(CgroupConstant::BpfDeviceMapFile)) { + std::filesystem::remove(CgroupConstant::BpfDeviceMapFile); + CRANE_TRACE("Successfully removed: {}", CgroupConstant::BpfDeviceMapFile); + } else { + CRANE_TRACE("File does not exist: {}", CgroupConstant::BpfDeviceMapFile); + } + } catch (const std::filesystem::filesystem_error &e) { + CRANE_ERROR("Error: {}", 
e.what()); + } +} +#endif + +CgroupV2::CgroupV2(const std::string &path, struct cgroup *handle, uint64_t id) + : m_cgroup_info_(path, handle, id) { +#ifdef CRANE_ENABLE_BPF + if (bpf_runtime_info_.InitializeBpfObj()) { + CRANE_TRACE("Bpf object initialization succeed"); + } else { + CRANE_TRACE("Bpf object initialization failed"); + } +#endif +} + +CgroupV2::~CgroupV2() { +#ifdef CRANE_ENABLE_BPF + if (!m_cgroup_bpf_devices.empty()) { + EraseBpfDeviceMap(); + } + bpf_runtime_info_.CloseBpfObj(); +#endif +} + +/** + *If a controller implements an absolute resource guarantee and/or limit, + * the interface files should be named “min” and “max” respectively. + * If a controller implements best effort resource guarantee and/or limit, + * the interface files should be named “low” and “high” respectively. + */ + +bool CgroupV2::SetCpuCoreLimit(double core_num) { + constexpr uint32_t period = 1 << 16; + uint64_t quota = static_cast(period * core_num); + std::string cpuMaxValue = + std::to_string(quota) + " " + std::to_string(period); + return m_cgroup_info_.SetControllerStr( + CgroupConstant::Controller::CPU_CONTROLLER_V2, + CgroupConstant::ControllerFile::CPU_MAX_V2, cpuMaxValue.c_str()); +} + +bool CgroupV2::SetCpuShares(uint64_t share) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::CPU_CONTROLLER_V2, + CgroupConstant::ControllerFile::CPU_WEIGHT_V2, share); +} + +bool CgroupV2::SetMemoryLimitBytes(uint64_t memory_bytes) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::MEMORY_CONTORLLER_V2, + CgroupConstant::ControllerFile::MEMORY_MAX_V2, memory_bytes); +} + +bool CgroupV2::SetMemorySoftLimitBytes(uint64_t memory_bytes) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::MEMORY_CONTORLLER_V2, + CgroupConstant::ControllerFile::MEMORY_HIGH_V2, memory_bytes); +} + +bool CgroupV2::SetMemorySwLimitBytes(uint64_t memory_bytes) { + return m_cgroup_info_.SetControllerValue( + 
CgroupConstant::Controller::MEMORY_CONTORLLER_V2, + CgroupConstant::ControllerFile::MEMORY_SWAP_MAX_V2, memory_bytes); +} + +bool CgroupV2::SetBlockioWeight(uint64_t weight) { + return m_cgroup_info_.SetControllerValue( + CgroupConstant::Controller::IO_CONTROLLER_V2, + CgroupConstant::ControllerFile::IO_WEIGHT_V2, weight); +} + +bool CgroupV2::SetDeviceAccess(const std::unordered_set &devices, + bool set_read, bool set_write, bool set_mknod) { +#ifdef CRANE_ENABLE_BPF + if (!bpf_runtime_info_.BpfInvalid()) { + CRANE_WARN("BPF is not initialized."); + return false; + } + int cgroup_fd; + std::string slash = "/"; + std::string cgroup_path = CgroupConstant::RootCgroupFullPath + slash + + m_cgroup_info_.m_cgroup_path_; + cgroup_fd = open(cgroup_path.c_str(), O_RDONLY); + if (cgroup_fd < 0) { + CRANE_ERROR("Failed to open cgroup"); + return false; + } + + short access = 0; + if (set_read) access |= BPF_DEVCG_ACC_READ; + if (set_write) access |= BPF_DEVCG_ACC_WRITE; + if (set_mknod) access |= BPF_DEVCG_ACC_MKNOD; + + auto &bpf_devices = m_cgroup_bpf_devices; + for (const auto &[_, this_device] : Craned::g_this_node_device) { + if (!devices.contains(this_device->dev_id)) { + for (const auto &dev_meta : this_device->device_metas) { + short op_type = 0; + if (dev_meta.op_type == 'c') { + op_type |= BPF_DEVCG_DEV_CHAR; + } else if (dev_meta.op_type == 'b') { + op_type |= BPF_DEVCG_DEV_BLOCK; + } else { + op_type |= 0xffff; + } + bpf_devices.push_back({dev_meta.major, dev_meta.minor, + BPF_PERMISSION::DENY, access, op_type}); + } + } + } + { + std::unique_lock lk(*bpf_runtime_info_.BpfMutex()); + for (int i = 0; i < bpf_devices.size(); i++) { + struct BpfKey key = {m_cgroup_info_.m_cgroup_id, bpf_devices[i].major, + bpf_devices[i].minor}; + if (bpf_map__update_elem(bpf_runtime_info_.BpfDevMap(), &key, + sizeof(BpfKey), &bpf_devices[i], + sizeof(BpfDeviceMeta), BPF_ANY)) { + CRANE_ERROR("Failed to update BPF map major {},minor {} cgroup id {}", + bpf_devices[i].major, 
bpf_devices[i].minor, key.cgroup_id); + close(cgroup_fd); + return false; + } + } + + if (bpf_prog_attach(bpf_runtime_info_.BpfProgFd(), cgroup_fd, + BPF_CGROUP_DEVICE, 0) < 0) { + CRANE_ERROR("Failed to attach BPF program"); + close(cgroup_fd); + return false; + } + } + close(cgroup_fd); + return true; +#endif + +#ifndef CRANE_ENABLE_BPF + CRANE_WARN( + "BPF is disabled in craned, you can use Cgroup V1 to set devices access"); + return false; +#endif +} + +#ifdef CRANE_ENABLE_BPF +bool CgroupV2::EraseBpfDeviceMap() { + { + if (!bpf_runtime_info_.BpfInvalid()) { + CRANE_WARN("BPF is not initialized."); + return false; + } + std::unique_lock lk(*bpf_runtime_info_.BpfMutex()); + auto &bpf_devices = m_cgroup_bpf_devices; + for (int i = 0; i < bpf_devices.size(); i++) { + struct BpfKey key = {m_cgroup_info_.m_cgroup_id, bpf_devices[i].major, + bpf_devices[i].minor}; + if (bpf_map__delete_elem(bpf_runtime_info_.BpfDevMap(), &key, + sizeof(BpfKey), BPF_ANY)) { + CRANE_ERROR( + "Failed to delete BPF map major {},minor {} in cgroup id {}", + bpf_devices[i].major, bpf_devices[i].minor, key.cgroup_id); + return false; + } + } + } + return true; +} +#endif + +bool CgroupV2::KillAllProcesses() { + using namespace CgroupConstant::Internal; + + const char *controller = CgroupConstant::GetControllerStringView( + CgroupConstant::Controller::CPU_CONTROLLER_V2) + .data(); + + const char *cg_name = m_cgroup_info_.m_cgroup_path_.c_str(); + + int size, rc; + pid_t *pids; + + rc = cgroup_get_procs(const_cast(cg_name), + const_cast(controller), &pids, &size); + + if (rc == 0) { + for (int i = 0; i < size; ++i) { + kill(pids[i], SIGKILL); + } + free(pids); + return true; + } else { + CRANE_ERROR("cgroup_get_procs error on cgroup \"{}\": {}", cg_name, + cgroup_strerror(rc)); + return false; + } +} + +bool CgroupV2::Empty() { + using namespace CgroupConstant::Internal; + + const char *controller = CgroupConstant::GetControllerStringView( + CgroupConstant::Controller::CPU_CONTROLLER_V2) + 
.data(); + + const char *cg_name = m_cgroup_info_.m_cgroup_path_.c_str(); + + int size, rc; + pid_t *pids; + + rc = cgroup_get_procs(const_cast(cg_name), + const_cast(controller), &pids, &size); + if (rc == 0) { + free(pids); + return size == 0; + } else { + CRANE_ERROR("cgroup_get_procs error on cgroup \"{}\": {}", cg_name, + cgroup_strerror(rc)); + return false; + } +} + +bool CgroupV2::MigrateProcIn(pid_t pid) { + using CgroupConstant::Controller; + using CgroupConstant::GetControllerStringView; + int err; +after_migrate: + + err = cgroup_attach_task_pid(m_cgroup_info_.m_cgroup_, pid); + if (err != 0) { + CRANE_WARN("Cannot attach pid {} to cgroup {}: {} {}", pid, + m_cgroup_info_.m_cgroup_path_.c_str(), err, + cgroup_strerror(err)); + } +end: + return err == 0; +} + bool AllocatableResourceAllocator::Allocate(const AllocatableResource &resource, - Cgroup *cg) { + CgroupInterface *cg) { bool ok; ok = cg->SetCpuCoreLimit(static_cast(resource.cpu_count)); ok &= cg->SetMemoryLimitBytes(resource.memory_bytes); @@ -1048,7 +1466,7 @@ bool AllocatableResourceAllocator::Allocate(const AllocatableResource &resource, } bool AllocatableResourceAllocator::Allocate( - const crane::grpc::AllocatableResource &resource, Cgroup *cg) { + const crane::grpc::AllocatableResource &resource, CgroupInterface *cg) { bool ok; ok = cg->SetCpuCoreLimit(resource.cpu_core_limit()); ok &= cg->SetMemoryLimitBytes(resource.memory_limit_bytes()); @@ -1061,14 +1479,24 @@ bool AllocatableResourceAllocator::Allocate( } bool DedicatedResourceAllocator::Allocate( - const crane::grpc::DedicatedResourceInNode &request_resource, Cgroup *cg) { + const crane::grpc::DedicatedResourceInNode &request_resource, + CgroupInterface *cg) { std::unordered_set all_request_slots; for (const auto &[_, type_slots_map] : request_resource.name_type_map()) { for (const auto &[__, slots] : type_slots_map.type_slots_map()) all_request_slots.insert(slots.slots().cbegin(), slots.slots().cend()); }; - if 
(!cg->SetDeviceAccess(all_request_slots, true, true, true)) return false; + if (!cg->SetDeviceAccess(all_request_slots, true, true, true)) { + if (g_cg_mgr->GetCgroupVersion() == + CgroupConstant::CgroupVersion::CGROUP_V1) { + CRANE_WARN("Allocate devices access failed in Cgroup V1."); + } else if (g_cg_mgr->GetCgroupVersion() == + CgroupConstant::CgroupVersion::CGROUP_V2) { + CRANE_WARN("Allocate devices access failed in Cgroup V2."); + } + return true; + } return true; } } // namespace Craned \ No newline at end of file diff --git a/src/Craned/CgroupManager.h b/src/Craned/CgroupManager.h index 3af26f8c..50c01962 100644 --- a/src/Craned/CgroupManager.h +++ b/src/Craned/CgroupManager.h @@ -27,14 +27,25 @@ #include + #include "CranedPublicDefs.h" #include "crane/AtomicHashMap.h" #include "crane/OS.h" +#ifdef CRANE_ENABLE_BPF +# include +#endif + namespace Craned { namespace CgroupConstant { +enum class CgroupVersion : uint64_t { + CGROUP_V1 = 0, + CGROUP_V2, + UNDEFINED, +}; + enum class Controller : uint64_t { MEMORY_CONTROLLER = 0, CPUACCT_CONTROLLER, @@ -43,6 +54,12 @@ enum class Controller : uint64_t { CPU_CONTROLLER, DEVICES_CONTROLLER, + MEMORY_CONTORLLER_V2, + CPU_CONTROLLER_V2, + IO_CONTROLLER_V2, + CPUSET_CONTROLLER_V2, + PIDS_CONTROLLER_V2, + ControllerCount, }; @@ -59,18 +76,46 @@ enum class ControllerFile : uint64_t { DEVICES_DENY, DEVICES_ALLOW, + // V2 + + CPU_WEIGHT_V2, + CPU_MAX_V2, + + MEMORY_MAX_V2, + MEMORY_SWAP_MAX_V2, + MEMORY_HIGH_V2, + + IO_WEIGHT_V2, + // root cgroup controller can't be change or created - ControllerFileCount + ControllerFileCount, }; inline const char *kTaskCgPathPrefix = "Crane_Task_"; - +inline const char *RootCgroupFullPath = "/sys/fs/cgroup"; +#ifdef CRANE_ENABLE_BPF +inline const char *BpfObjectFile = "/etc/crane/cgroup_dev_bpf.o"; +inline const char *BpfDeviceMapFile = "/sys/fs/bpf/craned_dev_map"; +inline const char *BpfMapName = "craned_dev_map"; +inline const char *BpfProgramName = "craned_device_access"; +#endif 
namespace Internal { constexpr std::array(Controller::ControllerCount)> ControllerStringView{ - "memory", "cpuacct", "freezer", "blkio", "cpu", "devices", + "memory", + "cpuacct", + "freezer", + "blkio", + "cpu", + "devices", + // V2 + "memory", + "cpu", + "io", + "cpuset", + "pids", }; constexpr std::array &devices, + bool set_read, bool set_write, + bool set_mknod) = 0; + virtual bool MigrateProcIn(pid_t pid) = 0; + + virtual bool KillAllProcesses() = 0; + + virtual bool Empty() = 0; + virtual const std::string &GetCgroupString() const = 0; +}; + class Cgroup { public: - Cgroup(const std::string &path, struct cgroup *handle) - : m_cgroup_path_(path), m_cgroup_(handle) {} + Cgroup(const std::string &path, struct cgroup *handle, uint64_t id = 0) + : m_cgroup_path_(path), m_cgroup_(handle), m_cgroup_id(id) {} ~Cgroup(); struct cgroup *NativeHandle() { return m_cgroup_; } - const std::string &GetCgroupString() const { return m_cgroup_path_; }; - // Using the zombie object pattern as exceptions are not available. 
bool Valid() const { return m_cgroup_ != nullptr; } - bool SetCpuCoreLimit(double core_num); - bool SetCpuShares(uint64_t share); - bool SetMemoryLimitBytes(uint64_t memory_bytes); - bool SetMemorySwLimitBytes(uint64_t mem_bytes); - bool SetMemorySoftLimitBytes(uint64_t memory_bytes); - bool SetBlockioWeight(uint64_t weight); - bool SetDeviceAccess(const std::unordered_set &devices, bool set_read, - bool set_write, bool set_mknod); bool SetControllerValue(CgroupConstant::Controller controller, CgroupConstant::ControllerFile controller_file, uint64_t value); @@ -223,30 +312,150 @@ class Cgroup { bool SetControllerStrs(CgroupConstant::Controller controller, CgroupConstant::ControllerFile controller_file, const std::vector &strs); - bool KillAllProcesses(); - bool Empty(); + // CgroupConstant::CgroupVersion cg_vsion; // maybe for hybird mode + bool ModifyCgroup_(CgroupConstant::ControllerFile controller_file); + std::string m_cgroup_path_; + mutable struct cgroup *m_cgroup_; + uint64_t m_cgroup_id; +}; + +class CgroupV1 : public CgroupInterface { + public: + CgroupV1(const std::string &path, struct cgroup *handle) + : m_cgroup_info_(path, handle) {} + ~CgroupV1() override = default; + bool SetCpuCoreLimit(double core_num) override; + bool SetCpuShares(uint64_t share) override; + bool SetMemoryLimitBytes(uint64_t memory_bytes) override; + bool SetMemorySwLimitBytes(uint64_t mem_bytes) override; + bool SetMemorySoftLimitBytes(uint64_t memory_bytes) override; + bool SetBlockioWeight(uint64_t weight) override; + + bool SetDeviceAccess(const std::unordered_set &devices, bool set_read, + bool set_write, bool set_mknod) override; + + bool KillAllProcesses() override; + + bool Empty() override; + + bool MigrateProcIn(pid_t pid) override; - bool MigrateProcIn(pid_t pid); + const std::string &GetCgroupString() const override { + return m_cgroup_info_.m_cgroup_path_; + } private: - bool ModifyCgroup_(CgroupConstant::ControllerFile controller_file); + Cgroup m_cgroup_info_; +}; - 
std::string m_cgroup_path_; - mutable struct cgroup *m_cgroup_; +#ifdef CRANE_ENABLE_BPF +class BpfRuntimeInfo { + public: + BpfRuntimeInfo(); + ~BpfRuntimeInfo(); + bool InitializeBpfObj(); + void CloseBpfObj(); + void RmBpfDeviceMap(); + + struct bpf_object *BpfObj() { return bpf_obj_; } + struct bpf_program *BpfProgram() { return bpf_prog_; } + std::mutex *BpfMutex() { return bpf_mtx_; } + struct bpf_map *BpfDevMap() { return dev_map_; } + int BpfProgFd() { return bpf_prog_fd_; } + void SetLogLevel(uint32_t log_devel) { bpf_debug_log_level_ = log_devel; } + bool BpfInvalid() { + return bpf_obj_ && bpf_prog_ && dev_map_ && bpf_prog_fd_ != -1 && + cgroup_count_ > 0; + } + + private: + uint32_t bpf_debug_log_level_; + struct bpf_object *bpf_obj_; + struct bpf_program *bpf_prog_; + struct bpf_map *dev_map_; + int bpf_prog_fd_; + std::mutex *bpf_mtx_; + size_t cgroup_count_; +}; +#endif + +class CgroupV2 : public CgroupInterface { + public: + CgroupV2(const std::string &path, struct cgroup *handle, uint64_t id); + ~CgroupV2() override; + bool SetCpuCoreLimit(double core_num) override; + bool SetCpuShares(uint64_t share) override; + bool SetMemoryLimitBytes(uint64_t memory_bytes) override; + bool SetMemorySwLimitBytes(uint64_t mem_bytes) override; + bool SetMemorySoftLimitBytes(uint64_t memory_bytes) override; + bool SetBlockioWeight(uint64_t weight) override; + + // use BPF + /** + * Device controller manages access to device files. + It includes both creation of new device files (using mknod), + and access to the existing device files. + + Cgroup v2 device controller has no interface files and + is implemented on top of cgroup BPF. To control access + to device files, a user may create bpf programs of the + BPF_CGROUP_DEVICE type and attach them to cgroups. + On an attempt to access a device file, corresponding BPF + programs will be executed, and depending on the return + value the attempt will succeed or fail with -EPERM. 
+ + A BPF_CGROUP_DEVICE program takes a pointer to the + bpf_cgroup_dev_ctx structure, which describes the device + access attempt: access type (mknod/read/write) and device + (type, major and minor numbers). + If the program returns 0, the attempt fails with -EPERM, otherwise it + succeeds. + + An example of BPF_CGROUP_DEVICE program may be found + in the kernel source tree in the tools/testing/selftests/bpf/dev_cgroup.c + file. reference from: + https://www.kernel.org/doc/html/v5.10/admin-guide/cgroup-v2.html#device-controller + */ + bool SetDeviceAccess(const std::unordered_set &devices, bool set_read, + bool set_write, bool set_mknod) override; +#ifdef CRANE_ENABLE_BPF + bool EraseBpfDeviceMap(); + static void SetBpfDebugLogLevel(uint32_t l) { + bpf_runtime_info_.SetLogLevel(l); + } +#endif + bool KillAllProcesses() override; + + bool Empty() override; + + bool MigrateProcIn(pid_t pid) override; + + const std::string &GetCgroupString() const override { + return m_cgroup_info_.m_cgroup_path_; + } + + private: +#ifdef CRANE_ENABLE_BPF + std::vector m_cgroup_bpf_devices{}; + static BpfRuntimeInfo bpf_runtime_info_; +#endif + Cgroup m_cgroup_info_; }; class AllocatableResourceAllocator { public: - static bool Allocate(const AllocatableResource &resource, Cgroup *cg); + static bool Allocate(const AllocatableResource &resource, + CgroupInterface *cg); static bool Allocate(const crane::grpc::AllocatableResource &resource, - Cgroup *cg); + CgroupInterface *cg); }; class DedicatedResourceAllocator { public: static bool Allocate( - const crane::grpc::DedicatedResourceInNode &request_resource, Cgroup *cg); + const crane::grpc::DedicatedResourceInNode &request_resource, + CgroupInterface *cg); }; class CgroupManager { @@ -257,6 +466,8 @@ class CgroupManager { return bool(m_mounted_controllers_ & ControllerFlags{controller}); } + void ControllersMounted(); + bool QueryTaskInfoOfUidAsync(uid_t uid, TaskInfoOfUid *info); std::optional QueryTaskExecutionNode(task_id_t 
task_id); @@ -265,7 +476,7 @@ class CgroupManager { bool CheckIfCgroupForTasksExists(task_id_t task_id); - bool AllocateAndGetCgroup(task_id_t task_id, Cgroup **cg); + bool AllocateAndGetCgroup(task_id_t task_id, CgroupInterface **cg); bool MigrateProcToCgroupOfTask(pid_t pid, task_id_t task_id); @@ -281,28 +492,43 @@ class CgroupManager { std::vector GetResourceEnvListOfTask(task_id_t task_id); + void SetCgroupVersion(CgroupConstant::CgroupVersion v) { cg_version_ = v; } + + CgroupConstant::CgroupVersion GetCgroupVersion() { return cg_version_; } + private: static std::string CgroupStrByTaskId_(task_id_t task_id); - std::unique_ptr CreateOrOpen_(const std::string &cgroup_string, - ControllerFlags preferred_controllers, - ControllerFlags required_controllers, - bool retrieve); + std::unique_ptr CreateOrOpen_( + const std::string &cgroup_string, ControllerFlags preferred_controllers, + ControllerFlags required_controllers, bool retrieve); int InitializeController_(struct cgroup &cgroup, CgroupConstant::Controller controller, bool required, bool has_cgroup, bool &changed_cgroup); - void RmAllTaskCgroups_(); - void RmAllTaskCgroupsUnderController_(CgroupConstant::Controller controller); + static void RmAllTaskCgroups_(); + static void RmAllTaskCgroupsUnderController_( + CgroupConstant::Controller controller); + + void RmAllTaskCgroupsV2_(); + void RmCgroupsV2_(const std::string &root_cgroup_path, + const std::string &match_str); + +#ifdef CRANE_ENABLE_BPF + void RmBpfDevMap(); +#endif ControllerFlags m_mounted_controllers_; + CgroupConstant::CgroupVersion cg_version_; + util::AtomicHashMap m_task_id_to_cg_spec_map_; - util::AtomicHashMap> + util::AtomicHashMap> m_task_id_to_cg_map_; util::AtomicHashMap(log_level)); +#endif if (config["CranedUnixSockPath"]) g_config.CranedUnixSockPath = g_config.CraneBaseDir + @@ -608,12 +610,21 @@ void GlobalVariableInit() { using Craned::CgroupConstant::Controller; g_cg_mgr = std::make_unique(); g_cg_mgr->Init(); - if 
(!g_cg_mgr->Mounted(Controller::CPU_CONTROLLER) || - !g_cg_mgr->Mounted(Controller::MEMORY_CONTROLLER) || - !g_cg_mgr->Mounted(Controller::DEVICES_CONTROLLER)) { + if (g_cg_mgr->GetCgroupVersion() == + Craned::CgroupConstant::CgroupVersion::CGROUP_V1 && + (!g_cg_mgr->Mounted(Controller::CPU_CONTROLLER) || + !g_cg_mgr->Mounted(Controller::MEMORY_CONTROLLER) || + !g_cg_mgr->Mounted(Controller::DEVICES_CONTROLLER))) { CRANE_ERROR("Failed to initialize cpu,memory,devices cgroups controller."); std::exit(1); } + if (g_cg_mgr->GetCgroupVersion() == + Craned::CgroupConstant::CgroupVersion::CGROUP_V2 && + (!g_cg_mgr->Mounted(Controller::CPU_CONTROLLER_V2) || + !g_cg_mgr->Mounted(Controller::MEMORY_CONTORLLER_V2))) { + CRANE_ERROR("Failed to initialize cpu,memory cgroups controller."); + std::exit(1); + } g_thread_pool = std::make_unique(std::thread::hardware_concurrency()); diff --git a/src/Craned/DeviceManager.h b/src/Craned/DeviceManager.h index a83b3835..d28f3597 100644 --- a/src/Craned/DeviceManager.h +++ b/src/Craned/DeviceManager.h @@ -23,6 +23,8 @@ namespace Craned { + + struct BasicDevice { std::string dev_id; diff --git a/src/Craned/TaskManager.cpp b/src/Craned/TaskManager.cpp index 08575054..4ef708f7 100644 --- a/src/Craned/TaskManager.cpp +++ b/src/Craned/TaskManager.cpp @@ -1042,7 +1042,7 @@ void TaskManager::LaunchTaskInstanceMt_(TaskInstance* instance) { return; } - Cgroup* cg; + CgroupInterface* cg; bool ok = g_cg_mgr->AllocateAndGetCgroup(task_id, &cg); if (!ok) { CRANE_ERROR("Failed to allocate cgroup for task #{}", task_id); diff --git a/src/Craned/TaskManager.h b/src/Craned/TaskManager.h index a7f5e80d..cbef087d 100644 --- a/src/Craned/TaskManager.h +++ b/src/Craned/TaskManager.h @@ -190,7 +190,7 @@ struct TaskInstance { std::unique_ptr meta; std::string cgroup_path; - Cgroup* cgroup; + CgroupInterface* cgroup; struct event* termination_timer{nullptr}; // Task execution results diff --git a/src/Misc/BPF/CMakeLists.txt b/src/Misc/BPF/CMakeLists.txt new 
file mode 100644
index 00000000..2e5e166c
--- /dev/null
+++ b/src/Misc/BPF/CMakeLists.txt
@@ -0,0 +1,39 @@
+set(BPF_SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cgroup_dev_bpf.c")
+set(BPF_OBJECT_FILE "${CMAKE_CURRENT_BINARY_DIR}/cgroup_dev_bpf.o")
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+    set(BPF_ARCH "__TARGET_ARCH_x86")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+    set(BPF_ARCH "__TARGET_ARCH_arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm")
+    set(BPF_ARCH "__TARGET_ARCH_arm")
+else()
+    message(FATAL_ERROR "Unsupported architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set(BPF_COMPILER ${CLANG_EXECUTABLE})
+else()
+    # set clang
+    set(BPF_COMPILER ${CMAKE_C_COMPILER})
+endif()
+
+list(APPEND BPF_COMPILE_OPTIONS -g -O2 )
+
+add_custom_command(
+    OUTPUT ${BPF_OBJECT_FILE}
+    COMMAND ${BPF_COMPILER} -target bpf ${BPF_COMPILE_OPTIONS} -D${BPF_ARCH} -c ${BPF_SOURCE_FILE} -o ${BPF_OBJECT_FILE}
+    DEPENDS ${BPF_SOURCE_FILE}
+    COMMENT "Building BPF object: ${BPF_OBJECT_FILE}"
+)
+
+add_custom_target(cgroup_dev_bpf_object ALL
+    DEPENDS ${BPF_OBJECT_FILE}
+)
+
+
+
+
+
+
+
diff --git a/src/Misc/BPF/cgroup_dev_bpf.c b/src/Misc/BPF/cgroup_dev_bpf.c
new file mode 100644
index 00000000..cbf69639
--- /dev/null
+++ b/src/Misc/BPF/cgroup_dev_bpf.c
@@ -0,0 +1,111 @@
+// cgroup/dev eBPF program enforcing Craned's per-cgroup device rules.
+// NOTE(review): the header names on the three #include lines were lost in the
+// extracted patch text; they are restored from the identifiers used below
+// (bpf_cgroup_dev_ctx, SEC/bpf_printk, uint64_t) -- confirm against the repo.
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <stdint.h>
+
+// Values stored in BpfDeviceMeta::permission.
+enum BPF_PERMISSION { ALLOW = 0, DENY };
+
+// Key of craned_dev_map: a cgroup id plus a device major/minor pair.
+#pragma pack(push, 8)
+struct BpfKey {
+  uint64_t cgroup_id;
+  uint32_t major;
+  uint32_t minor;
+};
+#pragma pack(pop)
+
+// Value of craned_dev_map: the rule recorded for one device in one cgroup.
+#pragma pack(push, 8)
+struct BpfDeviceMeta {
+  uint32_t major;  // doubles as the log level in the entry keyed (0, 0, 0)
+  uint32_t minor;
+  int permission;  // compared against enum BPF_PERMISSION
+  short access;    // tested against the BPF_DEVCG_ACC_* bits
+  short type;      // not read by this program
+};
+#pragma pack(pop)
+
+#define MAX_ENTRIES 4096
+
+// Hash map holding the device rules; pinned by name (LIBBPF_PIN_BY_NAME).
+struct {
+  __uint(type, BPF_MAP_TYPE_HASH);
+  __type(key, struct BpfKey);
+  __type(value, struct BpfDeviceMeta);
+  __uint(max_entries, MAX_ENTRIES);
+  __uint(pinning, LIBBPF_PIN_BY_NAME);
+} craned_dev_map SEC(".maps");
+
+// Decides whether the current task may access the device described by ctx.
+// Returns 1 to allow and 0 to deny. Access is allowed unless the map holds an
+// entry for (current cgroup id, major, minor) whose permission is DENY and
+// whose access mask shares a READ/WRITE/MKNOD bit with the requested access.
+SEC("cgroup/dev")
+int craned_device_access(struct bpf_cgroup_dev_ctx *ctx) {
+  struct BpfKey key = {bpf_get_current_cgroup_id(), ctx->major, ctx->minor};
+  // The entry stored under key (0, 0, 0) carries the log level in value.major.
+  struct BpfKey log_key = {(uint64_t)0, (uint32_t)0, (uint32_t)0};
+  struct BpfDeviceMeta *log_level_meta =
+      (struct BpfDeviceMeta *)bpf_map_lookup_elem(&craned_dev_map, &log_key);
+  // 0 means trace mode, 1 means debug mode; a missing entry defaults to 0.
+  uint32_t log_level = log_level_meta ? log_level_meta->major : 0;
+
+  if (log_level <= 1) bpf_printk("ctx cgroup ID : %lu\n", key.cgroup_id);
+
+  struct BpfDeviceMeta *meta =
+      (struct BpfDeviceMeta *)bpf_map_lookup_elem(&craned_dev_map, &key);
+  if (!meta) {
+    if (log_level <= 1) {
+      // cgroup_id is 64-bit, so it is printed with %lu rather than %d.
+      bpf_printk("BpfDeviceMeta not found for key cgroup ID: %lu,\n",
+                 key.cgroup_id);
+      bpf_printk("Access allowed for device major=%d, minor=%d\n", ctx->major,
+                 ctx->minor);
+    }
+    return 1;
+  }
+
+  // The requested access bits are the upper 16 bits of access_type; the lower
+  // 16 bits (the device type) are not compared against meta->type here.
+  short access = ctx->access_type >> 16;
+
+  if (log_level <= 1)
+    bpf_printk("meta Device major=%d, minor=%d, access_type=%d\n", meta->major,
+               meta->minor, meta->access);
+
+  if (ctx->major == meta->major && ctx->minor == meta->minor &&
+      meta->permission == DENY) {
+    // Deny when any requested access bit is also set in the rule's mask.
+    int allowed = 1;
+    if (access & meta->access & BPF_DEVCG_ACC_READ) {
+      if (log_level <= 1)
+        bpf_printk("Read access denied for device major=%d, minor=%d\n",
+                   ctx->major, ctx->minor);
+      allowed = 0;
+    }
+    if (access & meta->access & BPF_DEVCG_ACC_WRITE) {
+      if (log_level <= 1)
+        bpf_printk("Write access denied for device major=%d, minor=%d\n",
+                   ctx->major, ctx->minor);
+      allowed = 0;
+    }
+    if (access & meta->access & BPF_DEVCG_ACC_MKNOD) {
+      if (log_level <= 1)
+        bpf_printk("Mknod access denied for device major=%d, minor=%d\n",
+                   ctx->major, ctx->minor);
+      allowed = 0;
+    }
+    return allowed;
+  }
+
+  if (log_level <= 1)
+    bpf_printk("Access allowed for device major=%d, minor=%d\n", ctx->major,
+               ctx->minor);
+  return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/src/Misc/CMakeLists.txt b/src/Misc/CMakeLists.txt
index 718140f3..5ea61ec2 100644
--- a/src/Misc/CMakeLists.txt
+++ b/src/Misc/CMakeLists.txt
@@ -1 +1,7 @@
-add_subdirectory(Pam)
\ No newline at end of file
+add_subdirectory(Pam)
+if(ENABLE_BPF)
+    add_subdirectory(BPF)
+    message(STATUS "cgroup_dev_bpf_object is enabled.")
+else()
+    message(STATUS "cgroup_dev_bpf_object is disabled")
+endif()
\ No newline at end of file