Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
599f968
[wip] [core] (cgroups 14/n) Clean up bazel targets and expose just the
israbbani Sep 30, 2025
77f6b64
[core] Cleaning up Cgroup related bazel targets. CgroupManagerInteface
israbbani Oct 6, 2025
9fd1160
[core] (cgroups 14/n) Clean up bazel targets and enable cross-platform
israbbani Oct 6, 2025
a5f4b5a
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
762b5cf
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 6, 2025
8d07f6f
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 6, 2025
b92677e
Forgot to use clang locally
israbbani Oct 7, 2025
a870d5a
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
e34f19b
Unused imports
israbbani Oct 7, 2025
55f1ec8
unused includes breaking the build
israbbani Oct 7, 2025
4c7545e
fixing the macos build
israbbani Oct 7, 2025
aef6bd8
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 7, 2025
bac50d3
[core] (cgroups 15/n) Changing the cgroup heirarchy to have three
israbbani Oct 7, 2025
50b2d14
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 7, 2025
59366ce
move operators for NoopCgroupManager
israbbani Oct 7, 2025
44ab09e
Merge branch 'irabbani/cgroups-14' of github.com:ray-project/ray into…
israbbani Oct 7, 2025
0c8d8e3
Update src/ray/common/cgroup2/cgroup_manager_factory.h
israbbani Oct 7, 2025
6dc39ad
feedback
israbbani Oct 7, 2025
60d77bb
up
israbbani Oct 8, 2025
bfd2482
Merge branch 'master' into irabbani/cgroups-14
israbbani Oct 9, 2025
59a0bef
Merge branch 'irabbani/cgroups-14' into irabbani/cgroups-15
israbbani Oct 9, 2025
ee024ea
Different cgroup hierarchy.
israbbani Oct 10, 2025
bf390de
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 10, 2025
028f3d2
Merge branch 'irabbani/cgroups-15' of github.com:ray-project/ray into…
israbbani Oct 10, 2025
cb34c9b
typo
israbbani Oct 10, 2025
fb7d1ac
one more typo
israbbani Oct 10, 2025
8b443f5
one more
israbbani Oct 10, 2025
505c4d5
[core] (cgroups 16/n) Changing default values for the system cgroup to
israbbani Oct 11, 2025
eddb0b2
Cleaning up docs and log lines
israbbani Oct 11, 2025
4550bae
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 11, 2025
30770e0
Merge branch 'irabbani/cgroups-15' of github.com:ray-project/ray into…
israbbani Oct 11, 2025
117248d
[core] (cgroups 16/n) Updating the algorithm for determining the default
israbbani Oct 12, 2025
c4884ee
Merge branch 'master' into irabbani/cgroups-15
israbbani Oct 12, 2025
35ba7be
Merge branch 'irabbani/cgroups-15' into irabbani/cgroups-16
israbbani Oct 12, 2025
be2f048
typos
israbbani Oct 12, 2025
d49a1e0
Merge branch 'irabbani/cgroups-16' of github.com:ray-project/ray into…
israbbani Oct 12, 2025
a111fd3
another typo
israbbani Oct 12, 2025
8a43394
another one
israbbani Oct 12, 2025
071286f
Merge branch 'master' into irabbani/cgroups-16
edoakes Oct 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 35 additions & 25 deletions python/ray/_private/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,41 @@ def env_set_by_user(key):

ID_SIZE = 28

# The following constants are used to create default values for
# resource isolation when it is enabled.
# TODO(54703): Link to OSS documentation about the feature once it's available.
DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"
# The default proportion of cpu cores to reserve for ray system processes.
DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05
)
# The default minimum number of cpu cores to reserve for ray system processes.
# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION < this value.
DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES = env_float(
"RAY_DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES", 1.0
)
# The default maximum number of cpu cores to reserve for ray system processes.
# This value is used if the available_cores * DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION > this value.
DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES = env_float(
"RAY_DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES", 3.0
)
# The values for SYSTEM_RESERVED_MEMORY do not include the memory reserved
# for the object store.
# The default proportion of available memory to reserve for ray system processes.
DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
)
# The default minimum number of bytes to reserve for ray system processes.
# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION < this value.
DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES", (500) * (1024**2)
)
# The default maximum number of bytes to reserve for ray system processes.
# This value is used if the available_memory * DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION > this value.
DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES", (10) * (1024**3)
)

# The default maximum number of bytes to allocate to the object store unless
# overridden by the user.
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = env_integer(
Expand All @@ -77,31 +112,6 @@ def env_set_by_user(key):
"RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION",
0.3,
)

# The following values are only used when resource isolation is enabled
# ===== The default number of bytes to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES = env_integer(
"RAY_DEFAULT_DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES", (25) * (10**9)
)
# The default proportion available memory to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION = env_integer(
"RAY_DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION", 0.10
)
# The default number of cpu cores to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_CPU_CORES = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_CPU_CORES", 1.0
)
# The default proportion of cpu cores to reserve for ray system processes
DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION = env_float(
"RAY_DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION", 0.05
)
# The smallest number of cores that ray system processes can be guaranteed
MINIMUM_SYSTEM_RESERVED_CPU_CORES = 0.5
# The smallest number of bytes that ray system processes can be guaranteed
MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES = (100) * (10**6)
# The default path for cgroupv2
DEFAULT_CGROUP_PATH = "/sys/fs/cgroup"

# The smallest cap on the memory used by the object store that we allow.
# This must be greater than MEMORY_RESOURCE_UNIT_BYTES
OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
Expand Down
145 changes: 95 additions & 50 deletions python/ray/_private/resource_isolation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@


class ResourceIsolationConfig:
"""Configuration for enabling resource isolation by reserving memory
and cpu for ray system processes through cgroupv2.
This class validates configuration for resource isolation by
enforcing types, correct combinations of values, applying default values,
and sanity checking cpu and memory reservations.
Also, converts system_reserved_cpu into cpu.weights for cgroupv2.
"""Configuration for enabling resource isolation by reserving memory and cpu for ray system processes through cgroupv2.

Validates configuration for resource isolation by enforcing types, correct combinations of values, applying default values,
and sanity checking cpu and memory reservations. Also, converts system_reserved_cpu into cpu.weights for cgroupv2.

Raises:
ValueError: On invalid inputs.
Expand All @@ -34,6 +32,8 @@ class ResourceIsolationConfig:
system_reserved_memory: The amount of memory in bytes reserved
for ray system processes. Must be >= ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
and system_reserved_memory + object_store_bytes < the total memory available.

TODO(54703): Link documentation when it's available.
"""

def __init__(
Expand All @@ -47,18 +47,22 @@ def __init__(
self.cgroup_path = cgroup_path
self.system_reserved_memory = system_reserved_memory
self.system_pids = ""
# cgroupv2 cpu.weight calculated from system_reserved_cpu
# assumes ray uses all available cores.

# cgroupv2 cpu.weight calculated from system_reserved_cpu assumes ray uses all available cores.
self.system_reserved_cpu_weight: int = None
# TODO(irabbani): this is used to ensure
# that object_store_memory is not added twice
# to self._system_reserved_memory. This should
# be refactored in the future so that ResourceIsolationConfig
# can take object_store_memory as a constructor parameter
# and be constructed fully by the constructor.

# TODO(irabbani): this is used to ensure that object_store_memory is not added twice
# to self._system_reserved_memory. This should be refactored in the future so that ResourceIsolationConfig
# can take object_store_memory as a constructor parameter and be constructed fully by the constructor.
self._constructed = False

if not enable_resource_isolation:
if self.cgroup_path:
raise ValueError(
"cgroup_path cannot be set when resource isolation is not enabled. "
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
if system_reserved_cpu:
raise ValueError(
"system_reserved_cpu cannot be set when resource isolation is not enabled. "
Expand All @@ -72,45 +76,49 @@ def __init__(
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
if self.cgroup_path:
raise ValueError(
"cgroup_path cannot be set when resource isolation is not enabled. "
"Set enable_resource_isolation to True if you're using ray.init or use the "
"--enable-resource-isolation flag if you're using the ray cli."
)
return

# resource isolation is enabled
self.system_reserved_cpu_weight = self._validate_and_get_system_reserved_cpu(
system_reserved_cpu
)

self.system_reserved_memory = self._validate_and_get_system_reserved_memory(
system_reserved_memory
)

self.cgroup_path = self._validate_and_get_cgroup_path(cgroup_path)

def is_enabled(self) -> bool:
return self._resource_isolation_enabled

def add_object_store_memory(self, object_store_memory: int):
"""This is only supposed to be called once. It also cannot be
called if resouce isolation is not enabled.
def add_object_store_memory(self, object_store_memory_bytes: int):
"""Adds object_store_memory to the memory reserved for system processes.

Args:
object_store_memory_bytes: The number of bytes of object store memory to add to system_reserved_memory.
The resulting total must not be greater than the total memory available on the system.

Raises:
AssertionError: If called with resource isolation not enabled or called more than once for the same instance.
ValueError: If the input is not an integer or if the system_reserved_memory + object_store_memory is greater
than the total memory available on the system.

"""
assert self.is_enabled(), (
"Cannot add object_store_memory to system_reserved_memory when "
"enable_resource_isolation is False."
)
assert not self._constructed, (
"Cannot add object_store_memory to system_reserved_memory when"
"multiple times."
"Cannot call add_object_store_memory more than once with an instance "
"ResourceIsolationConfig. This is a bug in the ray code. "
)
self.system_reserved_memory += object_store_memory
self.system_reserved_memory += object_store_memory_bytes
available_system_memory = ray._common.utils.get_system_memory()
if self.system_reserved_memory > available_system_memory:
raise ValueError(
f"The total requested system_reserved_memory={self.system_reserved_memory}, calculated by "
" object_store_bytes + system_reserved_memory, is greater than the total memory "
f" available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes "
"object_store_bytes + system_reserved_memory, is greater than the total memory "
f"available={available_system_memory}. Pick a smaller number of bytes for object_store_bytes "
"or system_reserved_memory."
)
self._constructed = True
Expand All @@ -121,8 +129,7 @@ def add_system_pids(self, system_pids: str):

@staticmethod
def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
"""Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not
specified. Checks the type of cgroup_path.
"""Returns the ray_constants.DEFAULT_CGROUP_PATH if cgroup_path is not specified.

Args:
cgroup_path: The path for the cgroup the raylet should use to enforce
Expand Down Expand Up @@ -150,25 +157,47 @@ def _validate_and_get_cgroup_path(cgroup_path: Optional[str]) -> str:
def _validate_and_get_system_reserved_cpu(
system_reserved_cpu: Optional[float],
) -> int:
"""If system_reserved_cpu is not specified, returns the default value. Otherwise,
checks the type, makes sure that the value is in range, and converts it into cpu.weights
for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights for more information.
"""If system_reserved_cpu is specified, validates it, otherwise returns the default value.

Validation entails checking the type, ensuring that the value is in range, and converting it
into cpu.weights for cgroupv2. See https://docs.kernel.org/admin-guide/cgroup-v2.html#weights
for more information.

If system_reserved_cpu is not specified, returns a default value between
[DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES, DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES].

# TODO(54703): The errors from this method are user-facing and thus need
to be linked to the user-facing documentation once it's available.

Args:
system_reserved_cpu: The amount of cores reserved for ray system
processes. Must be >= ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES
and < the total number of cores available.

Raises:
ValueError: If system_reserved_cpu is specified, but invalid.
ValueError: If system_reserved_cpu is specified, but invalid or if the system
does not have enough available cpus.

"""
available_system_cpus = utils.get_num_cpus()

if available_system_cpus < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
raise ValueError(
f"The available number of cpu cores on this system {available_system_cpus} is less than "
f"the minimum amount that is required for ray's system processes. "
f"Pick a number of cpu cores greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
)

if not system_reserved_cpu:
system_reserved_cpu = min(
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_CORES,
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
* available_system_cpus,
system_reserved_cpu = float(
min(
max(
ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES,
ray_constants.DEFAULT_SYSTEM_RESERVED_CPU_PROPORTION
* available_system_cpus,
),
ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_CPU_CORES,
)
)

if not (
Expand All @@ -183,12 +212,12 @@ def _validate_and_get_system_reserved_cpu(

system_reserved_cpu = float(system_reserved_cpu)

if system_reserved_cpu < ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES:
if system_reserved_cpu < ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES:
raise ValueError(
f"The requested system_reserved_cpu={system_reserved_cpu} is less than "
f"the minimum number of cpus that can be used for resource isolation. "
"Pick a number of cpu cores to reserve for ray system processes "
f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_CPU_CORES}"
f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_CPU_CORES}"
)

if system_reserved_cpu > available_system_cpus:
Expand All @@ -200,8 +229,8 @@ def _validate_and_get_system_reserved_cpu(

# Converting the number of cores the user defined into cpu.weights
# This assumes that ray is allowed to use all available CPU
# cores and distribute them between system processes and
# application processes
# cores and distribute them between system, worker and
# user processes
return int(
(system_reserved_cpu / float(available_system_cpus))
* _CGROUP_CPU_MAX_WEIGHT
Expand All @@ -227,28 +256,44 @@ def _validate_and_get_system_reserved_memory(
"""
available_system_memory = ray._common.utils.get_system_memory()

if (
available_system_memory
< ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
):
raise ValueError(
f"The available memory on this system {available_system_memory} is less than "
f"the minimum amount that is required for ray's system processes. "
f"Pick a number of bytes greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
)

if not system_reserved_memory:
system_reserved_memory = int(
min(
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_BYTES,
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
* available_system_memory,
max(
ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES,
ray_constants.DEFAULT_SYSTEM_RESERVED_MEMORY_PROPORTION
* available_system_memory,
),
ray_constants.DEFAULT_MAX_SYSTEM_RESERVED_MEMORY_BYTES,
)
)

if not isinstance(system_reserved_memory, int):
raise ValueError(
f"Invalid value={system_reserved_memory} for system_reserved_memory. "
f"Invalid value {system_reserved_memory} for system_reserved_memory. "
"Use an integer to represent the number bytes that need to be reserved for "
"ray system processes to enable resource isolation."
)

if system_reserved_memory < ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES:
if (
system_reserved_memory
< ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES
):
raise ValueError(
f"The requested system_reserved_memory={system_reserved_memory} is less than "
f"The requested system_reserved_memory {system_reserved_memory} is less than "
f"the minimum number of bytes that can be used for resource isolation. "
"Pick a number of bytes to reserve for ray system processes "
f"greater than or equal to {ray_constants.MINIMUM_SYSTEM_RESERVED_MEMORY_BYTES}"
f"greater than or equal to {ray_constants.DEFAULT_MIN_SYSTEM_RESERVED_MEMORY_BYTES}"
)

if system_reserved_memory > available_system_memory:
Expand Down
Loading