Skip to content

Commit 305182e

Browse files
Authored by: galipremsagar, vyasr, Matt711, wence-
Enable unified memory by default in cudf_polars (#17375)
This PR enables Unified memory as the default memory resource for `cudf_polars` --------- Co-authored-by: Vyas Ramasubramani <[email protected]> Co-authored-by: Vyas Ramasubramani <[email protected]> Co-authored-by: Matthew Murray <[email protected]> Co-authored-by: Lawrence Mitchell <[email protected]> Co-authored-by: Matthew Murray <[email protected]>
1 parent f54c1a5 commit 305182e

File tree

4 files changed

+84
-5
lines changed

4 files changed

+84
-5
lines changed

docs/cudf/source/cudf_polars/engine_options.md

+7
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,10 @@ engine = GPUEngine(
2323
result = query.collect(engine=engine)
2424
```
2525
Note that passing `chunked: False` disables chunked reading entirely, and thus `chunk_read_limit` and `pass_read_limit` will have no effect.
26+
27+
## Disabling CUDA Managed Memory
28+
29+
By default, `cudf_polars` uses [CUDA managed memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#unified-memory-introduction) with RMM's pool allocator. On systems that don't support managed memory, a non-managed asynchronous pool
30+
allocator is used.
31+
Managed memory can be turned off by setting `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` to `0`. System requirements for managed memory can be found [here](
32+
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory).

docs/cudf/source/cudf_polars/index.rst

+6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ and run on the CPU.
99

1010
Benchmark
1111
---------
12+
13+
.. note::
14+
The following benchmarks were performed with `POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY` environment variable set to `"0"`.
15+
Using managed memory (the default) imposes a performance cost in order to avoid out of memory errors.
16+
Peak performance can still be attained by setting the environment variable to `"0"`.
17+
1218
We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:
1319

1420
.. figure:: ../_static/pds_benchmark_polars.png

python/cudf_polars/cudf_polars/callback.py

+51-5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from polars.exceptions import ComputeError, PerformanceWarning
1717

18+
import pylibcudf
1819
import rmm
1920
from rmm._cuda import gpu
2021

@@ -32,8 +33,26 @@
3233
__all__: list[str] = ["execute_with_cudf"]
3334

3435

36+
_SUPPORTED_PREFETCHES = {
37+
"column_view::get_data",
38+
"mutable_column_view::get_data",
39+
"gather",
40+
"hash_join",
41+
}
42+
43+
44+
def _env_get_int(name, default):
45+
try:
46+
return int(os.getenv(name, default))
47+
except (ValueError, TypeError): # pragma: no cover
48+
return default # pragma: no cover
49+
50+
3551
@cache
36-
def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
52+
def default_memory_resource(
53+
device: int,
54+
cuda_managed_memory: bool, # noqa: FBT001
55+
) -> rmm.mr.DeviceMemoryResource:
3756
"""
3857
Return the default memory resource for cudf-polars.
3958
@@ -42,15 +61,35 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
4261
device
4362
Disambiguating device id when selecting the device. Must be
4463
the active device when this function is called.
64+
cuda_managed_memory
65+
Whether to use managed memory or not.
4566
4667
Returns
4768
-------
4869
rmm.mr.DeviceMemoryResource
4970
The default memory resource that cudf-polars uses. Currently
50-
an async pool resource.
71+
a managed memory resource, if `cuda_managed_memory` is `True`.
72+
else, an async pool resource is returned.
5173
"""
5274
try:
53-
return rmm.mr.CudaAsyncMemoryResource()
75+
if (
76+
cuda_managed_memory
77+
and pylibcudf.utils._is_concurrent_managed_access_supported()
78+
):
79+
# Allocating 80% of the available memory for the pool.
80+
# Leaving a 20% headroom to avoid OOM errors.
81+
free_memory, _ = rmm.mr.available_device_memory()
82+
free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
83+
for key in _SUPPORTED_PREFETCHES:
84+
pylibcudf.experimental.enable_prefetching(key)
85+
mr = rmm.mr.PrefetchResourceAdaptor(
86+
rmm.mr.PoolMemoryResource(
87+
rmm.mr.ManagedMemoryResource(),
88+
initial_pool_size=free_memory,
89+
)
90+
)
91+
else:
92+
mr = rmm.mr.CudaAsyncMemoryResource()
5493
except RuntimeError as e: # pragma: no cover
5594
msg, *_ = e.args
5695
if (
@@ -64,6 +103,8 @@ def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource:
64103
) from None
65104
else:
66105
raise
106+
else:
107+
return mr
67108

68109

69110
@contextlib.contextmanager
@@ -89,10 +130,15 @@ def set_memory_resource(
89130
at entry. If a memory resource is provided, it must be valid to
90131
use with the currently active device.
91132
"""
133+
previous = rmm.mr.get_current_device_resource()
92134
if mr is None:
93135
device: int = gpu.getDevice()
94-
mr = default_memory_resource(device)
95-
previous = rmm.mr.get_current_device_resource()
136+
mr = default_memory_resource(
137+
device=device,
138+
cuda_managed_memory=bool(
139+
_env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
140+
),
141+
)
96142
rmm.mr.set_current_device_resource(mr)
97143
try:
98144
yield mr

python/cudf_polars/tests/test_config.py

+20
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import rmm
1212

13+
from cudf_polars.callback import default_memory_resource
1314
from cudf_polars.dsl.ir import DataFrameScan
1415
from cudf_polars.testing.asserts import (
1516
assert_gpu_result_equal,
@@ -58,6 +59,25 @@ def test_invalid_memory_resource_raises(mr):
5859
q.collect(engine=pl.GPUEngine(memory_resource=mr))
5960

6061

62+
# Renamed the parametrize id from "disable_managed_memory" to
# "enable_managed_memory": the value is written verbatim into
# POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY, where "1" ENABLES managed
# memory — the old name had inverted semantics.
@pytest.mark.parametrize("enable_managed_memory", ["1", "0"])
def test_cudf_polars_enable_disable_managed_memory(monkeypatch, enable_managed_memory):
    """The default memory resource honors POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY."""
    q = pl.LazyFrame({"a": [1, 2, 3]})

    with monkeypatch.context() as monkeycontext:
        monkeycontext.setenv(
            "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", enable_managed_memory
        )
        result = q.collect(engine=pl.GPUEngine())
        mr = default_memory_resource(0, bool(enable_managed_memory == "1"))
        if enable_managed_memory == "1":
            # Managed memory: a prefetching adaptor wrapping a pool.
            assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor)
            assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource)
        else:
            # Managed memory off: plain async pool resource.
            assert isinstance(mr, rmm.mr.CudaAsyncMemoryResource)
        monkeycontext.delenv("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY")
    assert_frame_equal(q.collect(), result)
79+
80+
6181
def test_explicit_device_zero():
6282
q = pl.LazyFrame({"a": [1, 2, 3]})
6383

0 commit comments

Comments
 (0)