-
Notifications
You must be signed in to change notification settings - Fork 613
Add fix for devices that do not have memory resources #6823
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
a54c313
2dc6c17
b4f2c85
1f8b2c1
7f9c957
3c49669
a0230fd
965f0d4
114f3dd
507fbfe
c66fb47
711c6e9
e70314b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,6 @@ | |
| # | ||
|
|
||
| import os | ||
| import subprocess | ||
| from datetime import timedelta | ||
| from math import ceil | ||
| from ssl import create_default_context | ||
|
|
@@ -27,6 +26,7 @@ | |
| import hypothesis | ||
| import numpy as np | ||
| import pandas as pd | ||
| import pynvml | ||
| import pytest | ||
| from sklearn import datasets | ||
| from sklearn.datasets import fetch_20newsgroups, fetch_california_housing | ||
|
|
@@ -275,20 +275,67 @@ def pytest_pyfunc_call(pyfuncitem): | |
| pytest.skip("Test requires cudf.pandas accelerator") | ||
|
|
||
|
|
||
def _get_gpu_memory():
    """Return the largest total GPU memory among installed GPUs, in GB.

    Queries ``nvidia-smi`` for the total memory of every GPU and returns
    the maximum, rounded up to whole GB.

    Returns
    -------
    int
        Total memory of the largest GPU in GB (rounded up).

    Raises
    ------
    RuntimeError
        If no GPU memory values could be parsed from the ``nvidia-smi``
        output (e.g. no GPUs are visible).
    subprocess.CalledProcessError
        If ``nvidia-smi`` exits with a non-zero status.
    FileNotFoundError
        If ``nvidia-smi`` is not on ``PATH``.
    """
    # Pass an argument list with the default shell=False: no shell string
    # parsing, no injection surface.
    output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.total", "--format=csv"]
    ).decode("utf-8")
    # First line is the CSV header ("memory.total [MiB]"); skip it.
    gpus_memory = []
    for line in output.splitlines()[1:]:
        tokens = line.split(" ")
        # Data rows look like "16280 MiB"; anything shorter is blank/noise.
        if len(tokens) > 1:
            gpus_memory.append(int(tokens[0]))
    if not gpus_memory:
        # Previously this fell through to an opaque IndexError on an empty
        # list; fail with an explicit message instead.
        raise RuntimeError("Could not parse GPU memory from nvidia-smi output")
    # ceil so that e.g. 16280 MiB reports as 16 GB rather than 15.
    return ceil(max(gpus_memory) / 1024)
def get_gpu_handle(device_id=0):
    """Get GPU handle from device index or UUID.

    Parameters
    ----------
    device_id: int or str
        The index or UUID of the device from which to obtain the handle.
        A numeric value (or numeric string) is treated as a device index;
        any other string is treated as a device UUID.

    Returns
    -------
    The NVML handle for the requested device. For a MIG instance UUID,
    the handle of the parent physical device is returned.

    Raises
    ------
    ValueError
        If acquiring the device handle for the device specified failed.
    pynvml.NVMLError
        If any NVML error occurred while initializing.

    Examples
    --------
    >>> get_gpu_handle(device_id=0)

    >>> get_gpu_handle(device_id="GPU-9fb42d6f-7d6b-368f-f79c-3c3e784c93f6")
    """
    pynvml.nvmlInit()

    try:
        if device_id and not str(device_id).isnumeric():
            # A non-numeric id is a UUID.
            # This works for both MIG and non-MIG device UUIDs.
            handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(device_id))
            if pynvml.nvmlDeviceIsMigDeviceHandle(handle):
                # Additionally get the parent device handle
                # if the device itself is a MIG instance.
                handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(
                    handle
                )
        else:
            # Coerce to int so numeric strings (e.g. "3") are accepted;
            # nvmlDeviceGetHandleByIndex expects an integer index.
            handle = pynvml.nvmlDeviceGetHandleByIndex(int(device_id))
        return handle
    except pynvml.NVMLError as err:
        # Chain the NVML error so the root cause stays in the traceback.
        raise ValueError(f"Invalid device index or UUID: {device_id}") from err
|
Comment on lines
+305
to
+320
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems fairly complicated for what appears to be a rather basic function. Is this really the recommended approach for this?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For general support, yes, but I presume this is for CI only so not necessarily all is required. However, this is a verbatim copy from Dask-CUDA, which is probably the only place this function is tested, so I think it makes sense to have a verbatim copy here as it will be less headache for you. In the long-term, I'd like to have those functions in some shared package so that all RAPIDS projects can piggyback instead of copying verbatim. I've been pushing on that for 2 years but it has been really hard to convince our management of its value, perhaps now that we have similar functions copied in like 50 different places its value will finally become obvious. @quasiben |
||
|
|
||
|
|
||
def _get_gpu_memory(device_index=0):
    """Return total memory of CUDA device with index or with device identifier UUID.

    Parameters
    ----------
    device_index: int or str
        The index or UUID of the device whose total memory is queried.

    Returns
    -------
    The total memory of the CUDA Device in GB, or ``None`` for devices that do not
    have a dedicated memory resource, as is usually the case for system on a chip (SoC)
    devices.
    """
    handle = get_gpu_handle(device_index)

    try:
        # nvmlDeviceGetMemoryInfo reports bytes; round up to whole GiB.
        return ceil(pynvml.nvmlDeviceGetMemoryInfo(handle).total / 2**30)
    except pynvml.NVMLError_NotSupported:
        # Device exposes no dedicated memory resource (typically SoC
        # devices, per the contract above) — report None rather than fail.
        return None
|
|
||
|
|
||
| # ============================================================================= | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.