Skip to content

Commit 9469fe8

Browse files
authored
RE-Implemented NVIDIA Energy capture via C (#1167)
* RE-Implemented NVIDIA Energy capture via C * Name change from NVIDIA SMI to NVML [skip ci] * Makefile cleanup * Adding NVIDIA Headers download to install * Directory rename * Changed resolution to sampling_rate [skip ci] * Fixing installer --nvidia-gpu option * Installing libs for fedora for --nvidia-gpu * Including fedora libs. Should not harm Ubuntu target * Nvidia lm was duplicate; nvmlShutdown added when checking card [skip ci]
1 parent 970a320 commit 9469fe8

File tree

10 files changed

+201
-40
lines changed

10 files changed

+201
-40
lines changed

.github/actions/gmt-pytest/action.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ runs:
4747
if ${{inputs.ee}}; then
4848
if [[ "${{inputs.ee-branch}}" != '' ]]; then
4949
echo "Using ee-branch ${{inputs.ee-branch}}"
50-
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}}
50+
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}} --nvidia-gpu
5151
else
52-
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test
52+
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --nvidia-gpu
5353
fi
5454
else
55-
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j
55+
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j --nvidia-gpu
5656
fi
5757
source venv/bin/activate
5858
env:

config.yml.example

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ measurement:
124124
# psu.energy.dc.rapl.msr.machine.provider.PsuEnergyDcRaplMsrMachineProvider:
125125
# sampling_rate: 99
126126
#--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
127-
# gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
127+
# gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
128128
# sampling_rate: 99
129129
#--- Sensors - these providers need the lm-sensors package installed
130130
# lmsensors.temperature.component.provider.LmsensorsTemperatureComponentProvider:

frontend/js/helpers/config.js.example

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -411,15 +411,15 @@ METRIC_MAPPINGS = {
411411
"source": "formula",
412412
"explanation": "Network total data traffic for the whole accumulated from procfs data"
413413
},
414-
"gpu_energy_nvidia_smi_component": {
414+
"gpu_energy_nvidia_nvml_component": {
415415
"clean_name": "GPU Energy",
416-
"source": "NVIDIA SMI",
417-
"explanation": "Derived NVIDIA SMI based GPU energy"
416+
"source": "NVIDIA NVML",
417+
"explanation": "Derived NVIDIA NVML based GPU energy"
418418
},
419-
"gpu_power_nvidia_smi_component": {
419+
"gpu_power_nvidia_nvml_component": {
420420
"clean_name": "GPU Power",
421-
"source": "NVIDIA SMI",
422-
"explanation": "NVIDIA SMI based GPU power"
421+
"source": "NVIDIA NVML",
422+
"explanation": "NVIDIA NVML based GPU power"
423423
},
424424
"cpu_energy_rapl_msr_component": {
425425
"clean_name": "CPU Energy (Package)",

install_linux.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,26 @@ if [[ $activate_scenario_runner == true ]] ; then
4242
sudo systemctl stop tinyproxy
4343
sudo systemctl disable tinyproxy
4444

45+
# Optionally install the NVML headers and library that the NVIDIA metric
# provider is compiled against. Enabled through the --nvidia-gpu flag, which
# sets install_nvidia_toolkit_headers to true.
if [[ $install_nvidia_toolkit_headers == true ]] ; then
    print_message "Installing nvidia toolkit headers"
    if lsb_release -is | grep -q "Fedora"; then
        # Resolve the release number once instead of on every use
        nvidia_fedora_release=$(rpm -E %fedora)
        nvidia_repo_file="cuda-fedora${nvidia_fedora_release}.repo"

        # -f makes curl fail on HTTP errors instead of saving the error page,
        # which would otherwise end up as a broken file in /etc/yum.repos.d/
        if ! curl -f -O "https://developer.download.nvidia.com/compute/cuda/repos/fedora${nvidia_fedora_release}/x86_64/${nvidia_repo_file}"; then
            print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
            exit 1
        fi
        sudo mv "${nvidia_repo_file}" /etc/yum.repos.d/
        sudo dnf makecache

        if ! sudo dnf -y install libnvidia-ml cuda-nvml-devel-12-9; then
            print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
            exit 1
        fi

        # The linker looks for the unversioned name. -f keeps re-runs of the
        # installer from failing when the link already exists.
        sudo ln -sf /usr/lib64/libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so
    else
        if ! sudo apt-get install -y libnvidia-ml-dev; then
            print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
            exit 1
        fi
    fi
fi
64+
4565
print_message "Building C libs"
4666
make -C "lib/c"
4767

lib/install_shared.sh

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ cert_file=''
3636
enterprise=false
3737
ask_ping=true
3838
force_send_ping=false
39+
install_nvidia_toolkit_headers=false
3940
ee_branch=''
4041

4142
function print_message {
@@ -329,6 +330,9 @@ function build_binaries() {
329330
if [[ "$make_path" == *"/lmsensors/"* ]] && [[ "${install_sensors}" == false ]]; then
330331
continue
331332
fi
333+
if [[ "$make_path" == *"/nvidia/"* ]] && [[ "${install_nvidia_toolkit_headers}" == false ]]; then
334+
continue
335+
fi
332336
echo "Installing $subdir/metric-provider-binary ..."
333337
rm -f $subdir/metric-provider-binary 2> /dev/null
334338
make -C $subdir
@@ -407,18 +411,20 @@ check_python_version
407411

408412
while [[ $# -gt 0 ]]; do
409413
case "$1" in
414+
--nvidia-gpu)
415+
install_nvidia_toolkit_headers=true
416+
shift
417+
;;
410418
--ai) # This is not documented in the help, as it is only for GCS internal use
411419
ask_ai_optimisations=false
412420
activate_ai_optimisations=true
413421
shift
414422
;;
415-
416423
--no-ai) # This is not documented in the help, as it is only for GCS internal use
417424
ask_ai_optimisations=false
418425
activate_ai_optimisations=false
419426
shift
420427
;;
421-
422428
--ee-branch) # This is not documented in the help, as it is only for GCS internal use
423429
check_optarg 'ee-branch' "${2:-}"
424430
ee_branch="$2"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
CFLAGS = -O3 -Wall -Werror -I../../../../../../lib/c -I/usr/local/cuda-12.9/targets/x86_64-linux/include
2+
LDFLAGS = -L../../../../../../lib/c -lnvidia-ml -lc
3+
4+
metric-provider-binary: source.c
5+
gcc $(CFLAGS) ../../../../../../lib/c/gmt-lib.o $< $(LDFLAGS) -o $@
File renamed without changes.

metric_providers/gpu/energy/nvidia/smi/component/provider.py renamed to metric_providers/gpu/energy/nvidia/nvml/component/provider.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,20 @@
22

33
from metric_providers.base import BaseMetricProvider
44

5-
class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
5+
class GpuEnergyNvidiaNvmlComponentProvider(BaseMetricProvider):
66
def __init__(self, sampling_rate, skip_check=False):
77
super().__init__(
8-
metric_name='gpu_energy_nvidia_smi_component',
9-
metrics={'time': int, 'value': int},
8+
metric_name='gpu_energy_nvidia_nvml_component',
9+
metrics={'time': int, 'value': int, 'card_model': str},
1010
sampling_rate=sampling_rate,
1111
unit='uJ',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13-
metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
1413
skip_check=skip_check,
1514
)
1615

17-
18-
def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
19-
super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")
20-
2116
def _parse_metrics(self, df):
2217
df = super()._parse_metrics(df) # sets detail_name
18+
df['detail_name'] = df.card_model
2319

2420
'''
2521
Conversion to Joules
@@ -29,7 +25,7 @@ def _parse_metrics(self, df):
2925
WITH times as (
3026
SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
3127
FROM measurements
32-
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
28+
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_nvml_component'
3329
3430
ORDER BY detail_name ASC, time ASC)
3531
SELECT *, value / (diff / 1000) as power FROM times;
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <errno.h>
4+
#include <unistd.h>
5+
#include <sys/time.h>
6+
#include <time.h>
7+
#include <string.h> // for strtok
8+
#include <getopt.h>
9+
#include <limits.h>
10+
#include <stdbool.h>
11+
#include <nvml.h>
12+
#include "gmt-lib.h"
13+
14+
15+
// All variables are made static, because we believe that this will
16+
// keep them local in scope to the file and not make them persist in state
17+
// between Threads.
18+
// in any case, none of these variables should change between threads
19+
static unsigned int msleep_time=1000;
20+
static struct timespec offset;
21+
22+
static void output_stats() {
23+
struct timeval now;
24+
nvmlReturn_t result;
25+
unsigned int device_count;
26+
nvmlDevice_t device;
27+
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
28+
// nvmlUtilization_t utilization;
29+
// nvmlMemory_t memory;
30+
unsigned int power_usage;
31+
// unsigned int power_limit;
32+
33+
result = nvmlInit();
34+
if (result != NVML_SUCCESS) {
35+
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
36+
exit(1);
37+
}
38+
39+
result = nvmlDeviceGetCount(&device_count);
40+
if (result != NVML_SUCCESS) {
41+
fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
42+
nvmlShutdown();
43+
exit(1);
44+
}
45+
46+
while (1) {
47+
get_adjusted_time(&now, &offset);
48+
49+
for (unsigned int i = 0; i < device_count; i++) {
50+
51+
nvmlDeviceGetHandleByIndex(i, &device);
52+
nvmlDeviceGetName(device, name, sizeof(name));
53+
// printf("GPU %u: %s\n", i, name);
54+
55+
// nvmlDeviceGetUtilizationRates(device, &utilization);
56+
// printf(" Utilization: %u%%\n", utilization.gpu);
57+
58+
// nvmlDeviceGetMemoryInfo(device, &memory);
59+
// printf(" Memory: %llu MiB / %llu MiB\n", memory.used / 1024 / 1024, memory.total / 1024 / 1024);
60+
61+
// nvmlDeviceGetEnforcedPowerLimit(device, &power_limit); // mW
62+
63+
nvmlDeviceGetPowerUsage(device, &power_usage); // mW
64+
printf("%ld%06ld %u \"%s-%u\"\n", now.tv_sec, now.tv_usec, power_usage, name, i);
65+
66+
}
67+
usleep(msleep_time*1000);
68+
}
69+
70+
71+
}
72+
73+
74+
static int check_system() {
75+
nvmlReturn_t result;
76+
nvmlDevice_t device;
77+
unsigned int power_usage;
78+
unsigned int device_count;
79+
80+
result = nvmlInit();
81+
if (result != NVML_SUCCESS) {
82+
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
83+
return 1;
84+
}
85+
86+
result = nvmlDeviceGetCount(&device_count);
87+
if (result != NVML_SUCCESS) {
88+
fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
89+
nvmlShutdown();
90+
exit(1);
91+
}
92+
93+
if (device_count <= 0) {
94+
fprintf(stderr, "No NVIDIA cards found\n");
95+
nvmlShutdown();
96+
exit(1);
97+
98+
}
99+
100+
nvmlDeviceGetHandleByIndex(0, &device);
101+
nvmlDeviceGetPowerUsage(device, &power_usage);
102+
nvmlShutdown();
103+
104+
return 0;
105+
}
106+
107+
int main(int argc, char **argv) {
108+
109+
int c;
110+
bool check_system_flag = false;
111+
112+
static struct option long_options[] =
113+
{
114+
{"help", no_argument, NULL, 'h'},
115+
{"interval", no_argument, NULL, 'i'},
116+
{"check", no_argument, NULL, 'c'},
117+
{NULL, 0, NULL, 0}
118+
};
119+
120+
while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
121+
switch (c) {
122+
case 'h':
123+
printf("Usage: %s [-i msleep_time] [-h]\n\n",argv[0]);
124+
printf("\t-h : displays this help\n");
125+
printf("\t-i : specifies the milliseconds sleep time that will be slept between measurements\n");
126+
printf("\t-c : check system and exit\n");
127+
printf("\n");
128+
129+
exit(0);
130+
case 'i':
131+
msleep_time = parse_int(optarg);
132+
break;
133+
case 'c':
134+
check_system_flag = true;
135+
break;
136+
default:
137+
fprintf(stderr,"Unknown option %c\n",c);
138+
exit(-1);
139+
}
140+
}
141+
142+
if(check_system_flag){
143+
exit(check_system());
144+
}
145+
146+
get_time_offset(&offset);
147+
148+
output_stats();
149+
150+
nvmlShutdown();
151+
152+
return 0;
153+
}

metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh

Lines changed: 0 additions & 19 deletions
This file was deleted.

0 commit comments

Comments
 (0)