RE-Implemented NVIDIA Energy capture via C (#1167)

ArneTR · web-flow · commit 9469fe88c2ae · 2025-05-24T12:48:58.000+02:00
* RE-Implemented NVIDIA Energy capture via C

* Name change from NVIDIA SMI to NVML [skip ci]

* Makefile cleanup

* Adding NVIDIA Headers download to install

* Directory rename

* Changed resolution to sampling_rate [skip ci]

* Fixing installer --nvidia-gpu option

* Installing libs for fedora for --nvidia-gpu

* Including fedora libs. Should not harm Ubuntu target

* Nvidia lm was duplicate; nvmlShutdown added when checking card [skip ci]
diff --git a/.github/actions/gmt-pytest/action.yml b/.github/actions/gmt-pytest/action.yml
@@ -47,12 +47,12 @@ runs:
         if ${{inputs.ee}}; then
           if [[ "${{inputs.ee-branch}}" != '' ]]; then
             echo "Using ee-branch ${{inputs.ee-branch}}"
-            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}}
+            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}} --nvidia-gpu
           else
-            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test
+            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --nvidia-gpu
           fi
         else
-          ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j
+          ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j --nvidia-gpu
         fi
         source venv/bin/activate
       env:
diff --git a/config.yml.example b/config.yml.example
@@ -124,7 +124,7 @@ measurement:
 #      psu.energy.dc.rapl.msr.machine.provider.PsuEnergyDcRaplMsrMachineProvider:
 #        sampling_rate: 99
     #--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
-#      gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
+#      gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
 #        sampling_rate: 99
     #--- Sensors - these providers need the lm-sensors package installed
 #      lmsensors.temperature.component.provider.LmsensorsTemperatureComponentProvider:
diff --git a/frontend/js/helpers/config.js.example b/frontend/js/helpers/config.js.example
@@ -411,15 +411,15 @@ METRIC_MAPPINGS = {
         "source": "formula",
         "explanation": "Network total data traffic for the whole accumulated from procfs data"
     },
-    "gpu_energy_nvidia_smi_component": {
+    "gpu_energy_nvidia_nvml_component": {
         "clean_name": "GPU Energy",
-        "source": "NVIDIA SMI",
-        "explanation": "Derived NVIDIA SMI based GPU energy"
+        "source": "NVIDIA NVML",
+        "explanation": "Derived NVIDIA NVML based GPU energy"
     },
-    "gpu_power_nvidia_smi_component": {
+    "gpu_power_nvidia_nvml_component": {
         "clean_name": "GPU Power",
-        "source": "NVIDIA SMI",
-        "explanation": "NVIDIA SMI based GPU power"
+        "source": "NVIDIA NVML",
+        "explanation": "NVIDIA NVML based GPU power"
     },
     "cpu_energy_rapl_msr_component": {
         "clean_name": "CPU Energy (Package)",
diff --git a/install_linux.sh b/install_linux.sh
@@ -42,6 +42,26 @@ if [[ $activate_scenario_runner == true ]] ; then
     sudo systemctl stop tinyproxy
     sudo systemctl disable tinyproxy
 
+    # Optional step (enabled by --nvidia-gpu): install the NVML headers and
+    # library needed to compile the NVIDIA NVML metric provider.
+    if [[ $install_nvidia_toolkit_headers == true ]] ; then
+        print_message "Installing nvidia toolkit headers"
+        if lsb_release -is | grep -q "Fedora"; then
+            # Resolve the Fedora release once instead of on every use
+            nvidia_fedora_version="$(rpm -E %fedora)"
+            nvidia_repo_file="cuda-fedora${nvidia_fedora_version}.repo"
+            # --fail: do not store an HTTP error page as the repo definition
+            if ! curl --fail -O "https://developer.download.nvidia.com/compute/cuda/repos/fedora${nvidia_fedora_version}/x86_64/${nvidia_repo_file}"; then
+                print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
+                exit 1
+            fi
+            sudo mv "${nvidia_repo_file}" /etc/yum.repos.d/
+            sudo dnf makecache
+            if ! sudo dnf -y install libnvidia-ml cuda-nvml-devel-12-9; then
+                print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
+                exit 1
+            else
+                # -f keeps the installer re-runnable when the link already exists
+                sudo ln -sf /usr/lib64/libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so
+            fi
+        else
+            if ! sudo apt-get install -y libnvidia-ml-dev; then
+                print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
+                exit 1
+            fi
+        fi
+    fi
+
     print_message "Building C libs"
     make -C "lib/c"
 
diff --git a/lib/install_shared.sh b/lib/install_shared.sh
@@ -36,6 +36,7 @@ cert_file=''
 enterprise=false
 ask_ping=true
 force_send_ping=false
+install_nvidia_toolkit_headers=false
 ee_branch=''
 
 function print_message {
@@ -329,6 +330,9 @@ function build_binaries() {
             if [[ "$make_path" == *"/lmsensors/"* ]] && [[ "${install_sensors}" == false ]]; then
                 continue
             fi
+            if [[ "$make_path" == *"/nvidia/"* ]] && [[ "${install_nvidia_toolkit_headers}" == false ]]; then
+                continue
+            fi
             echo "Installing $subdir/metric-provider-binary ..."
             rm -f $subdir/metric-provider-binary 2> /dev/null
             make -C $subdir
@@ -407,18 +411,20 @@ check_python_version
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
+        --nvidia-gpu)
+            install_nvidia_toolkit_headers=true
+            shift
+            ;;
         --ai) # This is not documented in the help, as it is only for GCS internal use
             ask_ai_optimisations=false
             activate_ai_optimisations=true
             shift
             ;;
-
         --no-ai) # This is not documented in the help, as it is only for GCS internal use
             ask_ai_optimisations=false
             activate_ai_optimisations=false
             shift
             ;;
-
         --ee-branch) # This is not documented in the help, as it is only for GCS internal use
             check_optarg 'ee-branch' "${2:-}"
             ee_branch="$2"
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/Makefile b/metric_providers/gpu/energy/nvidia/nvml/component/Makefile
@@ -0,0 +1,5 @@
+# Builds metric-provider-binary from source.c, linking the shared gmt-lib
+# object and the NVIDIA management library (libnvidia-ml).
+GMT_LIB_DIR = ../../../../../../lib/c
+CFLAGS  = -O3 -Wall -Werror -I$(GMT_LIB_DIR) -I/usr/local/cuda-12.9/targets/x86_64-linux/include
+LDFLAGS = -L$(GMT_LIB_DIR) -lnvidia-ml -lc
+
+metric-provider-binary: source.c
+	gcc $(CFLAGS) $(GMT_LIB_DIR)/gmt-lib.o $< $(LDFLAGS) -o $@
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/README.md b/metric_providers/gpu/energy/nvidia/nvml/component/README.md
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/provider.py b/metric_providers/gpu/energy/nvidia/nvml/component/provider.py
@@ -2,24 +2,20 @@
 
 from metric_providers.base import BaseMetricProvider
 
-class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
+class GpuEnergyNvidiaNvmlComponentProvider(BaseMetricProvider):
     def __init__(self, sampling_rate, skip_check=False):
         super().__init__(
-            metric_name='gpu_energy_nvidia_smi_component',
-            metrics={'time': int, 'value': int},
+            metric_name='gpu_energy_nvidia_nvml_component',
+            metrics={'time': int, 'value': int, 'card_model': str},
             sampling_rate=sampling_rate,
             unit='uJ',
             current_dir=os.path.dirname(os.path.abspath(__file__)),
-            metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
             skip_check=skip_check,
         )
 
-
-    def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
-        super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")
-
     def _parse_metrics(self, df):
         df = super()._parse_metrics(df) # sets detail_name
+        df['detail_name'] = df.card_model
 
         '''
         Conversion to Joules
@@ -29,7 +25,7 @@ def _parse_metrics(self, df):
         WITH times as (
                     SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
                     FROM measurements
-                    WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
+                    WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_nvml_component'
 
                     ORDER BY detail_name ASC, time ASC)
                     SELECT *, value / (diff / 1000) as power FROM times;
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/source.c b/metric_providers/gpu/energy/nvidia/nvml/component/source.c
@@ -0,0 +1,153 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+#include <string.h> // for strtok
+#include <getopt.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <nvml.h>
+#include "gmt-lib.h"
+
+
+// All variables are declared static, which gives them internal linkage:
+// they are visible only within this file. Note that static does not make
+// them thread-local; that is acceptable here because they are set during
+// startup in main() and are not expected to change afterwards.
+static unsigned int msleep_time=1000;
+static struct timespec offset;
+
+/*
+ * Initializes NVML and then samples forever. Once per interval one line per
+ * GPU is written to stdout in the form
+ *     <seconds><microseconds, 6 digits> <power usage in mW> "<device name>-<index>"
+ *
+ * Any NVML failure is reported on stderr and terminates the process with
+ * exit code 1, so that uninitialized readings are never emitted. The function
+ * therefore does not return.
+ */
+static void output_stats() {
+    struct timeval now;
+    nvmlReturn_t result;
+    unsigned int device_count;
+    nvmlDevice_t device;
+    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+    unsigned int power_usage;
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
+        exit(1);
+    }
+
+    result = nvmlDeviceGetCount(&device_count);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        exit(1);
+    }
+
+    while (1) {
+        // One timestamp per sampling round, shared by all devices
+        get_adjusted_time(&now, &offset);
+
+        for (unsigned int i = 0; i < device_count; i++) {
+
+            result = nvmlDeviceGetHandleByIndex(i, &device);
+            if (result != NVML_SUCCESS) {
+                fprintf(stderr, "Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
+                nvmlShutdown();
+                exit(1);
+            }
+
+            result = nvmlDeviceGetName(device, name, sizeof(name));
+            if (result != NVML_SUCCESS) {
+                fprintf(stderr, "Failed to get name for device %u: %s\n", i, nvmlErrorString(result));
+                nvmlShutdown();
+                exit(1);
+            }
+
+            result = nvmlDeviceGetPowerUsage(device, &power_usage); // mW
+            if (result != NVML_SUCCESS) {
+                fprintf(stderr, "Failed to get power usage for device %u: %s\n", i, nvmlErrorString(result));
+                nvmlShutdown();
+                exit(1);
+            }
+
+            // Casts keep the format specifiers correct where time_t / suseconds_t are not long
+            printf("%ld%06ld %u \"%s-%u\"\n", (long)now.tv_sec, (long)now.tv_usec, power_usage, name, i);
+
+        }
+        // stdout may be block-buffered when it is not a terminal; flush so
+        // that each round of samples becomes visible to the reader promptly
+        fflush(stdout);
+        usleep(msleep_time*1000);
+    }
+}
+
+
+/*
+ * Verifies that this provider can work on the current machine: NVML must
+ * initialize, at least one device must be present, and a power reading must
+ * be obtainable from device 0.
+ *
+ * Returns 0 on success and 1 on any failure (after printing the reason to
+ * stderr). NVML is shut down again before returning on every path on which
+ * it was initialized.
+ */
+static int check_system() {
+    nvmlReturn_t result;
+    nvmlDevice_t device;
+    unsigned int power_usage;
+    unsigned int device_count;
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
+        return 1;
+    }
+
+    result = nvmlDeviceGetCount(&device_count);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        return 1;
+    }
+
+    if (device_count == 0) {
+        fprintf(stderr, "No NVIDIA cards found\n");
+        nvmlShutdown();
+        return 1;
+    }
+
+    result = nvmlDeviceGetHandleByIndex(0, &device);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get handle for device 0: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        return 1;
+    }
+
+    // The actual purpose of the check: can a power value be read at all?
+    result = nvmlDeviceGetPowerUsage(device, &power_usage);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get power usage for device 0: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        return 1;
+    }
+
+    nvmlShutdown();
+
+    return 0;
+}
+
+/*
+ * Entry point. Parses the command line, then either runs the system check
+ * (-c / --check) and exits with its result, or starts the sampling loop.
+ *
+ * Options:
+ *   -h, --help            print usage and exit with 0
+ *   -i, --interval <ms>   milliseconds to sleep between measurements
+ *   -c, --check           check system and exit
+ */
+int main(int argc, char **argv) {
+
+    int c;
+    bool check_system_flag = false;
+
+    static struct option long_options[] =
+    {
+        {"help", no_argument, NULL, 'h'},
+        // Must take an argument like the short form "i:", otherwise
+        // optarg is NULL when --interval is used
+        {"interval", required_argument, NULL, 'i'},
+        {"check", no_argument, NULL, 'c'},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
+        switch (c) {
+        case 'h':
+            printf("Usage: %s [-i msleep_time] [-h] [-c]\n\n",argv[0]);
+            printf("\t-h      : displays this help\n");
+            printf("\t-i      : specifies the milliseconds sleep time that will be slept between measurements\n");
+            printf("\t-c      : check system and exit\n");
+            printf("\n");
+
+            exit(0);
+        case 'i':
+            msleep_time = parse_int(optarg);
+            break;
+        case 'c':
+            check_system_flag = true;
+            break;
+        default:
+            fprintf(stderr,"Unknown option %c\n",c);
+            exit(-1);
+        }
+    }
+
+    if(check_system_flag){
+        exit(check_system());
+    }
+
+    get_time_offset(&offset);
+
+    output_stats();
+
+    // Only reached if output_stats() ever returns
+    nvmlShutdown();
+
+    return 0;
+}
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh b/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh