diff --git a/.github/actions/gmt-pytest/action.yml b/.github/actions/gmt-pytest/action.yml
index 6b4c628c1..e4b9bb8a4 100644
--- a/.github/actions/gmt-pytest/action.yml
+++ b/.github/actions/gmt-pytest/action.yml
@@ -47,12 +47,12 @@ runs:
         if ${{inputs.ee}}; then
           if [[ "${{inputs.ee-branch}}" != '' ]]; then
             echo "Using ee-branch ${{inputs.ee-branch}}"
-            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}}
+            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}} --nvidia-gpu
           else
-            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test
+            ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --nvidia-gpu
           fi
         else
-          ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j
+          ./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j --nvidia-gpu
         fi
         source venv/bin/activate
       env:
diff --git a/config.yml.example b/config.yml.example
index a119b4d63..a96f9d046 100644
--- a/config.yml.example
+++ b/config.yml.example
@@ -124,7 +124,7 @@ measurement:
 #  psu.energy.dc.rapl.msr.machine.provider.PsuEnergyDcRaplMsrMachineProvider:
 #    resolution: 99
 #--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
-#  gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
+#  gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
 #    resolution: 99
 #--- Sensors - these providers need the lm-sensors package installed
 #  lmsensors.temperature.component.provider.LmsensorsTemperatureComponentProvider:
diff --git a/frontend/js/helpers/config.js.example b/frontend/js/helpers/config.js.example
index 46361b5de..07bfa8eb6 100644
--- a/frontend/js/helpers/config.js.example
+++ b/frontend/js/helpers/config.js.example
@@ -400,15 +400,15 @@ METRIC_MAPPINGS = {
         "source": "formula",
         "explanation": "Network total data traffic for the whole accumulated from procfs data"
     },
-    "gpu_energy_nvidia_smi_component": {
+    "gpu_energy_nvidia_nvml_component": {
         "clean_name": "GPU Energy",
-        "source": "NVIDIA SMI",
-        "explanation": "Derived NVIDIA SMI based GPU energy"
+        "source": "NVIDIA NVML",
+        "explanation": "Derived NVIDIA NVML based GPU energy"
     },
-    "gpu_power_nvidia_smi_component": {
+    "gpu_power_nvidia_nvml_component": {
         "clean_name": "GPU Power",
-        "source": "NVIDIA SMI",
-        "explanation": "NVIDIA SMI based GPU power"
+        "source": "NVIDIA NVML",
+        "explanation": "NVIDIA NVML based GPU power"
     },
     "cpu_energy_rapl_msr_component": {
         "clean_name": "CPU Energy (Package)",
diff --git a/install_linux.sh b/install_linux.sh
index 5fc7940d3..a6ba66144 100755
--- a/install_linux.sh
+++ b/install_linux.sh
@@ -38,6 +38,19 @@ if [[ $activate_scenario_runner == true ]] ; then
     sudo systemctl stop tinyproxy
     sudo systemctl disable tinyproxy
 
+    if [[ $install_nvidia_toolkit_headers == true ]] ; then
+        print_message "Installing nvidia toolkit headers"
+        if lsb_release -is | grep -q "Fedora"; then
+            if ! sudo dnf -y install cuda-nvml-dev; then
+                print_message "Failed to install cuda-nvml-dev; continuing without NVIDIA GPU measurement."
+            fi
+        else
+            if ! sudo apt-get install -y libnvidia-ml-dev; then
+                print_message "Failed to install libnvidia-ml-dev; continuing without NVIDIA GPU measurement."
+            fi
+        fi
+    fi
+
     print_message "Building C libs"
     make -C "lib/c"
 
diff --git a/lib/install_shared.sh b/lib/install_shared.sh
index 32cae417f..0d2d28af3 100644
--- a/lib/install_shared.sh
+++ b/lib/install_shared.sh
@@ -34,6 +34,7 @@ cert_file=''
 enterprise=false
 ask_ping=true
 force_send_ping=false
+install_nvidia_toolkit_headers=false
 ee_branch=''
 
 function print_message {
@@ -302,6 +303,9 @@ function build_binaries() {
         if [[ "$make_path" == *"/lmsensors/"* ]] && [[ "${install_sensors}" == false ]]; then
             continue
         fi
+        if [[ "$make_path" == *"/nvidia/"* ]] && [[ "${install_nvidia_toolkit_headers}" == false ]]; then
+            continue
+        fi
         echo "Installing $subdir/metric-provider-binary ..."
         rm -f $subdir/metric-provider-binary 2> /dev/null
         make -C $subdir
@@ -380,6 +384,11 @@ check_python_version
 
 while [[ $# -gt 0 ]]; do
     case "$1" in
+        --nvidia-gpu)
+            install_nvidia_toolkit_headers=true
+            shift
+            ;;
+
         --ee-branch) # This is not documented in the help, as it is only for GCS internal use
             check_optarg 'ee-branch' "${2:-}"
             ee_branch="$2"
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/Makefile b/metric_providers/gpu/energy/nvidia/nvml/component/Makefile
new file mode 100644
index 000000000..eb6b25189
--- /dev/null
+++ b/metric_providers/gpu/energy/nvidia/nvml/component/Makefile
@@ -0,0 +1,5 @@
+CFLAGS = -O3 -Wall -Werror -I../../../../../../lib/c
+LDFLAGS = -L../../../../../../lib/c -lnvidia-ml -lc
+
+metric-provider-binary: source.c
+	gcc $(CFLAGS) ../../../../../../lib/c/gmt-lib.o $< $(LDFLAGS) -o $@
\ No newline at end of file
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/README.md b/metric_providers/gpu/energy/nvidia/nvml/component/README.md
similarity index 100%
rename from metric_providers/gpu/energy/nvidia/smi/component/README.md
rename to metric_providers/gpu/energy/nvidia/nvml/component/README.md
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/provider.py b/metric_providers/gpu/energy/nvidia/nvml/component/provider.py
similarity index 76%
rename from metric_providers/gpu/energy/nvidia/smi/component/provider.py
rename to metric_providers/gpu/energy/nvidia/nvml/component/provider.py
index f1cf3bdc3..14a618ed4 100644
--- a/metric_providers/gpu/energy/nvidia/smi/component/provider.py
+++ b/metric_providers/gpu/energy/nvidia/nvml/component/provider.py
@@ -2,24 +2,20 @@
 
 from metric_providers.base import BaseMetricProvider
 
 
-class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
+class GpuEnergyNvidiaNvmlComponentProvider(BaseMetricProvider):
     def __init__(self, resolution, skip_check=False):
         super().__init__(
-            metric_name='gpu_energy_nvidia_smi_component',
-            metrics={'time': int, 'value': int},
+            metric_name='gpu_energy_nvidia_nvml_component',
+            metrics={'time': int, 'value': int, 'card_model': str},
            resolution=resolution,
             unit='uJ',
             current_dir=os.path.dirname(os.path.abspath(__file__)),
-            metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
             skip_check=skip_check,
         )
-
-    def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
-        super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")
-
     def _parse_metrics(self, df):
         df = super()._parse_metrics(df) # sets detail_name
+        df['detail_name'] = df.card_model
 
         '''
            Conversion to Joules
@@ -29,7 +25,7 @@ def _parse_metrics(self, df):
             WITH times as (
                 SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
                 FROM measurements
-                WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
+                WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_nvml_component'
                 ORDER BY detail_name ASC, time ASC)
             SELECT *, value / (diff / 1000) as power FROM times;
 
diff --git a/metric_providers/gpu/energy/nvidia/nvml/component/source.c b/metric_providers/gpu/energy/nvidia/nvml/component/source.c
new file mode 100755
index 000000000..f3c25d0f7
--- /dev/null
+++ b/metric_providers/gpu/energy/nvidia/nvml/component/source.c
@@ -0,0 +1,152 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h> // for strtok
+#include <getopt.h>
+#include <time.h>
+#include <sys/time.h>
+#include <nvml.h>
+#include "gmt-lib.h"
+
+
+// All variables are made static, because we believe that this will
+// keep them local in scope to the file and not make them persist in state
+// between Threads.
+// In any case, none of these variables should change between threads.
+static unsigned int msleep_time=1000;
+static struct timespec offset;
+
+static void output_stats() {
+    struct timeval now;
+    nvmlReturn_t result;
+    unsigned int device_count;
+    nvmlDevice_t device;
+    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+//    nvmlUtilization_t utilization;
+//    nvmlMemory_t memory;
+    unsigned int power_usage;
+//    unsigned int power_limit;
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
+        exit(1);
+    }
+
+    result = nvmlDeviceGetCount(&device_count);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        exit(1);
+    }
+
+    while (1) {
+        get_adjusted_time(&now, &offset);
+
+        for (unsigned int i = 0; i < device_count; i++) {
+
+            nvmlDeviceGetHandleByIndex(i, &device);
+            nvmlDeviceGetName(device, name, sizeof(name));
+//            printf("GPU %u: %s\n", i, name);
+
+//            nvmlDeviceGetUtilizationRates(device, &utilization);
+//            printf("  Utilization: %u%%\n", utilization.gpu);
+
+//            nvmlDeviceGetMemoryInfo(device, &memory);
+//            printf("  Memory: %llu MiB / %llu MiB\n", memory.used / 1024 / 1024, memory.total / 1024 / 1024);
+
+//            nvmlDeviceGetEnforcedPowerLimit(device, &power_limit); // mW
+
+            nvmlDeviceGetPowerUsage(device, &power_usage); // mW
+            printf("%ld%06ld %u \"%s-%u\"\n", now.tv_sec, now.tv_usec, power_usage, name, i);
+
+        }
+        usleep(msleep_time*1000);
+    }
+
+
+}
+
+
+static int check_system() {
+    nvmlReturn_t result;
+    nvmlDevice_t device;
+    unsigned int power_usage;
+    unsigned int device_count;
+
+    result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
+        return 1;
+    }
+
+    result = nvmlDeviceGetCount(&device_count);
+    if (result != NVML_SUCCESS) {
+        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
+        nvmlShutdown();
+        exit(1);
+    }
+
+    if (device_count <= 0) {
+        fprintf(stderr, "No NVIDIA cards found\n");
+        nvmlShutdown();
+        exit(1);
+
+    }
+
+    nvmlDeviceGetHandleByIndex(0, &device);
+    nvmlDeviceGetPowerUsage(device, &power_usage);
+
+    return 0;
+}
+
+int main(int argc, char **argv) {
+
+    int c;
+    bool check_system_flag = false;
+
+    static struct option long_options[] =
+    {
+        {"help", no_argument, NULL, 'h'},
+        {"interval", required_argument, NULL, 'i'},
+        {"check", no_argument, NULL, 'c'},
+        {NULL, 0, NULL, 0}
+    };
+
+    while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
+        switch (c) {
+        case 'h':
+            printf("Usage: %s [-i msleep_time] [-c] [-h]\n\n",argv[0]);
+            printf("\t-h : displays this help\n");
+            printf("\t-i : specifies the sleep time in milliseconds between measurements\n");
+            printf("\t-c : check system and exit\n");
+            printf("\n");
+
+            exit(0);
+        case 'i':
+            msleep_time = parse_int(optarg);
+            break;
+        case 'c':
+            check_system_flag = true;
+            break;
+        default:
+            fprintf(stderr,"Unknown option %c\n",c);
+            exit(-1);
+        }
+    }
+
+    if(check_system_flag){
+        exit(check_system());
+    }
+
+    get_time_offset(&offset);
+
+    output_stats();
+
+    nvmlShutdown();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh b/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh
deleted file mode 100755
index a047f2485..000000000
--- a/metric_providers/gpu/energy/nvidia/smi/component/metric-provider-nvidia-smi-wrapper.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-i='100'
-
-while getopts "i:" o; do
-    case "$o" in
-        i)
-            i=${OPTARG}
-            ;;
-    esac
-done
-
-i=$(bc <<< "scale=3; $i / 1000")
-
-while true; do
-    echo -en $(date +"%s%6N") $(nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits| awk '{ gsub("\\.", ""); print }')"0\n"
-    sleep $i
-done
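
Reviewer notes (illustrative only, not part of the patch):

To exercise the new provider, the NVML headers have to be available at install time and the renamed
provider key has to be enabled. Both pieces come from the changes above; the resolution value and the
assumption that the script's other defaults are acceptable are only for illustration:

    ./install_linux.sh --nvidia-gpu
    # then, in config.yml, uncomment the renamed provider:
    #  gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
    #    resolution: 99

Based on the printf format in source.c ("%ld%06ld %u \"%s-%u\"") and the metrics mapping in
provider.py ({'time': int, 'value': int, 'card_model': str}), each sample emitted by the binary
should look roughly like the line below: a microsecond timestamp, the instantaneous draw in mW as
reported by nvmlDeviceGetPowerUsage, and a quoted "<card name>-<index>" label that provider.py
stores as card_model and uses as detail_name. The sample values here are made up:

    1712345678123456 65320 "NVIDIA GeForce RTX 3080-0"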