Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/actions/gmt-pytest/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ runs:
if ${{inputs.ee}}; then
if [[ "${{inputs.ee-branch}}" != '' ]]; then
echo "Using ee-branch ${{inputs.ee-branch}}"
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}}
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}} --nvidia-gpu
else
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --nvidia-gpu
fi
else
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j --nvidia-gpu
fi
source venv/bin/activate
env:
Expand Down
2 changes: 1 addition & 1 deletion config.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ measurement:
# psu.energy.dc.rapl.msr.machine.provider.PsuEnergyDcRaplMsrMachineProvider:
# sampling_rate: 99
#--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
# gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
# gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
# sampling_rate: 99
#--- Sensors - these providers need the lm-sensors package installed
# lmsensors.temperature.component.provider.LmsensorsTemperatureComponentProvider:
Expand Down
12 changes: 6 additions & 6 deletions frontend/js/helpers/config.js.example
Original file line number Diff line number Diff line change
Expand Up @@ -411,15 +411,15 @@ METRIC_MAPPINGS = {
"source": "formula",
"explanation": "Network total data traffic for the whole accumulated from procfs data"
},
"gpu_energy_nvidia_smi_component": {
"gpu_energy_nvidia_nvml_component": {
"clean_name": "GPU Energy",
"source": "NVIDIA SMI",
"explanation": "Derived NVIDIA SMI based GPU energy"
"source": "NVIDIA NVML",
"explanation": "Derived NVIDIA NVML based GPU energy"
},
"gpu_power_nvidia_smi_component": {
"gpu_power_nvidia_nvml_component": {
"clean_name": "GPU Power",
"source": "NVIDIA SMI",
"explanation": "NVIDIA SMI based GPU power"
"source": "NVIDIA NVML",
"explanation": "NVIDIA NVML based GPU power"
},
"cpu_energy_rapl_msr_component": {
"clean_name": "CPU Energy (Package)",
Expand Down
20 changes: 20 additions & 0 deletions install_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,26 @@ if [[ $activate_scenario_runner == true ]] ; then
sudo systemctl stop tinyproxy
sudo systemctl disable tinyproxy

if [[ $install_nvidia_toolkit_headers == true ]] ; then
print_message "Installing nvidia toolkit headers"
if lsb_release -is | grep -q "Fedora"; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/fedora$(rpm -E %fedora)/x86_64/cuda-fedora$(rpm -E %fedora).repo
sudo mv cuda-fedora$(rpm -E %fedora).repo /etc/yum.repos.d/
sudo dnf makecache
if ! sudo dnf -y install libnvidia-ml cuda-nvml-devel-12-9; then
print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
exit 1
else
sudo ln -s /usr/lib64/libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so
fi
else
if ! sudo apt-get install -y libnvidia-ml-dev; then
print_message "Failed to install nvidia toolkit headers; Please remove --nvidia-gpu flag and install manually" >&2
exit 1
fi
fi
fi

print_message "Building C libs"
make -C "lib/c"

Expand Down
10 changes: 8 additions & 2 deletions lib/install_shared.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cert_file=''
enterprise=false
ask_ping=true
force_send_ping=false
install_nvidia_toolkit_headers=false
ee_branch=''

function print_message {
Expand Down Expand Up @@ -329,6 +330,9 @@ function build_binaries() {
if [[ "$make_path" == *"/lmsensors/"* ]] && [[ "${install_sensors}" == false ]]; then
continue
fi
if [[ "$make_path" == *"/nvidia/"* ]] && [[ "${install_nvidia_toolkit_headers}" == false ]]; then
continue
fi
echo "Installing $subdir/metric-provider-binary ..."
rm -f $subdir/metric-provider-binary 2> /dev/null
make -C $subdir
Expand Down Expand Up @@ -407,18 +411,20 @@ check_python_version

while [[ $# -gt 0 ]]; do
case "$1" in
--nvidia-gpu)
install_nvidia_toolkit_headers=true
shift
;;
--ai) # This is not documented in the help, as it is only for GCS internal use
ask_ai_optimisations=false
activate_ai_optimisations=true
shift
;;

--no-ai) # This is not documented in the help, as it is only for GCS internal use
ask_ai_optimisations=false
activate_ai_optimisations=false
shift
;;

--ee-branch) # This is not documented in the help, as it is only for GCS internal use
check_optarg 'ee-branch' "${2:-}"
ee_branch="$2"
Expand Down
5 changes: 5 additions & 0 deletions metric_providers/gpu/energy/nvidia/nvml/component/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Builds the NVML based GPU metric provider binary from source.c.
# The first include/library path is the project's shared C helper directory
# (gmt-lib); the second include path is a CUDA 12.9 toolkit location
# (presumably where nvml.h lives when it is not in the default include path).
CFLAGS = -O3 -Wall -Werror -I../../../../../../lib/c -I/usr/local/cuda-12.9/targets/x86_64-linux/include
# libnvidia-ml only needs to be listed once on the link line.
LDFLAGS = -L../../../../../../lib/c -lnvidia-ml -lc

metric-provider-binary: source.c
	gcc $(CFLAGS) ../../../../../../lib/c/gmt-lib.o $< $(LDFLAGS) -o $@
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,20 @@

from metric_providers.base import BaseMetricProvider

class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
class GpuEnergyNvidiaNvmlComponentProvider(BaseMetricProvider):
def __init__(self, sampling_rate, skip_check=False):
super().__init__(
metric_name='gpu_energy_nvidia_smi_component',
metrics={'time': int, 'value': int},
metric_name='gpu_energy_nvidia_nvml_component',
metrics={'time': int, 'value': int, 'card_model': str},
sampling_rate=sampling_rate,
unit='uJ',
current_dir=os.path.dirname(os.path.abspath(__file__)),
metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
skip_check=skip_check,
)


def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")

def _parse_metrics(self, df):
df = super()._parse_metrics(df) # sets detail_name
df['detail_name'] = df.card_model

'''
Conversion to Joules
Expand All @@ -29,7 +25,7 @@ def _parse_metrics(self, df):
WITH times as (
SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
FROM measurements
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_nvml_component'
ORDER BY detail_name ASC, time ASC)
SELECT *, value / (diff / 1000) as power FROM times;
Expand Down
152 changes: 152 additions & 0 deletions metric_providers/gpu/energy/nvidia/nvml/component/source.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#include <string.h> // for strtok
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <nvml.h>
#include "gmt-lib.h"


// All file-scope variables are declared static so that they stay local to
// this file (internal linkage). Note that static does not make a variable
// thread-local; in any case, none of these variables should change between
// threads.
static unsigned int msleep_time=1000;
static struct timespec offset;

/*
 * Samples all NVIDIA devices through NVML in an endless loop and prints one
 * line per device and sampling round to stdout:
 *
 *     <seconds><microseconds, 6 digits> <power usage in mW> "<device name>-<index>"
 *
 * The timestamp is obtained once per round from get_adjusted_time(), so all
 * devices of one round share it. Between rounds the function sleeps for
 * msleep_time milliseconds.
 *
 * The function never returns. Any NVML error is reported on stderr, NVML is
 * shut down and the process exits with status 1, so that no line is ever
 * printed from uninitialized data.
 */
static void output_stats() {
    struct timeval now;
    nvmlReturn_t result;
    unsigned int device_count;
    nvmlDevice_t device;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    unsigned int power_usage;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
        exit(1);
    }

    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
        nvmlShutdown();
        exit(1);
    }

    while (1) {
        get_adjusted_time(&now, &offset);

        for (unsigned int i = 0; i < device_count; i++) {

            result = nvmlDeviceGetHandleByIndex(i, &device);
            if (result != NVML_SUCCESS) {
                fprintf(stderr, "Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
                nvmlShutdown();
                exit(1);
            }

            result = nvmlDeviceGetName(device, name, sizeof(name));
            if (result != NVML_SUCCESS) {
                fprintf(stderr, "Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
                nvmlShutdown();
                exit(1);
            }

            result = nvmlDeviceGetPowerUsage(device, &power_usage); // mW
            if (result != NVML_SUCCESS) {
                fprintf(stderr, "Failed to get power usage of device %u: %s\n", i, nvmlErrorString(result));
                nvmlShutdown();
                exit(1);
            }

            printf("%ld%06ld %u \"%s-%u\"\n", now.tv_sec, now.tv_usec, power_usage, name, i);
        }

        // stdout is block-buffered when it is not a terminal; flush after
        // every round so samples are not lost if the process gets killed.
        fflush(stdout);

        usleep(msleep_time*1000);
    }
}


/*
 * Verifies that this provider can run on the current machine.
 *
 * The check initializes NVML, requires at least one device, and performs one
 * real power usage query on device 0. Every step's NVML return code is
 * evaluated, so a driver or card without power readout fails the check
 * instead of passing silently.
 *
 * Returns 0 when all steps succeed and 1 otherwise; the reason for a failure
 * is printed to stderr. NVML is shut down again before returning whenever it
 * was initialized successfully.
 */
static int check_system() {
    nvmlReturn_t result;
    nvmlDevice_t device;
    unsigned int power_usage;
    unsigned int device_count;
    int status = 1;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
        goto out;
    }

    if (device_count == 0) {
        fprintf(stderr, "No NVIDIA cards found\n");
        goto out;
    }

    result = nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get handle for device 0: %s\n", nvmlErrorString(result));
        goto out;
    }

    // The measured value is discarded; only the success of the query matters.
    result = nvmlDeviceGetPowerUsage(device, &power_usage);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get power usage of device 0: %s\n", nvmlErrorString(result));
        goto out;
    }

    status = 0;

out:
    nvmlShutdown();
    return status;
}

/*
 * Entry point of the NVML GPU metric provider.
 *
 * Options:
 *   -h, --help            print usage and exit with status 0
 *   -i, --interval <ms>   milliseconds to sleep between measurements
 *   -c, --check           run check_system() and exit with its result
 *
 * Without -c the time offset is initialized via get_time_offset() and
 * output_stats() is started. An unknown option terminates the program with
 * exit(-1).
 */
int main(int argc, char **argv) {

    int c;
    bool check_system_flag = false;

    static struct option long_options[] =
    {
        {"help", no_argument, NULL, 'h'},
        // Must take an argument just like the short option "i:"; with
        // no_argument, optarg would be NULL for "--interval <ms>".
        {"interval", required_argument, NULL, 'i'},
        {"check", no_argument, NULL, 'c'},
        {NULL, 0, NULL, 0}
    };

    while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
        switch (c) {
        case 'h':
            printf("Usage: %s [-i msleep_time] [-h]\n\n",argv[0]);
            printf("\t-h      : displays this help\n");
            printf("\t-i      : specifies the milliseconds sleep time that will be slept between measurements\n");
            printf("\t-c      : check system and exit\n");
            printf("\n");

            exit(0);
        case 'i':
            msleep_time = parse_int(optarg);
            break;
        case 'c':
            check_system_flag = true;
            break;
        default:
            fprintf(stderr,"Unknown option %c\n",c);
            exit(-1);
        }
    }

    if(check_system_flag){
        exit(check_system());
    }

    get_time_offset(&offset);

    // output_stats() samples in an endless loop; the statements below are
    // only reached should that loop ever be made to terminate.
    output_stats();

    nvmlShutdown();

    return 0;
}

This file was deleted.