Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/actions/gmt-pytest/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ runs:
if ${{inputs.ee}}; then
if [[ "${{inputs.ee-branch}}" != '' ]]; then
echo "Using ee-branch ${{inputs.ee-branch}}"
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}}
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --ee-branch ${{inputs.ee-branch}} --nvidia-gpu
else
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j -d -g -e github-actions-test --nvidia-gpu
fi
else
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j
./install_linux.sh -p testpw -a http://api.green-coding.internal:9142 -m http://metrics.green-coding.internal:9142 -B -T -L -z -f -j --nvidia-gpu
fi
source venv/bin/activate
env:
Expand Down
2 changes: 1 addition & 1 deletion config.yml.example
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ measurement:
# psu.energy.dc.rapl.msr.machine.provider.PsuEnergyDcRaplMsrMachineProvider:
# sampling_rate: 99
#--- GPU - Only enable these if you have GPUs with power measurement enabled in your machine
# gpu.energy.nvidia.smi.component.provider.GpuEnergyNvidiaSmiComponentProvider:
# gpu.energy.nvidia.nvml.component.provider.GpuEnergyNvidiaNvmlComponentProvider:
# sampling_rate: 99
#--- Sensors - these providers need the lm-sensors package installed
# lmsensors.temperature.component.provider.LmsensorsTemperatureComponentProvider:
Expand Down
12 changes: 6 additions & 6 deletions frontend/js/helpers/config.js.example
Original file line number Diff line number Diff line change
Expand Up @@ -411,15 +411,15 @@ METRIC_MAPPINGS = {
"source": "formula",
"explanation": "Network total data traffic for the whole accumulated from procfs data"
},
"gpu_energy_nvidia_smi_component": {
"gpu_energy_nvidia_nvml_component": {
"clean_name": "GPU Energy",
"source": "NVIDIA SMI",
"explanation": "Derived NVIDIA SMI based GPU energy"
"source": "NVIDIA NVML",
"explanation": "Derived NVIDIA NVML based GPU energy"
},
"gpu_power_nvidia_smi_component": {
"gpu_power_nvidia_nvml_component": {
"clean_name": "GPU Power",
"source": "NVIDIA SMI",
"explanation": "NVIDIA SMI based GPU power"
"source": "NVIDIA NVML",
"explanation": "NVIDIA NVML based GPU power"
},
"cpu_energy_rapl_msr_component": {
"clean_name": "CPU Energy (Package)",
Expand Down
13 changes: 13 additions & 0 deletions install_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,19 @@ if [[ $activate_scenario_runner == true ]] ; then
sudo systemctl stop tinyproxy
sudo systemctl disable tinyproxy

# Optionally install the NVML development package (headers + link library).
# The NVIDIA GPU metric provider includes <nvml.h> and links with -lnvidia-ml,
# so it cannot be compiled without it. Fedora ships the package as
# cuda-nvml-dev; every other distribution is treated as apt-based.
# A failed install is reported but does not abort the installation.
if [[ $install_nvidia_toolkit_headers == true ]] ; then
    print_message "Installing nvidia toolkit headers"
    if lsb_release -is | grep -q "Fedora"; then
        if ! sudo dnf -y install cuda-nvml-dev; then
            print_message "Failed to install cuda-nvml-dev; the NVIDIA GPU metric provider may fail to build."
        fi
    else
        if ! sudo apt-get install -y libnvidia-ml-dev; then
            print_message "Failed to install libnvidia-ml-dev; the NVIDIA GPU metric provider may fail to build."
        fi
    fi
fi

print_message "Building C libs"
make -C "lib/c"

Expand Down
6 changes: 6 additions & 0 deletions lib/install_shared.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cert_file=''
enterprise=false
ask_ping=true
force_send_ping=false
install_nvidia_toolkit_headers=false
ee_branch=''

function print_message {
Expand Down Expand Up @@ -329,6 +330,9 @@ function build_binaries() {
if [[ "$make_path" == *"/lmsensors/"* ]] && [[ "${install_sensors}" == false ]]; then
continue
fi
if [[ "$make_path" == *"/nvidia/"* ]] && [[ "${install_nvidia_toolkit_headers}" == false ]]; then
continue
fi
echo "Installing $subdir/metric-provider-binary ..."
rm -f $subdir/metric-provider-binary 2> /dev/null
make -C $subdir
Expand Down Expand Up @@ -407,6 +411,8 @@ check_python_version

while [[ $# -gt 0 ]]; do
case "$1" in
--nvidia-gpu) # Install the NVML headers and build the NVIDIA GPU metric provider
    install_nvidia_toolkit_headers=true
    # NOTE(review): assumes every arm consumes its own flag with `shift`,
    # as is usual for a `while [[ $# -gt 0 ]]` parser - confirm against the other arms.
    shift
    # Terminate the arm; without `;;` the following `--ai)` pattern is a syntax error.
    ;;
--ai) # This is not documented in the help, as it is only for GCS internal use
ask_ai_optimisations=false
activate_ai_optimisations=true
Expand Down
5 changes: 5 additions & 0 deletions metric_providers/gpu/energy/nvidia/nvml/component/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Builds the NVML-based NVIDIA GPU metric provider binary from source.c.
# -Werror turns every compiler warning into a build failure;
# -I adds lib/c to the header search path (source.c includes "gmt-lib.h").
CFLAGS = -O3 -Wall -Werror -I../../../../../../lib/c
# Link against the NVIDIA Management Library (libnvidia-ml) and libc.
LDFLAGS = -L../../../../../../lib/c -lnvidia-ml -lc

# gmt-lib.o is linked in as a prebuilt object. It is not listed as a
# prerequisite of this rule, so it has to exist before this target is built.
metric-provider-binary: source.c
gcc $(CFLAGS) ../../../../../../lib/c/gmt-lib.o $< $(LDFLAGS) -o $@
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,20 @@

from metric_providers.base import BaseMetricProvider

class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
class GpuEnergyNvidiaNvmlComponentProvider(BaseMetricProvider):
def __init__(self, sampling_rate, skip_check=False):
super().__init__(
metric_name='gpu_energy_nvidia_smi_component',
metrics={'time': int, 'value': int},
metric_name='gpu_energy_nvidia_nvml_component',
metrics={'time': int, 'value': int, 'card_model': str},
sampling_rate=sampling_rate,
unit='uJ',
current_dir=os.path.dirname(os.path.abspath(__file__)),
metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
skip_check=skip_check,
)


def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")

def _parse_metrics(self, df):
df = super()._parse_metrics(df) # sets detail_name
df['detail_name'] = df.card_model

'''
Conversion to Joules
Expand All @@ -29,7 +25,7 @@ def _parse_metrics(self, df):
WITH times as (
SELECT id, value, detail_name, time, (time - LAG(time) OVER (ORDER BY detail_name ASC, time ASC)) AS diff, unit
FROM measurements
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_smi_component'
WHERE run_id = RUN_ID AND metric = 'gpu_energy_nvidia_nvml_component'

ORDER BY detail_name ASC, time ASC)
SELECT *, value / (diff / 1000) as power FROM times;
Expand Down
152 changes: 152 additions & 0 deletions metric_providers/gpu/energy/nvidia/nvml/component/source.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#include <string.h> // for strtok
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <nvml.h>
#include "gmt-lib.h"


// File-scope state shared by main() and output_stats().
// `static` gives these variables internal linkage, i.e. they are visible
// only inside this file.
// NOTE(review): the previous comment expected `static` to also keep state
// from being shared between threads. Objects with static storage duration
// are shared by all threads of a process; no thread is created in this file,
// but confirm nothing relies on per-thread copies.

// Pause between two sampling rounds, in milliseconds; overridden by -i.
static unsigned int msleep_time=1000;
// Filled by get_time_offset() in main() and passed to get_adjusted_time()
// for every sampling round.
static struct timespec offset;

/*
 * Reports a failed NVML call for one device on stderr, shuts NVML down and
 * terminates the process with exit code 1. Does nothing on NVML_SUCCESS.
 */
static void exit_on_device_error(nvmlReturn_t result, const char *action, unsigned int index) {
    if (result == NVML_SUCCESS) {
        return;
    }
    fprintf(stderr, "Failed to %s for device %u: %s\n", action, index, nvmlErrorString(result));
    nvmlShutdown();
    exit(1);
}

/*
 * Samples the power draw of every NVIDIA GPU through NVML in an endless loop
 * and prints one line per device and sampling round to stdout:
 *
 *     <seconds><microseconds, 6 digits> <power in mW> "<device name>-<index>"
 *
 * All devices of one round share the timestamp obtained from
 * get_adjusted_time() with the file-level offset. Between rounds the function
 * sleeps for msleep_time milliseconds.
 *
 * The function initializes NVML itself and never returns. Any NVML failure is
 * reported on stderr and ends the process with exit code 1, so that no line
 * is ever printed from uninitialized name or power values.
 */
static void output_stats(void) {
    struct timeval now;
    nvmlReturn_t result;
    unsigned int device_count;
    nvmlDevice_t device;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    unsigned int power_usage;
    struct timespec pause_time;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
        exit(1);
    }

    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
        nvmlShutdown();
        exit(1);
    }

    // Split the interval into seconds and nanoseconds: this cannot overflow
    // like msleep_time*1000 and avoids calling usleep() with a value of
    // 1,000,000 or more, which POSIX does not guarantee to work.
    pause_time.tv_sec = (time_t)(msleep_time / 1000);
    pause_time.tv_nsec = (long)(msleep_time % 1000) * 1000000L;

    while (1) {
        get_adjusted_time(&now, &offset);

        for (unsigned int i = 0; i < device_count; i++) {
            result = nvmlDeviceGetHandleByIndex(i, &device);
            exit_on_device_error(result, "get handle", i);

            result = nvmlDeviceGetName(device, name, sizeof(name));
            exit_on_device_error(result, "get name", i);

            result = nvmlDeviceGetPowerUsage(device, &power_usage); // mW
            exit_on_device_error(result, "get power usage", i);

            printf("%ld%06ld %u \"%s-%u\"\n", (long)now.tv_sec, (long)now.tv_usec, power_usage, name, i);
        }

        // stdout is block-buffered when it is not a terminal; flush after each
        // round so samples are not lost when the process is killed.
        fflush(stdout);

        nanosleep(&pause_time, NULL);
    }
}


/*
 * Verifies that GPU power can be measured through NVML on this machine.
 *
 * Initializes NVML, requires at least one device and performs one trial
 * power reading on device 0. Every failure is reported on stderr.
 * NVML is shut down again before returning whenever it was initialized.
 *
 * Returns 0 when all steps succeeded and 1 otherwise; main() uses the value
 * as the process exit code.
 */
static int check_system(void) {
    nvmlReturn_t result;
    nvmlDevice_t device;
    unsigned int power_usage;
    unsigned int device_count;
    int status = 1;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
        goto out;
    }

    if (device_count == 0) {
        fprintf(stderr, "No NVIDIA cards found\n");
        goto out;
    }

    result = nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get handle for device 0: %s\n", nvmlErrorString(result));
        goto out;
    }

    // The value itself is irrelevant here; only whether the read succeeds.
    result = nvmlDeviceGetPowerUsage(device, &power_usage);
    if (result != NVML_SUCCESS) {
        fprintf(stderr, "Failed to get power usage for device 0: %s\n", nvmlErrorString(result));
        goto out;
    }

    status = 0;

out:
    nvmlShutdown();
    return status;
}

/*
 * Entry point of the NVML GPU power metric provider.
 *
 * Options:
 *   -h, --help            print usage and exit with 0
 *   -i, --interval <ms>   milliseconds to sleep between sampling rounds
 *   -c, --check           run check_system() and exit with its result
 *
 * Without -c the time offset is initialized and output_stats() is started,
 * which loops forever. An unknown option ends the process with exit(-1).
 */
int main(int argc, char **argv) {

    int c;
    bool check_system_flag = false;

    static struct option long_options[] =
    {
        {"help", no_argument, NULL, 'h'},
        // Must be required_argument: with no_argument, `--interval 100` left
        // optarg NULL and parse_int(optarg) was called with a null pointer.
        {"interval", required_argument, NULL, 'i'},
        {"check", no_argument, NULL, 'c'},
        {NULL, 0, NULL, 0}
    };

    while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
        switch (c) {
        case 'h':
            printf("Usage: %s [-i msleep_time] [-c] [-h]\n\n",argv[0]);
            printf("\t-h      : displays this help\n");
            printf("\t-i      : specifies the milliseconds sleep time that will be slept between measurements\n");
            printf("\t-c      : check system and exit\n");
            printf("\n");

            exit(0);
        case 'i':
            msleep_time = parse_int(optarg);
            break;
        case 'c':
            check_system_flag = true;
            break;
        default:
            fprintf(stderr,"Unknown option %c\n",c);
            exit(-1);
        }
    }

    if(check_system_flag){
        exit(check_system());
    }

    get_time_offset(&offset);

    output_stats();

    // Only reached if output_stats() ever returns; it currently loops forever.
    nvmlShutdown();

    return 0;
}

This file was deleted.