Skip to content

RE-Implemented NVIDIA Energy capture via C #1167

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions metric_providers/gpu/energy/nvidia/smi/component/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CFLAGS = -O3 -Wall -Werror -lc -I../../../../../../lib/c -lnvidia-ml

metric-provider-binary: source.c
gcc ../../../../../../lib/c/gmt-lib.o $< $(CFLAGS) -o $@

This file was deleted.

8 changes: 2 additions & 6 deletions metric_providers/gpu/energy/nvidia/smi/component/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,16 @@ class GpuEnergyNvidiaSmiComponentProvider(BaseMetricProvider):
def __init__(self, resolution, skip_check=False):
super().__init__(
metric_name='gpu_energy_nvidia_smi_component',
metrics={'time': int, 'value': int},
metrics={'time': int, 'value': int, 'card_model': str},
resolution=resolution,
unit='uJ',
current_dir=os.path.dirname(os.path.abspath(__file__)),
metric_provider_executable='metric-provider-nvidia-smi-wrapper.sh',
skip_check=skip_check,
)


def check_system(self, check_command="default", check_error_message=None, check_parallel_provider=True):
super().check_system(check_command=['which', 'nvidia-smi'], check_error_message="nvidia-smi is not installed on the system")

def _parse_metrics(self, df):
df = super()._parse_metrics(df) # sets detail_name
df['detail_name'] = df.card_model

'''
Conversion to Joules
Expand Down
152 changes: 152 additions & 0 deletions metric_providers/gpu/energy/nvidia/smi/component/source.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <time.h>
#include <string.h> // for strtok
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <nvml.h>
#include "gmt-lib.h"


// All variables are made static, because we believe that this will
// keep them local in scope to the file and not make them persist in state
// between Threads.
// in any case, none of these variables should change between threads
static unsigned int msleep_time=1000;
static struct timespec offset;

static void output_stats() {
struct timeval now;
nvmlReturn_t result;
unsigned int device_count;
nvmlDevice_t device;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
// nvmlUtilization_t utilization;
// nvmlMemory_t memory;
unsigned int power_usage;
// unsigned int power_limit;

result = nvmlInit();
if (result != NVML_SUCCESS) {
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
exit(1);
}

result = nvmlDeviceGetCount(&device_count);
if (result != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
nvmlShutdown();
exit(1);
}

while (1) {
get_adjusted_time(&now, &offset);

for (unsigned int i = 0; i < device_count; i++) {

nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, sizeof(name));
// printf("GPU %u: %s\n", i, name);

// nvmlDeviceGetUtilizationRates(device, &utilization);
// printf(" Utilization: %u%%\n", utilization.gpu);

// nvmlDeviceGetMemoryInfo(device, &memory);
// printf(" Memory: %llu MiB / %llu MiB\n", memory.used / 1024 / 1024, memory.total / 1024 / 1024);

// nvmlDeviceGetEnforcedPowerLimit(device, &power_limit); // mW

nvmlDeviceGetPowerUsage(device, &power_usage); // mW
printf("%ld%06ld %u \"%s-%u\"\n", now.tv_sec, now.tv_usec, power_usage, name, i);

}
usleep(msleep_time*1000);
}


}


static int check_system() {
nvmlReturn_t result;
nvmlDevice_t device;
unsigned int power_usage;
unsigned int device_count;

result = nvmlInit();
if (result != NVML_SUCCESS) {
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
return 1;
}

result = nvmlDeviceGetCount(&device_count);
if (result != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device count: %s\n", nvmlErrorString(result));
nvmlShutdown();
exit(1);
}

if (device_count <= 0) {
fprintf(stderr, "No NVIDIA cards found\n");
nvmlShutdown();
exit(1);

}

nvmlDeviceGetHandleByIndex(0, &device);
nvmlDeviceGetPowerUsage(device, &power_usage);

return 0;
}

int main(int argc, char **argv) {

int c;
bool check_system_flag = false;

static struct option long_options[] =
{
{"help", no_argument, NULL, 'h'},
{"interval", no_argument, NULL, 'i'},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logic: interval option is marked as no_argument but requires an argument in getopt_long

Suggested change
{"help", no_argument, NULL, 'h'},
{"interval", no_argument, NULL, 'i'},
{"help", no_argument, NULL, 'h'},
{"interval", required_argument, NULL, 'i'},

{"check", no_argument, NULL, 'c'},
{NULL, 0, NULL, 0}
};

while ((c = getopt_long(argc, argv, "i:hc", long_options, NULL)) != -1) {
switch (c) {
case 'h':
printf("Usage: %s [-i msleep_time] [-h]\n\n",argv[0]);
printf("\t-h : displays this help\n");
printf("\t-i : specifies the milliseconds sleep time that will be slept between measurements\n");
printf("\t-c : check system and exit\n");
printf("\n");

exit(0);
case 'i':
msleep_time = parse_int(optarg);
break;
case 'c':
check_system_flag = true;
break;
default:
fprintf(stderr,"Unknown option %c\n",c);
exit(-1);
}
}

if(check_system_flag){
exit(check_system());
}

get_time_offset(&offset);

output_stats();

nvmlShutdown();

return 0;
}
Loading