Skip to content

Commit 8c8bc18

Browse files
authored
feat(exporter): remove metrics if process is gone (#107)
1 parent 83f90f3 commit 8c8bc18

File tree

6 files changed

+42 -9 lines changed

.pre-commit-config.yaml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ repos:
2525
- id: debug-statements
2626
- id: double-quote-string-fixer
2727
- repo: https://github.com/astral-sh/ruff-pre-commit
28-
rev: v0.1.5
28+
rev: v0.1.6
2929
hooks:
3030
- id: ruff
3131
args: [--fix, --exit-non-zero-on-fix]

CHANGELOG.md

+1 -1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313

1414
### Added
1515

16-
-
16+
- Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).
1717

1818
### Changed
1919

nvitop-exporter/nvitop_exporter/exporter.py

+35 -2
Original file line number | Diff line number | Diff line change
@@ -53,10 +53,13 @@ def __init__( # pylint: disable=too-many-statements
5353
self.hostname = hostname or get_ip_address()
5454
self.registry = registry
5555
self.interval = interval
56+
self.alive_pids: dict[Device, set[tuple[int, str]]] = {
57+
device: set() for device in self.devices
58+
}
5659

5760
self.info = Info(
5861
'nvitop',
59-
documentation='NVITOP.',
62+
documentation='NVITOP Prometheus Exporter.',
6063
labelnames=['hostname'],
6164
registry=self.registry,
6265
)
@@ -503,6 +506,7 @@ def update_host(self) -> None:
503506
(self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB),
504507
):
505508
gauge.labels(hostname=self.hostname, partition=partition).set(value)
509+
506510
for partition in host.disk_partitions(): # type: ignore[attr-defined]
507511
try:
508512
partition_usage = host.disk_usage(partition.mountpoint) # type: ignore[attr-defined]
@@ -516,7 +520,7 @@ def update_host(self) -> None:
516520
):
517521
gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value)
518522

519-
def update_device(self, device: Device) -> None:
523+
def update_device(self, device: Device) -> None: # pylint: disable=too-many-locals
520524
"""Update metrics for a single device."""
521525
index = (
522526
str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index))
@@ -567,11 +571,16 @@ def update_device(self, device: Device) -> None:
567571
link=link,
568572
).set(throughput / 1024.0)
569573

574+
alive_pids = self.alive_pids[device]
575+
previous_alive_pids = alive_pids.copy()
576+
alive_pids.clear()
577+
570578
with GpuProcess.failsafe():
571579
for pid, process in device.processes().items():
572580
with process.oneshot():
573581
username = process.username()
574582
running_time = process.running_time()
583+
alive_pids.add((pid, username))
575584
for gauge, value in (
576585
(
577586
self.process_running_time,
@@ -606,3 +615,27 @@ def update_device(self, device: Device) -> None:
606615
pid=pid,
607616
username=username,
608617
).set(value)
618+
619+
for pid, username in previous_alive_pids.difference(alive_pids):
620+
for gauge in (
621+
self.process_running_time,
622+
self.process_cpu_percent,
623+
self.process_rss_memory,
624+
self.process_memory_percent,
625+
self.process_gpu_memory,
626+
self.process_gpu_sm_utilization,
627+
self.process_gpu_memory_utilization,
628+
self.process_gpu_encoder_utilization,
629+
self.process_gpu_decoder_utilization,
630+
):
631+
try:
632+
gauge.remove(
633+
self.hostname,
634+
index,
635+
name,
636+
uuid,
637+
pid,
638+
username,
639+
)
640+
except KeyError:
641+
pass

nvitop/api/device.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -2991,7 +2991,7 @@ def _parse_cuda_visible_devices( # pylint: disable=too-many-branches,too-many-s
29912991
try:
29922992
physical_device_attrs = _get_all_physical_device_attrs()
29932993
except libnvml.NVMLError:
2994-
return [] # type: ignore[return-value]
2994+
return []
29952995
gpu_uuids = set(physical_device_attrs)
29962996

29972997
try:
@@ -3072,7 +3072,7 @@ def strip_identifier(identifier: str) -> str:
30723072

30733073
for identifier in map(strip_identifier, cuda_visible_devices.split(',')):
30743074
if identifier in presented:
3075-
return [] # type: ignore[return-value] # duplicate identifiers found
3075+
return [] # duplicate identifiers found
30763076

30773077
try:
30783078
device = from_index_or_uuid(identifier)

nvitop/cli.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -309,9 +309,9 @@ def main() -> int:
309309
return 1
310310

311311
if args.gpu_util_thresh is not None:
312-
Device.GPU_UTILIZATION_THRESHOLDS = tuple(sorted(args.gpu_util_thresh)) # type: ignore[assignment]
312+
Device.GPU_UTILIZATION_THRESHOLDS = tuple(sorted(args.gpu_util_thresh))
313313
if args.mem_util_thresh is not None:
314-
Device.MEMORY_UTILIZATION_THRESHOLDS = tuple(sorted(args.mem_util_thresh)) # type: ignore[assignment]
314+
Device.MEMORY_UTILIZATION_THRESHOLDS = tuple(sorted(args.mem_util_thresh))
315315

316316
if args.only is not None:
317317
indices = set(args.only)

nvitop/select.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ def select_devices( # pylint: disable=too-many-branches,too-many-statements,too
215215

216216
if max_count is not None:
217217
if max_count == 0:
218-
return [] # type: ignore[return-value]
218+
return []
219219
assert max_count >= min_count >= 0
220220

221221
free_accounts = set(free_accounts or [])

0 commit comments

Comments
 (0)