@@ -44,7 +44,7 @@ def __init__(self):
4444 self .gpuMemoryTotal = 0
4545 self .gpuName = ''
4646 self .curves = defaultdict (list )
47-
47+ self . nvidia_smi = None
4848 self ._isInit = False
4949
5050 def initOnFirstTime (self ):
@@ -53,40 +53,21 @@ def initOnFirstTime(self):
5353 self ._isInit = True
5454
5555 self .cpuFreq = psutil .cpu_freq ().max
56- self .ramTotal = psutil .virtual_memory ().total / 1024 / 1024 / 1024
56+ self .ramTotal = psutil .virtual_memory ().total / ( 1024 * 1024 * 1024 )
5757
5858 if platform .system () == "Windows" :
5959 from distutils import spawn
6060 # If the platform is Windows and nvidia-smi
61- # could not be found from the environment path,
62- # try to find it from system drive with default installation path
6361 self .nvidia_smi = spawn .find_executable ('nvidia-smi' )
6462 if self .nvidia_smi is None :
65- self .nvidia_smi = "%s\\ Program Files\\ NVIDIA Corporation\\ NVSMI\\ nvidia-smi.exe" % os .environ ['systemdrive' ]
63+ # could not be found from the environment path,
64+ # try to find it from system drive with default installation path
65+ default_nvidia_smi = "%s\\ Program Files\\ NVIDIA Corporation\\ NVSMI\\ nvidia-smi.exe" % os .environ ['systemdrive' ]
66+ if os .path .isfile (default_nvidia_smi ):
67+ self .nvidia_smi = default_nvidia_smi
6668 else :
6769 self .nvidia_smi = "nvidia-smi"
6870
69- try :
70- p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE )
71- xmlGpu , stdError = p .communicate ()
72-
73- smiTree = ET .fromstring (xmlGpu )
74- gpuTree = smiTree .find ('gpu' )
75-
76- try :
77- self .gpuMemoryTotal = gpuTree .find ('fb_memory_usage' ).find ('total' ).text .split (" " )[0 ]
78- except Exception as e :
79- logging .debug ('Failed to get gpuMemoryTotal: "{}".' .format (str (e )))
80- pass
81- try :
82- self .gpuName = gpuTree .find ('product_name' ).text
83- except Exception as e :
84- logging .debug ('Failed to get gpuName: "{}".' .format (str (e )))
85- pass
86-
87- except Exception as e :
88- logging .debug ('Failed to get information from nvidia_smi at init: "{}".' .format (str (e )))
89-
9071 def _addKV (self , k , v ):
9172 if isinstance (v , tuple ):
9273 for ki , vi in v ._asdict ().items ():
@@ -98,18 +79,23 @@ def _addKV(self, k, v):
9879 self .curves [k ].append (v )
9980
def update(self):
    """Record one sample of each machine-wide statistic.

    Runs the lazy initialisation, then feeds the CPU usage, RAM usage,
    swap usage, a constant ``0`` for ``'vramUsage'`` and the disk I/O
    counters to ``_addKV`` in that order, and finally calls
    ``updateGpu``.

    Sampling is best-effort: any exception raised along the way is
    logged at debug level and swallowed, and the steps that come after
    the failing one are skipped for this call.
    """
    # Curve name -> zero-argument sampler. The samplers are lambdas so
    # that nothing is evaluated until we are inside the ``try`` below.
    samplers = (
        # interval=None => non-blocking (percentage since last call)
        ('cpuUsage', lambda: psutil.cpu_percent(percpu=True)),
        ('ramUsage', lambda: psutil.virtual_memory().percent),
        ('swapUsage', lambda: psutil.swap_memory().percent),
        ('vramUsage', lambda: 0),
        ('ioCounters', lambda: psutil.disk_io_counters()),
    )

    try:
        self.initOnFirstTime()
        for curveName, sample in samplers:
            self._addKV(curveName, sample())
        self.updateGpu()
    except Exception as err:
        message = 'Failed to get statistics: "{}".'.format(str(err))
        logging.debug(message)
10892
10993 def updateGpu (self ):
94+ if not self .nvidia_smi :
95+ return
11096 try :
111- p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE )
112- xmlGpu , stdError = p .communicate ()
97+ p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE , stderr = subprocess . PIPE )
98+ xmlGpu , stdError = p .communicate (timeout = 10 ) # 10 seconds
11399
114100 smiTree = ET .fromstring (xmlGpu )
115101 gpuTree = smiTree .find ('gpu' )
@@ -129,7 +115,11 @@ def updateGpu(self):
129115 except Exception as e :
130116 logging .debug ('Failed to get gpuTemperature: "{}".' .format (str (e )))
131117 pass
132-
118+ except subprocess .TimeoutExpired as e :
119+ logging .debug ('Timeout when retrieving information from nvidia_smi: "{}".' .format (str (e )))
120+ p .kill ()
121+ outs , errs = p .communicate ()
122+ return
133123 except Exception as e :
134124 logging .debug ('Failed to get information from nvidia_smi: "{}".' .format (str (e )))
135125 return
@@ -201,15 +191,19 @@ def update(self, proc):
201191 data = proc .as_dict (self .dynamicKeys )
202192 for k , v in data .items ():
203193 self ._addKV (k , v )
204-
205- files = [f .path for f in proc .open_files ()]
206- if self .lastIterIndexWithFiles != - 1 :
207- if set (files ) != set (self .openFiles [self .lastIterIndexWithFiles ]):
208- self .openFiles [self .iterIndex ] = files
209- self .lastIterIndexWithFiles = self .iterIndex
210- elif files :
211- self .openFiles [self .iterIndex ] = files
212- self .lastIterIndexWithFiles = self .iterIndex
194+
195+ ## Note: Do not collect stats about open files for now,
196+ # as there is bug in psutil-5.7.2 on Windows which crashes the application.
197+ # https://github.com/giampaolo/psutil/issues/1763
198+ #
199+ # files = [f.path for f in proc.open_files()]
200+ # if self.lastIterIndexWithFiles != -1:
201+ # if set(files) != set(self.openFiles[self.lastIterIndexWithFiles]):
202+ # self.openFiles[self.iterIndex] = files
203+ # self.lastIterIndexWithFiles = self.iterIndex
204+ # elif files:
205+ # self.openFiles[self.iterIndex] = files
206+ # self.lastIterIndexWithFiles = self.iterIndex
213207 self .iterIndex += 1
214208
215209 def toDict (self ):
@@ -234,7 +228,7 @@ def __init__(self):
234228 self .computer = ComputerStatistics ()
235229 self .process = ProcStatistics ()
236230 self .times = []
237- self .interval = 5
231+ self .interval = 10 # refresh interval in seconds
238232
239233 def update (self , proc ):
240234 '''
0 commit comments