@@ -499,6 +499,14 @@ def run(opts: CliOpts, image: str) -> int:
499499 # Safety check: ensure all tests can fit in available RAM
500500 available_ram_gb = get_available_memory_gb ()
501501 usable_ram_gb = available_ram_gb - SYSTEM_RESERVE_GB
502+ cpu_count = os .cpu_count ()
503+ if cpu_count is None :
504+ raise RuntimeError ("Cannot determine CPU count (os.cpu_count() returned None)" )
505+ max_load = cpu_count * 1.0
506+ current_load = os .getloadavg ()[0 ]
507+ logging .info ("System: %d CPUs, load %.2f, %.1f GB RAM total, %.1f GB reserved, %.1f GB usable, %d ND tests, %d destructive tests" ,
508+ cpu_count , current_load , available_ram_gb , SYSTEM_RESERVE_GB , usable_ram_gb ,
509+ nondestructive_tests_len , destructive_tests_len )
502510 all_tests = nondestructive_tests + destructive_tests
503511 if all_tests :
504512 max_test_cost = max (t .cost for t in all_tests )
@@ -619,26 +627,31 @@ def run(opts: CliOpts, image: str) -> int:
619627 return sum (len (getattr (t .__class__ , "provision" , None ) or {"machine" : {}})
620628 for t in running_tests if not t .nondestructive )
621629
622- # Start destructive tests if we have capacity based on available RAM
630+ # Start destructive tests if we have capacity based on available RAM and system load
623631 while destructive_tests :
624632 available_ram_gb = get_available_memory_gb ()
633+ current_load = os .getloadavg ()[0 ]
625634 next_test = destructive_tests [0 ]
626635 current_cost = running_cost ()
627636 usable_ram = available_ram_gb - SYSTEM_RESERVE_GB
628637 free_ram = usable_ram - current_cost
629638 num_vms = count_running_vms ()
630639
631- logging .info ("Scheduler: %.1f GB avail , %.1f GB reserved , %.1f GB usable , %.1f GB used by %d tests , "
632- "%.1f GB free, %d VMs running, next test %s needs %.1f GB" ,
633- available_ram_gb , SYSTEM_RESERVE_GB , usable_ram , current_cost ,
640+ logging .info ("Scheduler: load %.2f (max %.1f) , %.1f GB avail , %.1f GB reserved , %.1f GB usable , "
641+ "%.1f GB used by %d tests, %.1f GB free, %d VMs running, next test %s needs %.1f GB" ,
642+ current_load , max_load , available_ram_gb , SYSTEM_RESERVE_GB , usable_ram , current_cost ,
634643 len (running_tests ), free_ram , num_vms , next_test , next_test .cost )
635644
645+ # Check system load first - high load means CPU starvation
646+ if current_load > max_load :
647+ logging .info ("NOT starting %s: high load (%.2f > %.1f max)" , next_test , current_load , max_load )
648+ break
636649 # Check available RAM
637650 if current_cost + next_test .cost > usable_ram :
638651 logging .info ("NOT starting %s: insufficient RAM (would use %.1f GB, only %.1f GB usable)" ,
639652 next_test , current_cost + next_test .cost , usable_ram )
640653 break
641- # RAM check passed, start the test
654+ # Load and RAM checks passed, start the test
642655 test = destructive_tests .pop (0 )
643656 logging .info ("STARTING %s (%.1f GB), will have %d tests using %.1f GB total" ,
644657 test , test .cost , len (running_tests ) + 1 , current_cost + test .cost )
0 commit comments