1313from drgn_tools .bt import bt
1414from drgn_tools .bt import bt_has_any
1515from drgn_tools .corelens import CorelensModule
16+ from drgn_tools .smp import _irq_enabled
1617from drgn_tools .table import print_table
1718from drgn_tools .task import task_lastrun2now
1819from drgn_tools .util import timestamp_str
@@ -116,40 +117,98 @@ def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None:
116117
117118
118119def scan_lockup (
119- prog : Program , min_run_time_seconds : int = 1 , skip_swapper : bool = True
120+ prog : Program , min_run_time_seconds : int = 1 , top_k : int = - 1 , skip_swapper : bool = True
120121) -> None :
121122 """
122- Scan potential lockups on cpus and tasks waiting for RCU.
123+ Scan potential lockups on CPUs and tasks waiting for RCU.
124+ Collect all long-running tasks, rank them by run time, and print top_k.
123125
124126 :param prog: drgn program
125127 :param min_run_time_seconds: int
126128 :param skip_swapper: bool
127129 """
128- nr_processes = 0
129- for cpus in for_each_online_cpu (prog ):
130- runqueue = per_cpu (prog ["runqueues" ], cpus )
130+ entries = []
131+
132+ for cpu in for_each_online_cpu (prog ):
133+ runqueue = per_cpu (prog ["runqueues" ], cpu )
131134 curr_task_addr = runqueue .curr .value_ ()
132135 curr_task = runqueue .curr [0 ]
133136 comm = escape_ascii_string (curr_task .comm .string_ ())
134137 pid = curr_task .pid .value_ ()
135- run_time = task_lastrun2now (curr_task )
136138 prio = curr_task .prio .value_ ()
139+ run_time = task_lastrun2now (curr_task ) # ns
137140 if run_time < min_run_time_seconds * 1e9 :
138141 continue
139- if skip_swapper and comm == f"swapper/{ cpus } " :
142+
143+ if skip_swapper and comm == f"swapper/{ cpu } " :
140144 continue
141- print (f"CPU { cpus } RUNQUEUE: { runqueue .address_of_ ().value_ ():x} " )
145+
146+ entries .append ({
147+ "cpu" : cpu ,
148+ "runqueue" : runqueue ,
149+ "task_addr" : curr_task_addr ,
150+ "task" : curr_task ,
151+ "comm" : comm ,
152+ "pid" : pid ,
153+ "prio" : prio ,
154+ "runtime" : run_time ,
155+ })
156+
157+ entries .sort (key = lambda x : x ["runtime" ], reverse = True )
158+
159+ # detect lockup type (soft or hard)
160+ longest_running_task = entries [0 ]
161+ irq_enabled = _irq_enabled (prog , longest_running_task ['cpu' ])
162+
163+ # convert ns -> s
164+ runtime_s = longest_running_task ["runtime" ] / 1e9
165+
166+ SOFT_LOCKUP_THRESHOLD = 20
167+ HARD_LOCKUP_THRESHOLD = 10
168+
169+ if irq_enabled and runtime_s > SOFT_LOCKUP_THRESHOLD :
170+ print (
171+ f"Soft lockup detected - CPU#{ longest_running_task ['cpu' ]} "
172+ f"stuck for { runtime_s :.0f} s! "
173+ f"[{ longest_running_task ['comm' ]} :{ longest_running_task ['pid' ]} ]"
174+ )
175+
176+ elif (not irq_enabled ) and runtime_s > HARD_LOCKUP_THRESHOLD :
177+ print (
178+ f"Hard lockup detected - CPU#{ longest_running_task ['cpu' ]} "
179+ f"stuck for { runtime_s :.0f} s! "
180+ f"[{ longest_running_task ['comm' ]} :{ longest_running_task ['pid' ]} ]"
181+ )
182+
183+ total_entries = len (entries )
184+
185+ if top_k > 0 :
186+ entries = entries [:top_k ]
187+
188+ # print results
189+ for e in entries :
190+ cpu = e ["cpu" ]
191+ runqueue = e ["runqueue" ]
192+ curr_task_addr = e ["task_addr" ]
193+ curr_task = e ["task" ]
194+ comm = e ["comm" ]
195+ pid = e ["pid" ]
196+ prio = e ["prio" ]
197+ run_time = e ["runtime" ]
198+
199+ print (f"CPU { cpu } RUNQUEUE: { runqueue .address_of_ ().value_ ():x} " )
142200 print (
143201 f" PID: { pid :<6d} TASK: { curr_task_addr :x} PRIO: { prio } "
144- f' COMMAND: "{ comm } "' ,
202+ f' COMMAND: "{ comm } "'
145203 f" LOCKUP TIME: { timestamp_str (run_time )} " ,
146204 )
147205 print ("\n Calltrace:" )
148206 bt (task_or_prog = curr_task .address_of_ ())
149207 print ()
150- nr_processes += 1
151208
152- print (f"We found { nr_processes } processes running more than { min_run_time_seconds } seconds" )
209+ print (f"We found { total_entries } processes exceeding { min_run_time_seconds } seconds." )
210+ if top_k > 0 :
211+ print (f"Longest { len (entries )} processes are printed above." )
153212
154213 dump_tasks_waiting_rcu_gp (prog , min_run_time_seconds )
155214 dump_tasks_waiting_spinlock (prog , min_run_time_seconds )
@@ -169,6 +228,13 @@ def add_args(self, parser: argparse.ArgumentParser) -> None:
169228 default = 2 ,
170229 help = "list all the processes that have been running more than <time> seconds" ,
171230 )
231+ parser .add_argument (
232+ "--top_k" ,
233+ "-k" ,
234+ type = int ,
235+ default = 10 ,
236+ help = "list top k processes in terms of runtime" ,
237+ )
172238
173239 def run (self , prog : Program , args : argparse .Namespace ) -> None :
174- scan_lockup (prog , args .time )
240+ scan_lockup (prog , args .time , args . top_k )
0 commit comments