1313from drgn_tools .bt import bt
1414from drgn_tools .bt import bt_has_any
1515from drgn_tools .corelens import CorelensModule
16+ from drgn_tools .smp import _irq_enabled
1617from drgn_tools .table import print_table
1718from drgn_tools .task import task_lastrun2now
1819from drgn_tools .util import timestamp_str
@@ -116,40 +117,101 @@ def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None:
116117
117118
118119def scan_lockup (
119- prog : Program , min_run_time_seconds : int = 1 , skip_swapper : bool = True
120+ prog : Program , stack : bool , min_run_time_seconds : int = 1 , top_k : int = - 1 , skip_swapper : bool = True
120121) -> None :
121122 """
122- Scan potential lockups on cpus and tasks waiting for RCU.
123+ Scan potential lockups on CPUs and tasks waiting for RCU.
124+ Collect all long-running tasks, rank them by run time, and print top_k.
123125
124126 :param prog: drgn program
127+ :param stack: bool
125128 :param min_run_time_seconds: int
129+ :param top_k: int
126130 :param skip_swapper: bool
127131 """
128- nr_processes = 0
129- for cpus in for_each_online_cpu (prog ):
130- runqueue = per_cpu (prog ["runqueues" ], cpus )
132+ entries = []
133+
134+ for cpu in for_each_online_cpu (prog ):
135+ runqueue = per_cpu (prog ["runqueues" ], cpu )
131136 curr_task_addr = runqueue .curr .value_ ()
132137 curr_task = runqueue .curr [0 ]
133138 comm = escape_ascii_string (curr_task .comm .string_ ())
134139 pid = curr_task .pid .value_ ()
135- run_time = task_lastrun2now (curr_task )
136140 prio = curr_task .prio .value_ ()
141+ run_time = task_lastrun2now (curr_task ) # ns
137142 if run_time < min_run_time_seconds * 1e9 :
138143 continue
139- if skip_swapper and comm == f"swapper/{ cpus } " :
144+
145+ if skip_swapper and comm == f"swapper/{ cpu } " :
140146 continue
141- print (f"CPU { cpus } RUNQUEUE: { runqueue .address_of_ ().value_ ():x} " )
147+
148+ entries .append ({
149+ "cpu" : cpu ,
150+ "runqueue" : runqueue ,
151+ "task_addr" : curr_task_addr ,
152+ "task" : curr_task ,
153+ "comm" : comm ,
154+ "pid" : pid ,
155+ "prio" : prio ,
156+ "runtime" : run_time ,
157+ })
158+
159+ entries .sort (key = lambda x : x ["runtime" ], reverse = True )
160+
161+ # detect lockup type (soft or hard)
162+ longest_running_task = entries [0 ]
163+ irq_enabled = _irq_enabled (prog , longest_running_task ['cpu' ])
164+
165+ # convert ns -> s
166+ runtime_s = longest_running_task ["runtime" ] / 1e9
167+
168+ SOFT_LOCKUP_THRESHOLD = 20
169+ HARD_LOCKUP_THRESHOLD = 10
170+
171+ if irq_enabled and runtime_s > SOFT_LOCKUP_THRESHOLD :
172+ print (
173+ f"Soft lockup detected - CPU#{ longest_running_task ['cpu' ]} "
174+ f"stuck for { runtime_s :.0f} s! "
175+ f"[{ longest_running_task ['comm' ]} :{ longest_running_task ['pid' ]} ]"
176+ )
177+
178+ elif (not irq_enabled ) and runtime_s > HARD_LOCKUP_THRESHOLD :
179+ print (
180+ f"Hard lockup detected - CPU#{ longest_running_task ['cpu' ]} "
181+ f"stuck for { runtime_s :.0f} s! "
182+ f"[{ longest_running_task ['comm' ]} :{ longest_running_task ['pid' ]} ]"
183+ )
184+
185+ total_entries = len (entries )
186+
187+ if top_k > 0 :
188+ entries = entries [:top_k ]
189+
190+ # print results
191+ for e in entries :
192+ cpu = e ["cpu" ]
193+ runqueue = e ["runqueue" ]
194+ curr_task_addr = e ["task_addr" ]
195+ curr_task = e ["task" ]
196+ comm = e ["comm" ]
197+ pid = e ["pid" ]
198+ prio = e ["prio" ]
199+ run_time = e ["runtime" ]
200+
201+ print (f"CPU { cpu } RUNQUEUE: { runqueue .address_of_ ().value_ ():x} " )
142202 print (
143203 f" PID: { pid :<6d} TASK: { curr_task_addr :x} PRIO: { prio } "
144- f' COMMAND: "{ comm } "' ,
204+ f' COMMAND: "{ comm } "'
145205 f" LOCKUP TIME: { timestamp_str (run_time )} " ,
146206 )
147- print ( " \n Calltrace:" )
148- bt ( task_or_prog = curr_task . address_of_ () )
149- print ( )
150- nr_processes += 1
207+ if stack :
208+ print ( " \n Calltrace:" )
209+ bt ( task_or_prog = curr_task . address_of_ () )
210+ print ()
151211
152- print (f"We found { nr_processes } processes running more than { min_run_time_seconds } seconds" )
212+ print (f"We found { total_entries } processes exceeding { min_run_time_seconds } seconds." )
213+ if top_k > 0 :
214+ print (f"Longest { len (entries )} processes are printed above." )
153215
154216 dump_tasks_waiting_rcu_gp (prog , min_run_time_seconds )
155217 dump_tasks_waiting_spinlock (prog , min_run_time_seconds )
@@ -162,13 +224,25 @@ class LockUp(CorelensModule):
162224 name = "lockup"
163225
164226 def add_args (self , parser : argparse .ArgumentParser ) -> None :
227+ parser .add_argument (
228+ "--stack" ,
229+ action = "store_true" ,
230+ help = "Print the stacks."
231+ )
165232 parser .add_argument (
166233 "--time" ,
167234 "-t" ,
168235 type = float ,
169236 default = 2 ,
170237 help = "list all the processes that have been running more than <time> seconds" ,
171238 )
239+ parser .add_argument (
240+ "--top_k" ,
241+ "-k" ,
242+ type = int ,
243+ default = - 1 ,
244+ help = "list top k processes in terms of runtime" ,
245+ )
172246
173247 def run (self , prog : Program , args : argparse .Namespace ) -> None :
174- scan_lockup (prog , args .time )
248+ scan_lockup (prog , args .stack , args . time , args . top_k )
0 commit comments