Skip to content

Commit 18fb4d4

Browse files
committed
lockup: add lockup type detection and a flag to rank top k longest processes
Orabug: 38750252 Signed-off-by: Richard Li <[email protected]>
1 parent 674b22c commit 18fb4d4

File tree

1 file changed

+78
-12
lines changed

1 file changed

+78
-12
lines changed

drgn_tools/lockup.py

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from drgn_tools.bt import bt
1414
from drgn_tools.bt import bt_has_any
1515
from drgn_tools.corelens import CorelensModule
16+
from drgn_tools.smp import _irq_enabled
1617
from drgn_tools.table import print_table
1718
from drgn_tools.task import task_lastrun2now
1819
from drgn_tools.util import timestamp_str
@@ -116,40 +117,98 @@ def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None:
116117

117118

118119
def scan_lockup(
119-
prog: Program, min_run_time_seconds: int = 1, skip_swapper: bool = True
120+
prog: Program, min_run_time_seconds: int = 1, top_k: int = -1, skip_swapper: bool = True
120121
) -> None:
121122
"""
122-
Scan potential lockups on cpus and tasks waiting for RCU.
123+
Scan potential lockups on CPUs and tasks waiting for RCU.
124+
Collect all long-running tasks, rank them by run time, and print top_k.
123125
124126
:param prog: drgn program
125127
:param min_run_time_seconds: int
126128
:param skip_swapper: bool
127129
"""
128-
nr_processes = 0
129-
for cpus in for_each_online_cpu(prog):
130-
runqueue = per_cpu(prog["runqueues"], cpus)
130+
entries = []
131+
132+
for cpu in for_each_online_cpu(prog):
133+
runqueue = per_cpu(prog["runqueues"], cpu)
131134
curr_task_addr = runqueue.curr.value_()
132135
curr_task = runqueue.curr[0]
133136
comm = escape_ascii_string(curr_task.comm.string_())
134137
pid = curr_task.pid.value_()
135-
run_time = task_lastrun2now(curr_task)
136138
prio = curr_task.prio.value_()
139+
run_time = task_lastrun2now(curr_task) # ns
137140
if run_time < min_run_time_seconds * 1e9:
138141
continue
139-
if skip_swapper and comm == f"swapper/{cpus}":
142+
143+
if skip_swapper and comm == f"swapper/{cpu}":
140144
continue
141-
print(f"CPU {cpus} RUNQUEUE: {runqueue.address_of_().value_():x}")
145+
146+
entries.append({
147+
"cpu": cpu,
148+
"runqueue": runqueue,
149+
"task_addr": curr_task_addr,
150+
"task": curr_task,
151+
"comm": comm,
152+
"pid": pid,
153+
"prio": prio,
154+
"runtime": run_time,
155+
})
156+
157+
entries.sort(key=lambda x: x["runtime"], reverse=True)
158+
159+
# detect lockup type (soft or hard)
160+
longest_running_task = entries[0]
161+
irq_enabled = _irq_enabled(prog, longest_running_task['cpu'])
162+
163+
# convert ns -> s
164+
runtime_s = longest_running_task["runtime"] / 1e9
165+
166+
SOFT_LOCKUP_THRESHOLD = 20
167+
HARD_LOCKUP_THRESHOLD = 10
168+
169+
if irq_enabled and runtime_s > SOFT_LOCKUP_THRESHOLD:
170+
print(
171+
f"Soft lockup detected - CPU#{longest_running_task['cpu']} "
172+
f"stuck for {runtime_s:.0f}s! "
173+
f"[{longest_running_task['comm']}:{longest_running_task['pid']}]"
174+
)
175+
176+
elif (not irq_enabled) and runtime_s > HARD_LOCKUP_THRESHOLD:
177+
print(
178+
f"Hard lockup detected - CPU#{longest_running_task['cpu']} "
179+
f"stuck for {runtime_s:.0f}s! "
180+
f"[{longest_running_task['comm']}:{longest_running_task['pid']}]"
181+
)
182+
183+
total_entries = len(entries)
184+
185+
if top_k > 0:
186+
entries = entries[:top_k]
187+
188+
# print results
189+
for e in entries:
190+
cpu = e["cpu"]
191+
runqueue = e["runqueue"]
192+
curr_task_addr = e["task_addr"]
193+
curr_task = e["task"]
194+
comm = e["comm"]
195+
pid = e["pid"]
196+
prio = e["prio"]
197+
run_time = e["runtime"]
198+
199+
print(f"CPU {cpu} RUNQUEUE: {runqueue.address_of_().value_():x}")
142200
print(
143201
f" PID: {pid:<6d} TASK: {curr_task_addr:x} PRIO: {prio}"
144-
f' COMMAND: "{comm}"',
202+
f' COMMAND: "{comm}"'
145203
f" LOCKUP TIME: {timestamp_str(run_time)}",
146204
)
147205
print("\nCalltrace:")
148206
bt(task_or_prog=curr_task.address_of_())
149207
print()
150-
nr_processes += 1
151208

152-
print(f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds")
209+
print(f"We found {total_entries} processes exceeding {min_run_time_seconds} seconds.")
210+
if top_k > 0:
211+
print(f"Longest {len(entries)} processes are printed above.")
153212

154213
dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds)
155214
dump_tasks_waiting_spinlock(prog, min_run_time_seconds)
@@ -169,6 +228,13 @@ def add_args(self, parser: argparse.ArgumentParser) -> None:
169228
default=2,
170229
help="list all the processes that have been running more than <time> seconds",
171230
)
231+
parser.add_argument(
232+
"--top_k",
233+
"-k",
234+
type=int,
235+
default=10,
236+
help="list top k processes in terms of runtime",
237+
)
172238

173239
def run(self, prog: Program, args: argparse.Namespace) -> None:
174-
scan_lockup(prog, args.time)
240+
scan_lockup(prog, args.time, args.top_k)

0 commit comments

Comments
 (0)