Skip to content

Commit 49e4531

Browse files
committed
lockup: add lockup type detection and additional flags
Orabug: 38750252 Signed-off-by: Richard Li <[email protected]>
1 parent 674b22c commit 49e4531

File tree

2 files changed

+90
-16
lines changed

2 files changed

+90
-16
lines changed

drgn_tools/lockup.py

Lines changed: 89 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from drgn_tools.bt import bt
1414
from drgn_tools.bt import bt_has_any
1515
from drgn_tools.corelens import CorelensModule
16+
from drgn_tools.smp import _irq_enabled
1617
from drgn_tools.table import print_table
1718
from drgn_tools.task import task_lastrun2now
1819
from drgn_tools.util import timestamp_str
@@ -116,40 +117,101 @@ def dump_tasks_waiting_fsnotify(prog, min_run_time_seconds: int) -> None:
116117

117118

118119
def scan_lockup(
119-
prog: Program, min_run_time_seconds: int = 1, skip_swapper: bool = True
120+
prog: Program, stack: bool, min_run_time_seconds: int = 1, top_k: int = -1, skip_swapper: bool = True
120121
) -> None:
121122
"""
122-
Scan potential lockups on cpus and tasks waiting for RCU.
123+
Scan potential lockups on CPUs and tasks waiting for RCU.
124+
Collect all long-running tasks, rank them by run time, and print top_k.
123125
124126
:param prog: drgn program
127+
:param stack: bool
125128
:param min_run_time_seconds: int
129+
:param top_k: int
126130
:param skip_swapper: bool
127131
"""
128-
nr_processes = 0
129-
for cpus in for_each_online_cpu(prog):
130-
runqueue = per_cpu(prog["runqueues"], cpus)
132+
entries = []
133+
134+
for cpu in for_each_online_cpu(prog):
135+
runqueue = per_cpu(prog["runqueues"], cpu)
131136
curr_task_addr = runqueue.curr.value_()
132137
curr_task = runqueue.curr[0]
133138
comm = escape_ascii_string(curr_task.comm.string_())
134139
pid = curr_task.pid.value_()
135-
run_time = task_lastrun2now(curr_task)
136140
prio = curr_task.prio.value_()
141+
run_time = task_lastrun2now(curr_task) # ns
137142
if run_time < min_run_time_seconds * 1e9:
138143
continue
139-
if skip_swapper and comm == f"swapper/{cpus}":
144+
145+
if skip_swapper and comm == f"swapper/{cpu}":
140146
continue
141-
print(f"CPU {cpus} RUNQUEUE: {runqueue.address_of_().value_():x}")
147+
148+
entries.append({
149+
"cpu": cpu,
150+
"runqueue": runqueue,
151+
"task_addr": curr_task_addr,
152+
"task": curr_task,
153+
"comm": comm,
154+
"pid": pid,
155+
"prio": prio,
156+
"runtime": run_time,
157+
})
158+
159+
entries.sort(key=lambda x: x["runtime"], reverse=True)
160+
161+
# detect lockup type (soft or hard)
162+
longest_running_task = entries[0]
163+
irq_enabled = _irq_enabled(prog, longest_running_task['cpu'])
164+
165+
# convert ns -> s
166+
runtime_s = longest_running_task["runtime"] / 1e9
167+
168+
SOFT_LOCKUP_THRESHOLD = 20
169+
HARD_LOCKUP_THRESHOLD = 10
170+
171+
if irq_enabled and runtime_s > SOFT_LOCKUP_THRESHOLD:
172+
print(
173+
f"Soft lockup detected - CPU#{longest_running_task['cpu']} "
174+
f"stuck for {runtime_s:.0f}s! "
175+
f"[{longest_running_task['comm']}:{longest_running_task['pid']}]"
176+
)
177+
178+
elif (not irq_enabled) and runtime_s > HARD_LOCKUP_THRESHOLD:
179+
print(
180+
f"Hard lockup detected - CPU#{longest_running_task['cpu']} "
181+
f"stuck for {runtime_s:.0f}s! "
182+
f"[{longest_running_task['comm']}:{longest_running_task['pid']}]"
183+
)
184+
185+
total_entries = len(entries)
186+
187+
if top_k > 0:
188+
entries = entries[:top_k]
189+
190+
# print results
191+
for e in entries:
192+
cpu = e["cpu"]
193+
runqueue = e["runqueue"]
194+
curr_task_addr = e["task_addr"]
195+
curr_task = e["task"]
196+
comm = e["comm"]
197+
pid = e["pid"]
198+
prio = e["prio"]
199+
run_time = e["runtime"]
200+
201+
print(f"CPU {cpu} RUNQUEUE: {runqueue.address_of_().value_():x}")
142202
print(
143203
f" PID: {pid:<6d} TASK: {curr_task_addr:x} PRIO: {prio}"
144-
f' COMMAND: "{comm}"',
204+
f' COMMAND: "{comm}"'
145205
f" LOCKUP TIME: {timestamp_str(run_time)}",
146206
)
147-
print("\nCalltrace:")
148-
bt(task_or_prog=curr_task.address_of_())
149-
print()
150-
nr_processes += 1
207+
if stack:
208+
print("\nCalltrace:")
209+
bt(task_or_prog=curr_task.address_of_())
210+
print()
151211

152-
print(f"We found {nr_processes} processes running more than {min_run_time_seconds} seconds")
212+
print(f"We found {total_entries} processes exceeding {min_run_time_seconds} seconds.")
213+
if top_k > 0:
214+
print(f"Longest {len(entries)} processes are printed above.")
153215

154216
dump_tasks_waiting_rcu_gp(prog, min_run_time_seconds)
155217
dump_tasks_waiting_spinlock(prog, min_run_time_seconds)
@@ -162,13 +224,25 @@ class LockUp(CorelensModule):
162224
name = "lockup"
163225

164226
def add_args(self, parser: argparse.ArgumentParser) -> None:
227+
parser.add_argument(
228+
"--stack",
229+
action="store_true",
230+
help="Print the stacks."
231+
)
165232
parser.add_argument(
166233
"--time",
167234
"-t",
168235
type=float,
169236
default=2,
170237
help="list all the processes that have been running more than <time> seconds",
171238
)
239+
parser.add_argument(
240+
"--top_k",
241+
"-k",
242+
type=int,
243+
default=-1,
244+
help="list top k processes in terms of runtime",
245+
)
172246

173247
def run(self, prog: Program, args: argparse.Namespace) -> None:
174-
scan_lockup(prog, args.time)
248+
scan_lockup(prog, args.stack, args.time, args.top_k)

tests/test_lockup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44

55

66
def test_lockup(prog):
7-
lockup.scan_lockup(prog)
7+
lockup.scan_lockup(prog, stack=True)

0 commit comments

Comments
 (0)