Skip to content

Commit 01c0617

Browse files
committed
Add a hang detector
Orabug: 38750304 Signed-off-by: Richard Li <[email protected]>
1 parent ad09667 commit 01c0617

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

drgn_tools/hang.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Copyright (c) 2025, Oracle and/or its affiliates.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
3+
"""
4+
Help detect hang issues
5+
"""
6+
import argparse
7+
8+
from drgn import Program
9+
from drgn.helpers.common import escape_ascii_string
10+
from drgn.helpers.linux.pid import for_each_task
11+
from drgn.helpers.linux.sched import task_state_to_char
12+
13+
from drgn_tools.bt import bt
14+
from drgn_tools.corelens import CorelensModule
15+
from drgn_tools.task import task_lastrun2now
16+
from drgn_tools.util import timestamp_str
17+
18+
19+
def for_each_tasks_in_d_state(prog: Program):
    """
    Iterate over the tasks whose state character is ``"D"``.

    Every task produced by ``for_each_task(prog)`` is classified with
    ``task_state_to_char``; only those reported as ``"D"`` are yielded.

    :param prog: drgn program
    :returns: generator of the matching task objects
    """
    for candidate in for_each_task(prog):
        if task_state_to_char(candidate) == "D":
            yield candidate
25+
26+
27+
def detect_hang(prog: Program, stack: bool, time: float) -> None:
    """
    Scan hung tasks.

    Collects every task in the D state, orders them by the value returned
    by ``task_lastrun2now`` (longest first) and prints one line per task
    whose value, converted to seconds, is strictly greater than ``time``.
    A summary and the backtrace of the longest-hung task are always
    printed at the end. If there is no task in the D state, a single
    message is printed and nothing else is done.

    :param prog: drgn program
    :param stack: when true, also print the backtrace of every task that
      is listed (not only the longest-hung one)
    :param time: threshold in seconds; may be an int or a float (the
      command line passes a float)
    """
    # Measure every task exactly once. The same value is then used for
    # ordering, for filtering and for the summary, so the three can never
    # disagree, and target memory is not re-read for each use.
    timed_tasks = [
        (task_lastrun2now(task), task)
        for task in for_each_tasks_in_d_state(prog)
    ]
    if not timed_tasks:
        print("There is no tasks in D state.")
        return

    # Sort on the duration only (never compare task objects). list.sort()
    # is stable, so tasks with equal durations keep their iteration order.
    timed_tasks.sort(key=lambda entry: entry[0], reverse=True)
    longest_hang_ns, longest_hang_task = timed_tasks[0]
    # task_lastrun2now() values are treated as nanoseconds throughout.
    longest_hang_time = longest_hang_ns / 1e9

    n_hung_tasks = 0
    for run_time, task in timed_tasks:
        # Durations are in descending order: once one task fails the
        # threshold, every remaining task fails it too.
        if not (run_time / 1e9 > time):
            break
        n_hung_tasks += 1
        comm = escape_ascii_string(task.comm.string_())
        pid = task.pid.value_()
        prio = task.prio.value_()
        print(
            f"PID: {pid:<6d} TASK: {task.value_():x} PRIO: {prio}"
            f' COMMAND: "{comm}"'
            f" HUNG TIME: {timestamp_str(run_time)}",
        )
        if stack:
            print("Calltrace:")
            bt(task)
            print()

    print(
        f"There are {n_hung_tasks} tasks hung (in D state) for more than {time} seconds as above."
    )
    print(
        f"The longest hung task as below has remained in the D state for {longest_hang_time:.2f} seconds."
    )
    bt(longest_hang_task)
68+
69+
70+
class Hang(CorelensModule):
    """Corelens module that reports tasks hung in the D state."""

    name = "hang"

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        """Register the ``--stack`` flag and the ``--time``/``-t`` threshold."""
        parser.add_argument(
            "--stack",
            help="Print the stacks. Only the stack of longest hung task is printed if not set.",
            action="store_true",
        )
        parser.add_argument(
            "--time",
            "-t",
            help="list all the processes that have been hung more than <time> seconds",
            default=10,
            type=float,
        )

    def run(self, prog: Program, args: argparse.Namespace) -> None:
        """Run the hang scan using the parsed command-line options."""
        show_stacks = args.stack
        threshold = args.time
        detect_hang(prog, show_stacks, threshold)

tests/test_hang.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) 2025, Oracle and/or its affiliates.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
3+
from drgn_tools import hang
4+
5+
6+
def test_hang(prog):
    """Smoke test: the hang scan runs to completion with stacks enabled."""
    hang.detect_hang(prog, time=10, stack=True)

0 commit comments

Comments
 (0)