Skip to content

Commit 9c1f362

Browse files
committed
zfsdist
1 parent bc54bb6 commit 9c1f362

File tree

4 files changed

+435
-0
lines changed

4 files changed

+435
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ Tools:
106106
- tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt).
107107
- tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency. [Examples](tools/xfsdist_example.txt).
108108
- tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt).
109+
- tools/[zfsdist](tools/zfsdist.py): Summarize ZFS operation latency. [Examples](tools/zfsdist_example.txt).
109110
- tools/[zfsslower](tools/zfsslower.py): Trace slow ZFS operations. [Examples](tools/zfsslower_example.txt).
110111

111112
### Networking

man/man8/zfsdist.8

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
.TH zfsdist 8 "2016-02-12" "USER COMMANDS"
2+
.SH NAME
3+
zfsdist \- Summarize ZFS operation latency. Uses Linux eBPF/bcc.
4+
.SH SYNOPSIS
5+
.B zfsdist [\-h] [\-T] [\-N] [\-d] [interval] [count]
6+
.SH DESCRIPTION
7+
This tool summarizes time (latency) spent in common ZFS file operations: reads,
8+
writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
9+
in-kernel eBPF map to store the histogram for efficiency.
10+
11+
This uses kernel dynamic tracing of the ZPL interface (ZFS POSIX
12+
Layer), and will need updates to match any changes to this interface.
13+
.TP
14+
This is intended to work with the ZFS on Linux project:
15+
http://zfsonlinux.org
16+
.PP
17+
Since this uses BPF, only the root user can use this tool.
18+
.SH REQUIREMENTS
19+
CONFIG_BPF and bcc.
20+
.SH OPTIONS
21+
.TP
22+
\-h
23+
Print usage message.
24+
.TP
25+
\-T
26+
Don't include timestamps on interval output.
27+
.TP
28+
\-m
29+
Output in milliseconds.
30+
.TP
31+
\-p PID
32+
Trace this PID only.
33+
.SH EXAMPLES
34+
.TP
35+
Trace ZFS operation time, and print a summary on Ctrl-C:
36+
#
37+
.B zfsdist
38+
.TP
39+
Trace PID 181 only:
40+
#
41+
.B zfsdist -p 181
42+
.TP
43+
Print 1 second summaries, 10 times:
44+
#
45+
.B zfsdist 1 10
46+
.TP
47+
1 second summaries, printed in milliseconds
48+
#
49+
.B zfsdist \-m 1
50+
.SH FIELDS
51+
.TP
52+
msecs
53+
Range of milliseconds for this bucket.
54+
.TP
55+
usecs
56+
Range of microseconds for this bucket.
57+
.TP
58+
count
59+
Number of operations in this time range.
60+
.TP
61+
distribution
62+
ASCII representation of the distribution (the count column).
63+
.SH OVERHEAD
64+
This adds low-overhead instrumentation to these ZFS operations,
65+
including reads and writes from the file system cache. Such reads and writes
66+
can be very frequent (depending on the workload; eg, 1M/sec), at which
67+
point the overhead of this tool may become noticeable.
68+
Measure and quantify before use.
69+
.SH SOURCE
70+
This is from bcc.
71+
.IP
72+
https://github.com/iovisor/bcc
73+
.PP
74+
Also look in the bcc distribution for a companion _examples.txt file containing
75+
example usage, output, and commentary for this tool.
76+
.SH OS
77+
Linux
78+
.SH STABILITY
79+
Unstable - in development.
80+
.SH AUTHOR
81+
Brendan Gregg
82+
.SH SEE ALSO
83+
zfssnoop(8)

tools/zfsdist.py

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#!/usr/bin/python
2+
# @lint-avoid-python-3-compatibility-imports
3+
#
4+
# zfsdist Summarize ZFS operation latency.
5+
# For Linux, uses BCC, eBPF.
6+
#
7+
# USAGE: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
8+
#
9+
# Copyright 2016 Netflix, Inc.
10+
# Licensed under the Apache License, Version 2.0 (the "License")
11+
#
12+
# 14-Feb-2016 Brendan Gregg Created this.
13+
14+
from __future__ import print_function
15+
from bcc import BPF
16+
from time import sleep, strftime
17+
import argparse
18+
19+
# arguments
20+
examples = """examples:
21+
./zfsdist # show operation latency as a histogram
22+
./zfsdist -p 181 # trace PID 181 only
23+
./zfsdist 1 10 # print 1 second summaries, 10 times
24+
./zfsdist -m 5 # 5s summaries, milliseconds
25+
"""
26+
parser = argparse.ArgumentParser(
27+
description="Summarize ZFS operation latency",
28+
formatter_class=argparse.RawDescriptionHelpFormatter,
29+
epilog=examples)
30+
parser.add_argument("-T", "--notimestamp", action="store_true",
31+
help="don't include timestamp on interval output")
32+
parser.add_argument("-m", "--milliseconds", action="store_true",
33+
help="output in milliseconds")
34+
parser.add_argument("-p", "--pid",
35+
help="trace this PID only")
36+
parser.add_argument("interval", nargs="?",
37+
help="output interval, in seconds")
38+
parser.add_argument("count", nargs="?", default=99999999,
39+
help="number of outputs")
40+
args = parser.parse_args()
41+
pid = args.pid
42+
countdown = int(args.count)
43+
if args.milliseconds:
44+
factor = 1000000
45+
label = "msecs"
46+
else:
47+
factor = 1000
48+
label = "usecs"
49+
if args.interval and int(args.interval) == 0:
50+
print("ERROR: interval 0. Exiting.")
51+
exit()
52+
debug = 0
53+
54+
# define BPF program
55+
bpf_text = """
56+
#include <uapi/linux/ptrace.h>
57+
#include <linux/fs.h>
58+
#include <linux/sched.h>
59+
60+
#define OP_NAME_LEN 8
61+
typedef struct dist_key {
62+
char op[OP_NAME_LEN];
63+
u64 slot;
64+
} dist_key_t;
65+
BPF_HASH(start, u32);
66+
BPF_HISTOGRAM(dist, dist_key_t);
67+
68+
// time operation
69+
int trace_entry(struct pt_regs *ctx)
70+
{
71+
u32 pid = bpf_get_current_pid_tgid();
72+
if (FILTER_PID)
73+
return 0;
74+
u64 ts = bpf_ktime_get_ns();
75+
start.update(&pid, &ts);
76+
return 0;
77+
}
78+
79+
static int trace_return(struct pt_regs *ctx, const char *op)
80+
{
81+
u64 *tsp;
82+
u32 pid = bpf_get_current_pid_tgid();
83+
84+
// fetch timestamp and calculate delta
85+
tsp = start.lookup(&pid);
86+
if (tsp == 0) {
87+
return 0; // missed start or filtered
88+
}
89+
u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
90+
91+
// store as histogram
92+
dist_key_t key = {.slot = bpf_log2l(delta)};
93+
__builtin_memcpy(&key.op, op, sizeof(key.op));
94+
dist.increment(key);
95+
96+
start.delete(&pid);
97+
return 0;
98+
}
99+
100+
int trace_read_return(struct pt_regs *ctx)
101+
{
102+
char *op = "read";
103+
return trace_return(ctx, op);
104+
}
105+
106+
int trace_write_return(struct pt_regs *ctx)
107+
{
108+
char *op = "write";
109+
return trace_return(ctx, op);
110+
}
111+
112+
int trace_open_return(struct pt_regs *ctx)
113+
{
114+
char *op = "open";
115+
return trace_return(ctx, op);
116+
}
117+
118+
int trace_fsync_return(struct pt_regs *ctx)
119+
{
120+
char *op = "fsync";
121+
return trace_return(ctx, op);
122+
}
123+
"""
124+
bpf_text = bpf_text.replace('FACTOR', str(factor))
125+
if args.pid:
126+
bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
127+
else:
128+
bpf_text = bpf_text.replace('FILTER_PID', '0')
129+
if debug:
130+
print(bpf_text)
131+
132+
# load BPF program
133+
b = BPF(text=bpf_text)
134+
135+
# common file functions
136+
b.attach_kprobe(event="zpl_read", fn_name="trace_entry")
137+
b.attach_kprobe(event="zpl_write", fn_name="trace_entry")
138+
b.attach_kprobe(event="zpl_open", fn_name="trace_entry")
139+
b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry")
140+
b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return")
141+
b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return")
142+
b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return")
143+
b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return")
144+
145+
print("Tracing ZFS operation latency... Hit Ctrl-C to end.")
146+
147+
# output
148+
exiting = 0
149+
dist = b.get_table("dist")
150+
while (1):
151+
try:
152+
if args.interval:
153+
sleep(int(args.interval))
154+
else:
155+
sleep(99999999)
156+
except KeyboardInterrupt:
157+
exiting = 1
158+
159+
print()
160+
if args.interval and (not args.notimestamp):
161+
print(strftime("%H:%M:%S:"))
162+
163+
dist.print_log2_hist(label, "operation")
164+
dist.clear()
165+
166+
countdown -= 1
167+
if exiting or countdown == 0:
168+
exit()

0 commit comments

Comments
 (0)