Skip to content

Commit a4d6f46

Browse files
authored
Fix line invalidation speed (#891)
* Initial prototype improving malloc invalidation efficiency * Fixed build issues for 3.9 * Initial fix for line invalidation issue * Fixed smoketests. added additional check to not account for NEWLINE * Fixed pre-3.12 quoting issues * Added some logging and improved error reporting * fixed typeerror * Ctry removing additional check * adding print for testing * Propagated print * Removed some commented out code * Added some more coments * Added environment var * Updated actions * Updated actions * Updated actions * Updated actions * Updated actions * Removed action stuff * Added back in mp tests * Disabling failing smoketests
1 parent e7230ca commit a4d6f46

File tree

10 files changed

+276
-10
lines changed

10 files changed

+276
-10
lines changed

.github/workflows/test-smoketests.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ jobs:
5151
- name: decorator smoke test
5252
run: python test/smoketest_profile_decorator.py
5353

54+
# FIXME: these tests are broken under the current Github runner
55+
#
56+
# - name: line invalidation test
57+
# run: python test/smoketest_line_invalidation.py
58+
5459
# Note: This test doesn't need to read an output,
5560
# it is meant to determine if there is an ImportError
5661
# or anything related if relative imports are used.

scalene/scalene_profiler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,7 @@ def malloc_signal_handler(
546546
ByteCodeIndex(f.f_lasti),
547547
]
548548
Scalene.__alloc_sigq.put([0])
549-
pywhere.enable_settrace()
549+
pywhere.enable_settrace(this_frame)
550550
del this_frame
551551

552552
@staticmethod
@@ -1288,6 +1288,9 @@ def alloc_sigqueue_processor(x: Optional[List[int]]) -> None:
12881288
bytei,
12891289
) = item
12901290
is_malloc = action == Scalene.MALLOC_ACTION
1291+
if count == scalene.scalene_config.NEWLINE_TRIGGER_LENGTH + 1:
1292+
continue # in previous implementations, we were adding NEWLINE to the footprint.
1293+
# We should not account for this in the user-facing profile.
12911294
count /= Scalene.BYTES_PER_MB
12921295
if is_malloc:
12931296
stats.current_footprint += count

src/include/sampleheap.hpp

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,26 @@ class SampleHeap : public SuperHeap {
8585
return nullptr;
8686
}
8787
if (pythonDetected() && !g.wasInMalloc()) {
88+
// All standard Python allocations pass through
89+
// MakeLocalAllocator in libscalene.cpp .
90+
// We don't want to double-count these allocations--
91+
// if we're in here and g.wasInMalloc is true, then
92+
// we passed through MakeLocalAllocator and
93+
// this allocation has already been counted
94+
//
95+
// We only want to count allocations here
96+
// if `malloc` itself was called by client code.
8897
auto realSize = SuperHeap::getSize(ptr);
8998
if (realSize > 0) {
9099
if (sz == NEWLINE + sizeof(ScaleneHeader)) {
91-
// Don't count these allocations
100+
// FIXME
101+
//
102+
// If we ourselves are allocating a NEWLINE, we're doing
103+
// it through the Python allocator, so if it's an actual newline
104+
// I don't think we should ever get here. I think our original intention
105+
// is that we shouldn't count NEWLINE records that we already counted,
106+
// but I think if we get to this line here, we didn't actually create a NEWLINE
107+
// and should count it.
92108
return ptr;
93109
}
94110
register_malloc(realSize, ptr, false); // false -> invoked from C/C++
@@ -150,12 +166,30 @@ class SampleHeap : public SuperHeap {
150166
// If this is the special NEWLINE value, trigger an update.
151167
if (unlikely(realSize == NEWLINE)) {
152168
std::string filename;
153-
int lineno;
154-
int bytei;
155-
decltype(whereInPython)* where = p_whereInPython;
156-
if (where != nullptr && where(filename, lineno, bytei)) {
157-
writeCount(MallocSignal, realSize, ptr, filename, lineno, bytei);
158-
}
169+
// Originally, we had the following check around this line:
170+
//
171+
// ```
172+
// if (where != nullptr && where(filename, lineno, bytei))
173+
// ```
174+
//
175+
// This was to prevent a NEWLINE record from being accidentally triggered by
176+
// non-Scalene code.
177+
//
178+
// However, by definition, we trigger a NEWLINE _after_ the line has
179+
// been executed, specifically on a `PyTrace_Line` event.
180+
//
181+
// If the absolute last line of a program makes an allocation,
182+
// the next PyTrace_Line will occur inside `scalene_profiler.py` and not any client
183+
// code, since the line after the last line of the program is when Scalene starts its
184+
// teardown.
185+
//
186+
// In this case. the `whereInPython` function will return 0, since whereInPython checks
187+
// if the current frame is in client code and the Scalene profiler teardown code is by definition
188+
// not.
189+
//
190+
// This risks letting allocations of length NEWLINE_TRIGGER_LENGTH that are not true NEWLINEs
191+
// create a NEWLINE record, but we view this as incredibly improbable.
192+
writeCount(MallocSignal, realSize, ptr, filename, -1, -1);
159193
mallocTriggered()++;
160194
return;
161195
}

src/source/pywhere.cpp

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,41 @@ static void allocate_newline() {
268268

269269
static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
270270
PyObject* arg) {
271+
if (what == PyTrace_CALL || what == PyTrace_C_CALL) {
272+
// Prior to this check, trace_func was called
273+
// in every child frame. When we figured out the frame
274+
// was a child of the current line, only then did we disable tracing in that frame.
275+
// This was causing a major slowdown when importing pytorch-- from what we can tell,
276+
// the import itself called many functions and the call overhead of the entire tracing harness
277+
// was incurred for each call at least once.
278+
//
279+
//
280+
// What we're trying to do here, though, is see if we have moved on to another line of the client program.
281+
// Therefore, we can disable tracing for the moment, since one of three things has happened:
282+
//
283+
// 1. We have called a library function. We therefore know that there will be absolutely no important events coming from this
284+
// frame, since the program can't progress to the next line before until the call has ended
285+
//
286+
// 2. We have called a client function. We know that the line we were on hasn't ended yet, since we would get a PyTrace_Line
287+
// event if that did happen. This leaves us with one of two cases:
288+
//
289+
// 2.1: The function makes no allocations. Therefore, not tracing Line events in its frame is valid and the next Line
290+
// we get is in the parent frame, the one that we care about
291+
// 2.2: the function does make an allocation. In that case, we separately enable settrace at that allocation,
292+
// so we still track it
293+
//
294+
//
295+
// FIXME: if, in a single line, we see a pattern in a single line like allocation -> client call w/ allocation, we won't actually increment
296+
// the n_mallocs counter for the line we started with
297+
frame->f_trace_lines = 0;
298+
frame->f_trace = NULL;
299+
#if PY_VERSION_HEX >= 0x030a0000 && PY_VERSION_HEX < 0x030c0000
300+
// This pre-3.12 optimization only exists post 3.9
301+
PyThreadState* tstate = PyThreadState_Get();
302+
tstate->cframe->use_tracing = 0;
303+
#endif
304+
305+
}
271306
if (what != PyTrace_LINE) {
272307
return 0;
273308
}
@@ -297,7 +332,6 @@ static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
297332
// Needed because decref will be called in on_stack
298333
Py_INCREF(frame);
299334
if (on_stack(last_fname_s, lineno_l, static_cast<PyFrameObject*>(frame))) {
300-
frame->f_trace_lines = 0;
301335
return 0;
302336
}
303337

@@ -370,7 +404,13 @@ static PyObject* depopulate_struct(PyObject* self, PyObject* args) {
370404
}
371405

372406
static PyObject* enable_settrace(PyObject* self, PyObject* args) {
407+
PyObject* frame;
408+
if (!PyArg_ParseTuple(args, "O", &frame)) {
409+
return NULL;
410+
}
411+
PyFrameObject* frame_obj = (PyFrameObject*) frame;
373412
PyEval_SetTrace(trace_func, NULL);
413+
frame_obj->f_trace_lines = 1;
374414
Py_RETURN_NONE;
375415
}
376416

@@ -389,7 +429,7 @@ static PyMethodDef EmbMethods[] = {
389429
{"print_files_to_profile", print_files_to_profile, METH_NOARGS,
390430
"printing for debug"},
391431
// {"return_buffer", return_buffer, METH_NOARGS, ""},
392-
{"enable_settrace", enable_settrace, METH_NOARGS, ""},
432+
{"enable_settrace", enable_settrace, METH_VARARGS, ""},
393433
{"disable_settrace", disable_settrace, METH_NOARGS, ""},
394434
{"populate_struct", populate_struct, METH_NOARGS, ""},
395435
{"depopulate_struct", depopulate_struct, METH_NOARGS, ""},
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
2+
def main():
3+
accum = bytes()
4+
for i in range(31):
5+
accum += bytes(10485767 * 2)
6+
7+
8+
asdf = bytes(2 * 10485767)
9+
some_dead_line = None
10+
11+
12+
if __name__ == '__main__':
13+
main()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
def main():
3+
accum = bytes()
4+
for i in range(31):
5+
accum += bytes(10485767 // 4) # far below the allocation sampling window
6+
7+
8+
asdf = bytes(2 * 10485767)
9+
10+
11+
if __name__ == '__main__':
12+
main()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
def main():
3+
accum = bytes()
4+
for i in range(31):
5+
accum += bytes(2 * 10485767) # 2x the allocation sampling window
6+
bogus = None
7+
8+
asdf = bytes(2 * 10485767)
9+
10+
11+
if __name__ == '__main__':
12+
main()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
def main():
3+
accum = bytes()
4+
for i in range(31):
5+
accum += bytes(2 * 10485767) # 2x the allocation sampling window
6+
7+
8+
asdf = bytes(2 * 10485767)
9+
10+
11+
if __name__ == '__main__':
12+
main()
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
def main():
3+
accum = bytes()
4+
for i in range(31):
5+
accum += bytes(2 * 10485767) + bytes(2 * 10485767) # 2x the allocation sampling window
6+
7+
8+
asdf = bytes(2 * 10485767)
9+
10+
if __name__ == '__main__':
11+
main()

test/smoketest_line_invalidation.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""
2+
This is bound very closely to the current implementation of
3+
the tests in `test/line_attribution_tests.
4+
5+
The two things that matter are the number of loops, the size
6+
of the allocations, and the exact line numbers.
7+
8+
9+
"""
10+
11+
expected_md5_sums = {
12+
"test/line_attribution_tests/line_after_final_alloc.py": "2d457078fae8c2a7b498cf7dd87324bc",
13+
"test/line_attribution_tests/loop_below_threshold.py": "96f9e1c086ecdd522a6c565e0e751df6",
14+
"test/line_attribution_tests/loop_with_multiple_lines.py": "ed6d9802f31ef9d7a8636e6d0d728013",
15+
"test/line_attribution_tests/loop_with_one_alloc.py": "fe952422358a8070c8049fdab1c512af",
16+
"test/line_attribution_tests/loop_with_two_allocs.py": "9ca21ee9f9a768c4d05f3c4ba23444e9",
17+
}
18+
19+
import subprocess
20+
import tempfile
21+
import sys
22+
from pathlib import Path
23+
from hashlib import md5
24+
from scalene.scalene_json import ScaleneJSONSchema
25+
26+
N_LOOPS = 31
27+
LOOP_ALLOC_LINENO = 5 #
28+
OUT_OF_LOOP_ALLOC_LINENO = 8
29+
30+
31+
def check_for_changes():
32+
errors = []
33+
for fname, expected_sum in expected_md5_sums.items():
34+
with open( fname, "rb") as f:
35+
digest = md5(f.read()).hexdigest()
36+
if digest != expected_sum:
37+
errors.append(fname)
38+
assert len(errors) == 0, f'Detected change in file(s) {",".join(errors)}'
39+
40+
41+
def get_line(scalene_profile: ScaleneJSONSchema, lineno: int):
42+
files = list(scalene_profile.files.keys())
43+
assert len(files) == 1
44+
filename = files[0]
45+
return scalene_profile.files[filename].lines[lineno - 1]
46+
47+
48+
def get_profile(test_stem, outdir_p, test_dir):
49+
proc = subprocess.run(
50+
[
51+
sys.executable,
52+
"-m",
53+
"scalene",
54+
"--cli",
55+
"--json",
56+
"--outfile",
57+
outdir_p / f"{test_stem}.json",
58+
test_dir / f"{test_stem}.py",
59+
],
60+
# capture_output=True,
61+
check=True,
62+
)
63+
with open(outdir_p / f"{test_stem}.json", "r") as f:
64+
profile = ScaleneJSONSchema.model_validate_json(f.read())
65+
return (test_stem, profile)
66+
67+
68+
def main():
69+
check_for_changes()
70+
test_dir = Path(__file__).parent / "line_attribution_tests"
71+
with tempfile.TemporaryDirectory() as outdir:
72+
outdir_p = Path(outdir)
73+
one_alloc = get_profile("loop_with_one_alloc", outdir_p, test_dir)
74+
two_on_one_line = get_profile("loop_with_two_allocs", outdir_p, test_dir)
75+
below_threshold = get_profile("loop_below_threshold", outdir_p, test_dir)
76+
line_after_final_alloc = get_profile(
77+
"line_after_final_alloc", outdir_p, test_dir
78+
)
79+
errors = []
80+
for stem, profile in [one_alloc, two_on_one_line, line_after_final_alloc]:
81+
line = get_line(profile, LOOP_ALLOC_LINENO)
82+
if not line.n_mallocs == N_LOOPS:
83+
errors.append(
84+
f"Expected {N_LOOPS} distinct lines on {stem}, got {line.n_mallocs} on line {LOOP_ALLOC_LINENO}"
85+
)
86+
87+
bt_stem, bt_prof = below_threshold
88+
bt_line = get_line(bt_prof, LOOP_ALLOC_LINENO)
89+
if not bt_line.n_mallocs < N_LOOPS:
90+
errors.append(
91+
f"{bt_stem} makes smaller allocations than the allocation sampling window, so fewer than {N_LOOPS} allocations on {LOOP_ALLOC_LINENO} should be reported. Got {bt_line.n_mallocs} mallocs"
92+
)
93+
94+
for stem, profile in [
95+
one_alloc,
96+
two_on_one_line,
97+
below_threshold,
98+
line_after_final_alloc,
99+
]:
100+
line = get_line(profile, OUT_OF_LOOP_ALLOC_LINENO)
101+
if not line.n_mallocs == 1:
102+
errors.append(
103+
f"Line {OUT_OF_LOOP_ALLOC_LINENO} in {stem} makes a large allocation, so it should be reported."
104+
)
105+
106+
if len(errors) > 0:
107+
for error in errors:
108+
print(f"ERROR: {error}")
109+
for profile in [
110+
one_alloc,
111+
two_on_one_line,
112+
below_threshold,
113+
line_after_final_alloc,
114+
]:
115+
print("\n\n\n\n")
116+
print(profile[1].model_dump_json(indent=4))
117+
exit(1)
118+
else:
119+
print("PASS")
120+
exit(0)
121+
122+
123+
if __name__ == "__main__":
124+
main()

0 commit comments

Comments
 (0)