Commit e96cec6

Improve performance 6x by not tokenizing for every linting function
Previously, the linter re-tokenized the code for every single linting function, of which there are tens. After all that effort trying to benchmark Python code, it indeed turns out that the issue is never where you think it is.
Parent: 8b5ead5
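
The second sentence of the commit message is the real lesson: intuition about where a Python program spends its time is usually wrong, and a profiler that attributes cumulative time to callees is what actually finds a hotspot like this. Below is a minimal sketch using the standard library's cProfile; the `tokenize`, `linters`, and `lint` names are toy stand-ins for illustration, not lint381's real API.

    import cProfile
    import io
    import pstats

    # Toy stand-ins: tokenization dominates, the linting functions are cheap.
    def tokenize(code):
        return [code[i:i + 2] for i in range(0, len(code), 2)]

    linters = [lambda tokens: [] for _ in range(30)]   # "tens" of linting functions

    def lint(code):
        # Old behaviour: every linting function re-tokenizes the same code.
        return [err for func in linters for err in func(tokenize(code))]

    profiler = cProfile.Profile()
    profiler.enable()
    lint("int main() { return 0; }\n" * 2000)
    profiler.disable()

    out = io.StringIO()
    pstats.Stats(profiler, stream=out).sort_stats("cumulative").print_stats(5)
    print(out.getvalue())

In the cumulative column, `tokenize` shows up with one call per linting function, which is exactly the signature the change in lint381/linter.py below removes.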

4 files changed, +14 -17 lines

README.md (-4)

@@ -75,10 +75,6 @@ std::vector<int>; /* ... */ Foo_t::iterator`).
 * Use of post-increment instead of pre-increment for iterators.
 * Catching exceptions by value instead of by reference.

-# Bugs
-
-The tokenizer is *extremely* slow.
-
 # License

 `lint381` is licensed under GPLv3.

lint381/linter.py (+2, -1)

@@ -52,7 +52,8 @@ def lint(self, code):
         """
         errors = []

+        tokens = tokenize(code)
         for func in self.linters:
-            errors.extend(func(tokenize(code)))
+            errors.extend(func(tokens))

         return errors
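
The whole fix is hoisting `tokenize(code)` out of the loop, so tokenization is paid for once rather than once per linting function. A self-contained before/after timing sketch, again with toy stand-ins rather than lint381's real code (the toy ratio only reflects the stand-ins' proportions; the 6x figure above is for the real tool):

    import timeit

    def tokenize(code):
        return [code[i:i + 2] for i in range(0, len(code), 2)]

    linters = [lambda tokens: [] for _ in range(30)]
    code = "int main() { return 0; }\n" * 2000

    def lint_retokenize():
        # Old behaviour: tokenize once per linting function.
        return [err for func in linters for err in func(tokenize(code))]

    def lint_tokenize_once():
        # New behaviour: tokenize once and share the token list.
        tokens = tokenize(code)
        return [err for func in linters for err in func(tokens)]

    print(timeit.timeit(lint_retokenize, number=20))
    print(timeit.timeit(lint_tokenize_once, number=20))

Note that every linting function now receives the same list object, so the change relies on the linters treating their input as read-only.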

lint381/matcher.py (+1, -3)

@@ -68,9 +68,7 @@ def match_tokens(tokens, *, start, end=None, lookahead=0, length=None):
             yield tokens[i:j + 1]
         else:
             # Scan ahead for the matching end token.
-            for j in range(i, len(tokens)):
-                end_token = tokens[j]
-
+            for j, end_token in enumerate(tokens[i:], i):
                 # If we find a better starting point, use that instead.
                 # This minimizes the distance between the start and the end
                 # token.
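
The matcher change replaces a manual index-and-subscript loop with `enumerate` over the tail slice, started at `i`; both forms produce the same (index, token) pairs. A quick check with a made-up token list:

    tokens = ["if", "(", "x", ")", "{"]
    i = 2

    # Old pattern: generate indices and subscript the list each time.
    old = [(j, tokens[j]) for j in range(i, len(tokens))]

    # New pattern: enumerate supplies index and element, starting the count at i.
    new = [(j, tok) for j, tok in enumerate(tokens[i:], i)]

    assert old == new == [(2, "x"), (3, ")"), (4, "{")]

`tokens[i:]` does copy the tail of the list, but the loop body no longer needs the separate `end_token = tokens[j]` assignment.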

lint381/tokenizer.py (+11, -9)

@@ -223,17 +223,19 @@ def _get_next_token(self):
         if token:
             return token

-        token_values = ((group, self._match_pattern(pattern))
-                        for group, pattern in self._TOKEN_PATTERNS)
-        token_values = ((group, value)
+        token_values = [(group, self._match_pattern(pattern))
+                        for group, pattern in self._TOKEN_PATTERNS]
+        token_values = [(group, value)
                         for group, value in token_values
-                        if value)
-        try:
-            # Maximal munch -- pick the longest token.
-            group, value = max(token_values, key=lambda i: len(i[1]))
-        except ValueError as e:
+                        if value]
+        if not token_values:
             raise ValueError("Couldn't parse token at {}"
-                             .format(self._position().line_display)) from e
+                             .format(self._position().line_display))
+
+        # Maximal munch -- pick the longest token.
+        token_values.sort(key=lambda i: len(i[1]),
+                          reverse=True)
+        group, value = token_values[0]

         start_position = self._position()
         # Leave our cursor at the value at the end of the token.
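
Two things happen in the tokenizer hunk: the generator expressions become list comprehensions so the candidate matches can be checked for emptiness (a generator can't be tested without consuming it), and the try/except around `max()` becomes an explicit `if not token_values` check, since `max()` over an empty sequence is exactly where the old ValueError came from. The maximal-munch rule itself is unchanged: among all patterns that match at the current position, keep the longest. A toy illustration, with made-up group names rather than lint381's real token grammar:

    # Candidate (group, matched_text) pairs for the input ">>=".
    matches = [("greater", ">"), ("shift", ">>"), ("shift_assign", ">>="), ("plus", "")]

    # Drop patterns that produced no match at this position (the `if value` filter).
    matches = [(group, value) for group, value in matches if value]

    # Explicit emptiness check instead of catching ValueError from max().
    if not matches:
        raise ValueError("Couldn't parse token")

    # Maximal munch: keep the longest match.
    matches.sort(key=lambda item: len(item[1]), reverse=True)
    group, value = matches[0]
    print(group, value)   # shift_assign >>=

A `max()` with the same key would give the same result on a non-empty list; the commit uses an in-place sort instead, which is equally fine for a handful of candidates.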
