Commit e96cec6

Improve performance 6x by not tokenizing for every linting function
Previously, the linter re-tokenized the code for every single linting function, of which there are tens. After all that effort trying to benchmark Python code, it indeed turns out that the issue is never where you think it is.
Parent: 8b5ead5
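
The second sentence of the commit message is the real lesson: intuition about where a Python program spends its time is usually wrong, and a profiler that attributes cumulative time to callees is what actually finds a hotspot like this. Below is a minimal sketch using the standard library's cProfile; the `tokenize`, `linters`, and `lint` names are toy stand-ins for illustration, not lint381's real API.

    import cProfile
    import io
    import pstats

    # Toy stand-ins: tokenization dominates, the linting functions are cheap.
    def tokenize(code):
        return [code[i:i + 2] for i in range(0, len(code), 2)]

    linters = [lambda tokens: [] for _ in range(30)]   # "tens" of linting functions

    def lint(code):
        # Old behaviour: every linting function re-tokenizes the same code.
        return [err for func in linters for err in func(tokenize(code))]

    profiler = cProfile.Profile()
    profiler.enable()
    lint("int main() { return 0; }\n" * 2000)
    profiler.disable()

    out = io.StringIO()
    pstats.Stats(profiler, stream=out).sort_stats("cumulative").print_stats(5)
    print(out.getvalue())

In the cumulative column, `tokenize` shows up with one call per linting function, which is exactly the signature the change in lint381/linter.py below removes.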

4 files changed, +14 -17 lines

README.md (-4)

@@ -75,10 +75,6 @@ std::vector<int>; /* ... */ Foo_t::iterator`).
 * Use of post-increment instead of pre-increment for iterators.
 * Catching exceptions by value instead of by reference.

-# Bugs
-
-The tokenizer is *extremely* slow.
-
 # License

 `lint381` is licensed under GPLv3.

lint381/linter.py (+2, -1)

@@ -52,7 +52,8 @@ def lint(self, code):
         """
         errors = []

+        tokens = tokenize(code)
         for func in self.linters:
-            errors.extend(func(tokenize(code)))
+            errors.extend(func(tokens))

         return errors
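
The whole fix is hoisting `tokenize(code)` out of the loop, so tokenization is paid for once rather than once per linting function. A self-contained before/after timing sketch, again with toy stand-ins rather than lint381's real code (the toy ratio only reflects the stand-ins' proportions; the 6x figure above is for the real tool):

    import timeit

    def tokenize(code):
        return [code[i:i + 2] for i in range(0, len(code), 2)]

    linters = [lambda tokens: [] for _ in range(30)]
    code = "int main() { return 0; }\n" * 2000

    def lint_retokenize():
        # Old behaviour: tokenize once per linting function.
        return [err for func in linters for err in func(tokenize(code))]

    def lint_tokenize_once():
        # New behaviour: tokenize once and share the token list.
        tokens = tokenize(code)
        return [err for func in linters for err in func(tokens)]

    print(timeit.timeit(lint_retokenize, number=20))
    print(timeit.timeit(lint_tokenize_once, number=20))

Note that every linting function now receives the same list object, so the change relies on the linters treating their input as read-only.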

lint381/matcher.py (+1, -3)

@@ -68,9 +68,7 @@ def match_tokens(tokens, *, start, end=None, lookahead=0, length=None):
             yield tokens[i:j + 1]
         else:
             # Scan ahead for the matching end token.
-            for j in range(i, len(tokens)):
-                end_token = tokens[j]
-
+            for j, end_token in enumerate(tokens[i:], i):
                 # If we find a better starting point, use that instead.
                 # This minimizes the distance between the start and the end
                 # token.
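
The matcher change replaces a manual index-and-subscript loop with `enumerate` over the tail slice, started at `i`; both forms produce the same (index, token) pairs. A quick check with a made-up token list:

    tokens = ["if", "(", "x", ")", "{"]
    i = 2

    # Old pattern: generate indices and subscript the list each time.
    old = [(j, tokens[j]) for j in range(i, len(tokens))]

    # New pattern: enumerate supplies index and element, starting the count at i.
    new = [(j, tok) for j, tok in enumerate(tokens[i:], i)]

    assert old == new == [(2, "x"), (3, ")"), (4, "{")]

`tokens[i:]` does copy the tail of the list, but the loop body no longer needs the separate `end_token = tokens[j]` assignment.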

lint381/tokenizer.py (+11, -9)

@@ -223,17 +223,19 @@ def _get_next_token(self):
         if token:
             return token

-        token_values = ((group, self._match_pattern(pattern))
-                        for group, pattern in self._TOKEN_PATTERNS)
-        token_values = ((group, value)
+        token_values = [(group, self._match_pattern(pattern))
+                        for group, pattern in self._TOKEN_PATTERNS]
+        token_values = [(group, value)
                         for group, value in token_values
-                        if value)
-        try:
-            # Maximal munch -- pick the longest token.
-            group, value = max(token_values, key=lambda i: len(i[1]))
-        except ValueError as e:
+                        if value]
+        if not token_values:
             raise ValueError("Couldn't parse token at {}"
-                             .format(self._position().line_display)) from e
+                             .format(self._position().line_display))
+
+        # Maximal munch -- pick the longest token.
+        token_values.sort(key=lambda i: len(i[1]),
+                          reverse=True)
+        group, value = token_values[0]

         start_position = self._position()
         # Leave our cursor at the value at the end of the token.
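
Two things happen in the tokenizer hunk: the generator expressions become list comprehensions so the candidate matches can be checked for emptiness (a generator can't be tested without consuming it), and the try/except around `max()` becomes an explicit `if not token_values` check, since `max()` over an empty sequence is exactly where the old ValueError came from. The maximal-munch rule itself is unchanged: among all patterns that match at the current position, keep the longest. A toy illustration, with made-up group names rather than lint381's real token grammar:

    # Candidate (group, matched_text) pairs for the input ">>=".
    matches = [("greater", ">"), ("shift", ">>"), ("shift_assign", ">>="), ("plus", "")]

    # Drop patterns that produced no match at this position (the `if value` filter).
    matches = [(group, value) for group, value in matches if value]

    # Explicit emptiness check instead of catching ValueError from max().
    if not matches:
        raise ValueError("Couldn't parse token")

    # Maximal munch: keep the longest match.
    matches.sort(key=lambda item: len(item[1]), reverse=True)
    group, value = matches[0]
    print(group, value)   # shift_assign >>=

A `max()` with the same key would give the same result on a non-empty list; the commit uses an in-place sort instead, which is equally fine for a handful of candidates.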
