Skip to content

Commit 884d18b

Browse files
committed
Address review comments
1 parent ca0cd55 commit 884d18b

File tree

6 files changed

+57
-15
lines changed

lark/lark.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from typing import Literal
1717
else:
1818
from typing_extensions import Literal
19-
from .parser_frontends import ParsingFrontend
19+
from .parser_frontends import ParsingFrontend, ScanMatch
2020

2121
from .exceptions import ConfigurationError, assert_config, UnexpectedInput
2222
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
@@ -661,7 +661,7 @@ def parse(self, text: str, start: Optional[str] = None,
661661
return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos)
662662

663663
def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None,
664-
end_pos: Optional[int] = None) -> Iterator[Tuple[Tuple[int, int], 'ParseTree']]:
664+
end_pos: Optional[int] = None) -> Iterable['ScanMatch']:
665665
"""
666666
Scans the input text for non-overlapping matches of the rule specified by 'start' and
667667
yields the start and end position as well as the resulting tree.

lark/lexer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -404,17 +404,17 @@ def match(self, text, pos, *, end_pos=sys.maxsize):
404404
return m.group(0), m.lastgroup
405405

406406
def search(self, text, start_pos, end_pos):
407-
best = None, float("inf")
407+
best = None
408408
for mre in self._mres:
409409
mre: re.Pattern
410410
m = mre.search(text, start_pos, end_pos)
411411
if m:
412-
if m.start() < best[1]:
413-
best = (m.group(0), m.lastgroup), m.start()
414-
if best[0] is None:
415-
return None
416-
else:
412+
if best is None or m.start() < best.start():
413+
best = m
414+
if best is None:
417415
return best
416+
else:
417+
return (best.group(0), best.lastgroup), best.start()
418418

419419

420420
def _regexp_has_newline(r: str):

lark/parser_frontends.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING
1+
from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, NamedTuple, Iterable, Tuple
22

33
from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput
44
from .utils import get_regexp_width, Serialize
@@ -14,6 +14,12 @@
1414

1515
###{standalone
1616

17+
18+
class ScanMatch(NamedTuple):
19+
range: Tuple[int, int]
20+
tree: Tree
21+
22+
1723
def _wrap_lexer(lexer_class):
1824
future_interface = getattr(lexer_class, '__future_interface__', False)
1925
if future_interface:
@@ -128,13 +134,13 @@ def parse_interactive(self, text: Optional[str]=None, start=None,
128134

129135

130136
def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int] = None,
131-
end_pos: Optional[int] = None):
137+
end_pos: Optional[int] = None) -> Iterable[ScanMatch]:
132138
"""
133139
In contrast to the other functions here, this one actually does work. See `Lark.scan`
134140
for a description of what this function is for.
135141
"""
136142
if self.options.parser != 'lalr':
137-
raise ValueError("scan requires parser='lalr' and lexer='contextual'")
143+
raise ValueError("scan requires parser='lalr'")
138144
start_states = self.parser._parse_table.start_states
139145
chosen_start = self._verify_start(start)
140146
start_state = start_states[chosen_start]
@@ -143,8 +149,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int]
143149
if pos < 0:
144150
pos += len(text)
145151
if end_pos < 0:
146-
pos += len(text)
147-
del start_pos
152+
end_pos += len(text)
148153
while True:
149154
# Find the next candidate location
150155
found = self.lexer.search_start(text, start_state, pos, end_pos)
@@ -175,7 +180,7 @@ def scan(self, text: str, start: Optional[str]=None, *, start_pos: Optional[int]
175180
except UnexpectedInput:
176181
continue
177182
else:
178-
yield ((found.start_pos, last.end_pos), res)
183+
yield ScanMatch((found.start_pos, last.end_pos), res)
179184
pos = last.end_pos
180185
break
181186
else:

lark/tools/standalone.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from typing import (
3131
TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
3232
Union, Iterable, IO, TYPE_CHECKING, overload, Sequence,
33-
Pattern as REPattern, ClassVar, Set, Mapping
33+
Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple
3434
)
3535
###}
3636

tests/test_parser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2599,6 +2599,8 @@ def test_subset_parse(self):
25992599
parser = _Lark(grammar)
26002600
self.assertEqual(parser.parse(" abc def ", start_pos=1, end_pos=-1),
26012601
Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')]))
2602+
self.assertEqual(parser.parse(" abc def ", start_pos=1-9, end_pos=-1+9),
2603+
Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')]))
26022604
self.assertEqual(parser.parse("xabc def ", start_pos=1, end_pos=-1),
26032605
Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')]))
26042606

tests/test_scan.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,20 @@ def test_scan(self):
1818
((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])),
1919
])
2020

21+
def test_scan_basic_lexer(self):
22+
parser = Lark(r"""
23+
expr: "(" (WORD|expr)* ")"
24+
%ignore / +/
25+
WORD: /\w+/
26+
""", parser='lalr', start="expr", lexer='basic')
27+
28+
text = "|() | (a) | ((//)) | (c ((d))) |"
29+
finds = list(parser.scan(text))
30+
self.assertEqual(finds, [((1, 3), Tree('expr', [])),
31+
((6, 9), Tree('expr', ['a'])),
32+
((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])),
33+
])
34+
2135
def test_scan_meta(self):
2236
parser = Lark(r"""
2337
expr: "(" (WORD|expr)* ")"
@@ -70,3 +84,24 @@ def test_scan_backtrack(self):
7084
((15, 18), Tree('start', [Tree('expr', ['e'])])),
7185
((22, 25), Tree('start', [Tree('expr', ['f'])])),
7286
])
87+
88+
def test_scan_subset(self):
89+
parser = Lark(r"""
90+
expr: "(" (WORD|expr)* ")"
91+
%ignore /\s+/
92+
WORD: /\w+/
93+
""", parser='lalr', start="expr", propagate_positions=True)
94+
95+
text = "()\n()(a)\n(b)\n (\n) | \n(\n)"
96+
finds = list(parser.scan(text, start_pos=5, end_pos=-1))
97+
self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])),
98+
((9, 12), Tree('expr', ['b'])),
99+
((14, 17), Tree('expr', []))])
100+
self.assertEqual(2, finds[0][1].meta.line)
101+
102+
text = "()\n()(a)\n(b)\n (\n) | \n(\n)"
103+
finds = list(parser.scan(text, start_pos=5-len(text), end_pos=-1+len(text)))
104+
self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])),
105+
((9, 12), Tree('expr', ['b'])),
106+
((14, 17), Tree('expr', []))])
107+
self.assertEqual(2, finds[0][1].meta.line)

0 commit comments

Comments
 (0)