Commit 524ceaf

added buffered reading to tokenizer

1 parent: 0135559

File tree

4 files changed: 21 additions & 14 deletions

src/json_stream/loader.py
src/json_stream/tests/test_buffering.py
src/json_stream/tokenizer.py
src/json_stream/visitor.py


src/json_stream/loader.py

Lines changed: 2 additions & 2 deletions
@@ -3,9 +3,9 @@
 from json_stream.select_tokenizer import default_tokenizer
 
 
-def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer):
+def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, buffering=-1):
     fp = ensure_file(fp_or_iterable)
-    token_stream = tokenizer(fp)
+    token_stream = tokenizer(fp, buffering=buffering)
     token_type, token = next(token_stream)
     if token_type == TokenType.OPERATOR:
         return StreamingJSONBase.factory(token, token_stream, persistent)
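
A quick usage sketch of the new parameter (the file name here is hypothetical, used only for illustration; the keyword wiring follows the load() signature in the hunk above): buffering=-1, the default, lets the tokenizer pick io.DEFAULT_BUFFER_SIZE, buffering=0 keeps the previous unbuffered single-character reads, and any positive value is used as the read size.

    import io
    import json_stream

    # "tasks.json" is a made-up input file for this sketch.
    with open("tasks.json", "r") as f:
        # -1 (default): read io.DEFAULT_BUFFER_SIZE characters per call,
        #  0: unbuffered, single-character reads (previous behaviour),
        # >0: explicit read size.
        data = json_stream.load(f, buffering=io.DEFAULT_BUFFER_SIZE)
        for task in data["tasks"]:
            print(task["title"])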

src/json_stream/tests/test_buffering.py

Lines changed: 3 additions & 3 deletions
@@ -12,9 +12,9 @@ def test_buffering(self):
         self._test_buffering(tokenizer=rust_tokenizer_or_raise())
 
     def test_buffering_python_tokenizer(self):
-        self._test_buffering(tokenizer=tokenize)
+        self._test_buffering(tokenizer=tokenize, buffering=0)
 
-    def _test_buffering(self, tokenizer):
+    def _test_buffering(self, tokenizer, **load_args):
         happenings = []
 
         def data_in_chunks(data, chunk_size=15):
@@ -24,7 +24,7 @@ def data_in_chunks(data, chunk_size=15):
                 yield part
 
         json_string = b'{"tasks":[{"id":1,"title":"task1"},{"id":2,"title":"task2"},{"id":3,"title":"task3"}]}'
-        stream = json_stream.load(data_in_chunks(json_string), tokenizer=tokenizer)
+        stream = json_stream.load(data_in_chunks(json_string), tokenizer=tokenizer, **load_args)
 
         for task in stream["tasks"]:
             happenings.append(('item', to_standard_types(task)))
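
For context, the data_in_chunks helper is only partially visible in these hunks. A plausible reconstruction, assuming it simply slices the byte string into fixed-size pieces (the real helper may also record each chunk in happenings, which is omitted here):

    def data_in_chunks(data, chunk_size=15):
        # Yield the payload in fixed-size pieces, simulating data arriving in chunks.
        for i in range(0, len(data), chunk_size):
            yield data[i:i + chunk_size]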

src/json_stream/tokenizer.py

Lines changed: 14 additions & 7 deletions
@@ -78,7 +78,7 @@ def _ensure_text(stream):
     return stream
 
 
-def tokenize(stream):
+def tokenize(stream, *, buffering=-1, **_):
     stream = _ensure_text(stream)
 
     def is_delimiter(char):
@@ -365,9 +365,19 @@ def process_char(char):
 
         return advance, next_state
     state = State.WHITESPACE
-    c = stream.read(1)
-    index = 0
-    while c:
+    if not buffering:
+        buffering = 1
+    elif buffering <= 0:
+        buffering = io.DEFAULT_BUFFER_SIZE
+    buffering = buffering.__index__()
+    buffer = stream.read(buffering)
+    c = None
+    index = -1
+    advance = True
+    while buffer:
+        if advance:
+            c, buffer = buffer[0], buffer[1:] or stream.read(buffering)
+            index += 1
         try:
             advance, state = process_char(c)
         except ValueError as e:
@@ -376,9 +386,6 @@ def process_char(char):
             completed = False
             token = []
             yield now_token
-        if advance:
-            c = stream.read(1)
-            index += 1
     process_char(SpecialChar.EOF)
     if completed:
         yield now_token
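
The first few added lines normalize the buffering argument in the spirit of the io module: 0 selects unbuffered operation (the tokenizer falls back to single-character reads), any other non-positive value selects io.DEFAULT_BUFFER_SIZE, and __index__() rejects non-integer sizes. The rest of the change then pulls characters from an in-memory buffer and only calls stream.read() again once that buffer runs dry. A standalone sketch of the normalization rule (the helper name is illustrative, not part of the library):

    import io

    def normalize_buffering(buffering=-1):
        # 0 means unbuffered: read one character at a time.
        if not buffering:
            buffering = 1
        # Any other non-positive value means "use the io module's default size".
        elif buffering <= 0:
            buffering = io.DEFAULT_BUFFER_SIZE
        # __index__() raises for floats and other non-integral sizes.
        return buffering.__index__()

    assert normalize_buffering(0) == 1
    assert normalize_buffering(-1) == io.DEFAULT_BUFFER_SIZE
    assert normalize_buffering(4096) == 4096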

src/json_stream/visitor.py

Lines changed: 2 additions & 2 deletions
@@ -19,9 +19,9 @@ def _visit(obj, visitor, path):
         visitor(obj, path)
 
 
-def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer):
+def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer, buffering=-1):
     fp = ensure_file(fp_or_iterator)
-    token_stream = tokenizer(fp)
+    token_stream = tokenizer(fp, buffering=buffering)
     _, token = next(token_stream)
     obj = StreamingJSONBase.factory(token, token_stream, persistent=False)
     _visit(obj, visitor, ())
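
A matching sketch for the visitor entry point, again with a hypothetical input file; buffering is forwarded to the tokenizer exactly as in load():

    import json_stream

    def print_leaf(value, path):
        # visit() calls the visitor with each leaf value and the tuple path leading to it.
        print(path, value)

    # "tasks.json" is a made-up file name; buffering=0 keeps the old single-character reads.
    with open("tasks.json", "r") as f:
        json_stream.visit(f, print_leaf, buffering=0)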
