diff --git a/src/json_stream/loader.py b/src/json_stream/loader.py index 680e801..9ee2b06 100644 --- a/src/json_stream/loader.py +++ b/src/json_stream/loader.py @@ -3,9 +3,9 @@ from json_stream.select_tokenizer import default_tokenizer -def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer): +def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, buffering=-1, strings_as_files=False): fp = ensure_file(fp_or_iterable) - token_stream = tokenizer(fp) + token_stream = tokenizer(fp, buffering=buffering, strings_as_files=strings_as_files) token_type, token = next(token_stream) if token_type == TokenType.OPERATOR: return StreamingJSONBase.factory(token, token_stream, persistent) diff --git a/src/json_stream/tests/test_buffering.py b/src/json_stream/tests/test_buffering.py index f60033a..0e969e9 100644 --- a/src/json_stream/tests/test_buffering.py +++ b/src/json_stream/tests/test_buffering.py @@ -12,9 +12,9 @@ def test_buffering(self): self._test_buffering(tokenizer=rust_tokenizer_or_raise()) def test_buffering_python_tokenizer(self): - self._test_buffering(tokenizer=tokenize) + self._test_buffering(tokenizer=tokenize, buffering=0) - def _test_buffering(self, tokenizer): + def _test_buffering(self, tokenizer, **load_args): happenings = [] def data_in_chunks(data, chunk_size=15): @@ -24,7 +24,7 @@ def data_in_chunks(data, chunk_size=15): yield part json_string = b'{"tasks":[{"id":1,"title":"task1"},{"id":2,"title":"task2"},{"id":3,"title":"task3"}]}' - stream = json_stream.load(data_in_chunks(json_string), tokenizer=tokenizer) + stream = json_stream.load(data_in_chunks(json_string), tokenizer=tokenizer, **load_args) for task in stream["tasks"]: happenings.append(('item', to_standard_types(task))) diff --git a/src/json_stream/tokenizer.py b/src/json_stream/tokenizer/__init__.py similarity index 59% rename from src/json_stream/tokenizer.py rename to src/json_stream/tokenizer/__init__.py index 21b0bb4..f1c1dc1 100644 --- a/src/json_stream/tokenizer.py +++ b/src/json_stream/tokenizer/__init__.py @@ -6,9 +6,9 @@ Copyright (c) 2019 Daniel Yule """ import io -import unicodedata +from typing import Optional, Tuple -SURROGATE = 'Cs' +from json_stream.tokenizer.strings import JsonStringReader class TokenType: @@ -29,7 +29,6 @@ class State: FLOATING_POINT_0 = 6 FLOATING_POINT = 8 STRING = 9 - STRING_ESCAPE = 10 STRING_END = 11 TRUE_1 = 12 TRUE_2 = 13 @@ -41,10 +40,6 @@ class State: NULL_1 = 19 NULL_2 = 20 NULL_3 = 21 - UNICODE = 22 - UNICODE_SURROGATE_START = 23 - UNICODE_SURROGATE_STRING_ESCAPE = 24 - UNICODE_SURROGATE = 25 class SpecialChar: @@ -78,22 +73,20 @@ def _ensure_text(stream): return stream -def tokenize(stream): +def tokenize(stream, *, buffering=-1, strings_as_files=False, **_): stream = _ensure_text(stream) def is_delimiter(char): return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF token = [] - unicode_buffer = "" completed = False - now_token = "" + now_token: Optional[Tuple] = None def process_char(char): - nonlocal token, completed, now_token, unicode_buffer + nonlocal completed, now_token, state, buffer, index advance = True add_char = False - next_state = state if state == State.WHITESPACE: if char == "{": completed = True @@ -113,36 +106,40 @@ def process_char(char): elif char == ":": completed = True now_token = (TokenType.OPERATOR, ":") - elif char == "\"": - next_state = State.STRING + elif char == '"': + state = State.STRING + now_token = (TokenType.STRING, JsonStringReader(stream, buffer)) + if strings_as_files: + completed = True + advance = False elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True elif char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char == "-": - next_state = State.INTEGER_SIGN + state = State.INTEGER_SIGN add_char = True elif char == "f": - next_state = State.FALSE_1 + state = State.FALSE_1 elif char == "t": - next_state = State.TRUE_1 + state = State.TRUE_1 elif char == "n": - next_state = State.NULL_1 + state = State.NULL_1 elif not char.isspace() and not char == SpecialChar.EOF: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.INTEGER: if char in "0123456789": add_char = True elif char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, int("".join(token))) advance = False @@ -150,13 +147,13 @@ def process_char(char): raise ValueError("A number must contain only digits. Got '{}'".format(char)) elif state == State.INTEGER_0: if char == ".": - next_state = State.FLOATING_POINT_0 + state = State.FLOATING_POINT_0 add_char = True elif char == "e" or char == 'E': - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NUMBER, 0) advance = False @@ -164,16 +161,16 @@ def process_char(char): raise ValueError("A 0 must be followed by a '.' or a 'e'. Got '{0}'".format(char)) elif state == State.INTEGER_SIGN: if char == "0": - next_state = State.INTEGER_0 + state = State.INTEGER_0 add_char = True elif char in "123456789": - next_state = State.INTEGER + state = State.INTEGER add_char = True else: raise ValueError("A - must be followed by a digit. Got '{0}'".format(char)) elif state == State.INTEGER_EXP_0: if char == "+" or char == "-" or char in "0123456789": - next_state = State.INTEGER_EXP + state = State.INTEGER_EXP add_char = True else: raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char)) @@ -183,7 +180,7 @@ def process_char(char): elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char)) @@ -191,194 +188,123 @@ def process_char(char): if char in "0123456789": add_char = True elif char == "e" or char == "E": - next_state = State.INTEGER_EXP_0 + state = State.INTEGER_EXP_0 add_char = True elif is_delimiter(char): completed = True now_token = (TokenType.NUMBER, float("".join(token))) - next_state = State.WHITESPACE + state = State.WHITESPACE advance = False else: raise ValueError("A number must include only digits") elif state == State.FLOATING_POINT_0: if char in "0123456789": - next_state = State.FLOATING_POINT + state = State.FLOATING_POINT add_char = True else: raise ValueError("A number with a decimal point must be followed by a fractional part") elif state == State.FALSE_1: if char == "a": - next_state = State.FALSE_2 + state = State.FALSE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_2: if char == "l": - next_state = State.FALSE_3 + state = State.FALSE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_3: if char == "s": - next_state = State.FALSE_4 + state = State.FALSE_4 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.FALSE_4: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, False) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_1: if char == "r": - next_state = State.TRUE_2 + state = State.TRUE_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_2: if char == "u": - next_state = State.TRUE_3 + state = State.TRUE_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.TRUE_3: if char == "e": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.BOOLEAN, True) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_1: if char == "u": - next_state = State.NULL_2 + state = State.NULL_2 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_2: if char == "l": - next_state = State.NULL_3 + state = State.NULL_3 else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.NULL_3: if char == "l": - next_state = State.WHITESPACE + state = State.WHITESPACE completed = True now_token = (TokenType.NULL, None) else: raise ValueError("Invalid JSON character: '{0}'".format(char)) elif state == State.STRING: - if char == "\"": + reader: JsonStringReader = now_token[1] + try: + s = reader.read() + finally: + index += reader.index + if not strings_as_files: + now_token = (TokenType.STRING, s) completed = True - now_token = (TokenType.STRING, "".join(token)) - next_state = State.STRING_END - elif char == "\\": - next_state = State.STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unterminated string at end of file") - else: - add_char = True + buffer = reader.buffer + state = State.STRING_END elif state == State.STRING_END: if is_delimiter(char): advance = False - next_state = State.WHITESPACE + state = State.WHITESPACE else: raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char)) - elif state == State.STRING_ESCAPE: - next_state = State.STRING - if char == "\\" or char == "\"": - add_char = True - elif char == "b": - char = "\b" - add_char = True - elif char == "f": - char = "\f" - add_char = True - elif char == "n": - char = "\n" - add_char = True - elif char == "t": - char = "\t" - add_char = True - elif char == "r": - char = "\r" - add_char = True - elif char == "/": - char = "/" - add_char = True - elif char == "u": - next_state = State.UNICODE - unicode_buffer = "" - else: - raise ValueError("Invalid string escape: {}".format(char)) - elif state == State.UNICODE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 4: - try: - code_point = int(unicode_buffer, 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") - char = chr(code_point) - if unicodedata.category(char) == SURROGATE: - next_state = State.UNICODE_SURROGATE_START - else: - next_state = State.STRING - add_char = True - elif state == State.UNICODE_SURROGATE_START: - if char == "\\": - next_state = State.UNICODE_SURROGATE_STRING_ESCAPE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE_STRING_ESCAPE: - if char == "u": - next_state = State.UNICODE_SURROGATE - elif char == SpecialChar.EOF: - raise ValueError("Unpaired UTF-16 surrogate at end of file") - else: - raise ValueError(f"Unpaired UTF-16 surrogate") - - elif state == State.UNICODE_SURROGATE: - if char == SpecialChar.EOF: - raise ValueError('Unterminated unicode literal at end of file') - unicode_buffer += char - if len(unicode_buffer) == 8: - code_point_1 = int(unicode_buffer[:4], 16) - try: - code_point_2 = int(unicode_buffer[4:], 16) - except ValueError: - raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") - char = chr(code_point_2) - if unicodedata.category(char) != SURROGATE: - raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") - try: - pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') - char = pair.decode('utf-16-le') - except ValueError: - raise ValueError( - f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" - ) - next_state = State.STRING - add_char = True if add_char: token.append(char) - return advance, next_state + return advance + state = State.WHITESPACE - c = stream.read(1) - index = 0 - while c: + if not buffering: + buffering = 1 + elif buffering <= 0: + buffering = io.DEFAULT_BUFFER_SIZE + buffering = buffering.__index__() + buffer = stream.read(buffering) + c = None + index = -1 + advance = True + while buffer: + if advance: + c, buffer = buffer[0], buffer[1:] or stream.read(buffering) + index += 1 try: - advance, state = process_char(c) + advance = process_char(c) except ValueError as e: raise ValueError("".join([e.args[0], " at index {}".format(index)])) if completed: completed = False token = [] yield now_token - if advance: - c = stream.read(1) - index += 1 + process_char(SpecialChar.EOF) if completed: yield now_token diff --git a/src/json_stream/tokenizer/strings.py b/src/json_stream/tokenizer/strings.py new file mode 100644 index 0000000..cfe38ed --- /dev/null +++ b/src/json_stream/tokenizer/strings.py @@ -0,0 +1,169 @@ +import io +import unicodedata +from typing import Union +from io import DEFAULT_BUFFER_SIZE + +STRING_ESCAPE_CODES = { + '\\': '\\', + '/': '/', + '"': '"', + 'b': '\b', + 'f': '\f', + 'n': '\n', + 't': '\t', + 'r': '\r' +} + +SURROGATE = 'Cs' + +CHAR = 1 +STRING_ESCAPE = 2 +UNICODE = 4 +UNICODE_SURROGATE_START = 5 +UNICODE_SURROGATE_STRING_ESCAPE = 6 +UNICODE_SURROGATE = 7 + + +class JsonStringReader(io.TextIOBase): + def __init__(self, stream: io.TextIOBase, initial_buffer=''): + self.stream = stream + self.buffer = initial_buffer + self.readline_buffer = '' + self.unicode_buffer = '' + self.state = CHAR + self.end_of_string = False + self.index = 0 + + @property + def complete(self): + return self.end_of_string and not self.readline_buffer + + def readable(self) -> bool: + return True + + def read(self, size: Union[int, None] = None) -> str: + result = '' + length = DEFAULT_BUFFER_SIZE + while not self.complete and (size is None or not result): + if size: + length = size - len(result) + result += self._read_chunk(length) + return result + + def _read_chunk(self, size: int) -> str: + if self.readline_buffer: + result, self.readline_buffer = self.readline_buffer[:size], self.readline_buffer[size:] + return result + chunk = self.buffer or self.stream.read(size) + if not chunk: + raise ValueError("Unterminated string at end of file") + state = self.state + unicode_buffer = self.unicode_buffer + result = "" + start = 0 + for i, c in enumerate(chunk): + self.index += 1 + if i == size: + if state == CHAR: + result += chunk[start:i] + self.buffer = chunk[i:] + break + if state == CHAR: + if c == '"': + result += chunk[start:i] + self.end_of_string = True + self.buffer = chunk[i + 1:] + break + elif c == "\\": + state = STRING_ESCAPE + result += chunk[start:i] + start = i + 1 + + elif state == STRING_ESCAPE: + char = STRING_ESCAPE_CODES.get(c) + start = i + 1 + if char: + result += char + state = CHAR + elif c == 'u': + state = UNICODE + else: + raise ValueError("Invalid string escape: {}".format(c)) + + elif state == UNICODE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 4: + try: + code_point = int(unicode_buffer, 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}") + char = chr(code_point) + if unicodedata.category(char) == SURROGATE: + state = UNICODE_SURROGATE_START + else: + result += char + unicode_buffer = '' + state = CHAR + + elif state == UNICODE_SURROGATE_START: + if c == "\\": + state = UNICODE_SURROGATE_STRING_ESCAPE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == UNICODE_SURROGATE_STRING_ESCAPE: + if c == "u": + state = UNICODE_SURROGATE + start = i + 1 + else: + raise ValueError(f"Unpaired UTF-16 surrogate") + + elif state == UNICODE_SURROGATE: + unicode_buffer += c + start = i + 1 + if len(unicode_buffer) == 8: + code_point_1 = int(unicode_buffer[:4], 16) + try: + code_point_2 = int(unicode_buffer[4:], 16) + except ValueError: + raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}") + if unicodedata.category(chr(code_point_2)) != SURROGATE: + raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!") + try: + pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little') + result += pair.decode('utf-16-le') + except ValueError: + raise ValueError( + f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}" + ) + unicode_buffer = '' + state = CHAR + else: + result += chunk[start:] + self.buffer = '' + + self.state = state + self.unicode_buffer = unicode_buffer + return result + + def readline(self, size: int = None) -> str: + result = '' + read_size = DEFAULT_BUFFER_SIZE + while not self.complete: + if size: + result_length = len(result) + if result_length >= size: + result, self.readline_buffer = result[:size], result[size:] + self.readline_buffer + break + read_size = size - result_length + chunk = self._read_chunk(read_size) + i = chunk.find('\n') + if i < 0: + result += chunk + else: + chunk, self.readline_buffer = chunk[:i+1], chunk[i+1:] + result += chunk + break + return result diff --git a/src/json_stream/tokenizer/tests/__init__.py b/src/json_stream/tokenizer/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/json_stream/tokenizer/tests/test_strings.py b/src/json_stream/tokenizer/tests/test_strings.py new file mode 100644 index 0000000..4244b14 --- /dev/null +++ b/src/json_stream/tokenizer/tests/test_strings.py @@ -0,0 +1,330 @@ +import re +from io import StringIO +from unittest import TestCase +from unittest.mock import patch + +from json_stream.tokenizer.strings import JsonStringReader + + +class TestJsonStringReader(TestCase): + def test_string_parsing(self): + self.assertStringEquals("word", r'"word"') + self.assertStringEquals("this char at end: Ȃ", r'"this char at end: \u0202"') + self.assertStringEquals("this char in middle: Ȃ.", r'"this char in middle: \u0202."') + + def test_empty_string(self): + self.assertStringEquals("", r'""') + + def test_escaping(self): + self.assertStringEquals("with\tescape", r'"with\tescape"') + self.assertStringEquals("with\n a different escape", r'"with\n a different escape"') + self.assertStringEquals("using a \bbackspace", r'"using a \bbackspace"') + self.assertStringEquals("now we have \f a formfeed", r'"now we have \f a formfeed"') + self.assertStringEquals('"a quote"', r'"\"a quote\""') + self.assertStringEquals("/", r'"\/"') + + def test_unicode_literal(self): + self.assertStringEquals('Ä', r'"\u00c4"') + self.assertStringEquals("꽸", r'"\uaf78"') + self.assertStringEquals("訋", r'"\u8A0b"') + self.assertStringEquals("돧", r'"\uB3e7"') + self.assertStringEquals("ዯ", r'"\u12eF"') + + def test_invalid_string_escape(self): + self.assertStringRaises(r'"\h"', "Invalid string escape: h") + self.assertStringRaises(r'"\2"', "Invalid string escape: 2") + self.assertStringRaises(r'"\!"', "Invalid string escape: !") + + def test_unicode_literal_truncated(self): + self.assertStringRaises(r'"\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_literal_bad_hex(self): + self.assertStringRaises(r'"\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal(self): + self.assertStringEquals('𝄞', r'"\ud834\udd1e"') + + def test_unicode_surrogate_pair_unpaired(self): + self.assertStringRaises(r'"\ud834"', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834', "Unterminated string at end of file") + self.assertStringRaises(r'"\ud834\x', "Unpaired UTF-16 surrogate") + self.assertStringRaises(r'"\ud834' + '\\', "Unterminated string at end of file") + + def test_unicode_surrogate_pair_non_surrogate(self): + self.assertStringRaises(r'"\ud834\u00c4"', "Second half of UTF-16 surrogate pair is not a surrogate!") + + def test_unicode_surrogate_pair_literal_truncated(self): + self.assertStringRaises(r'"\ud834\u00c"', re.escape(r'Invalid unicode literal: \u00c"')) + + def test_unicode_surrogate_pair_literal_bad_hex(self): + self.assertStringRaises(r'"\ud834\u00x4"', re.escape(r"Invalid unicode literal: \u00x4")) + + def test_unicode_surrogate_pair_literal_invalid(self): + message = re.escape(r"Error decoding UTF-16 surrogate pair \ud834\ud834") + self.assertStringRaises(r'"\ud834\ud834"', message) + + def test_unicode_surrogate_pair_literal_unterminated(self): + self.assertStringRaises(r'"\ud834\ud83', r"Unterminated string at end of file") + + def test_unterminated_strings(self): + self.assertStringRaises('"unterminated', "Unterminated string at end of file") + + def test_unterminated_strings_while_in_escape(self): + self.assertStringRaises(r'"\"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!"', "Unterminated string at end of file") + self.assertStringRaises(r'"\u!!!', "Unterminated string at end of file") + + def test_with_initial_buffer(self): + self.assertStringEquals("there will be more string", buffer='"there will be ', stream='more string"') # x x x + + def test_remainder(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_read_past_end_of_string(self): + reader, f = self.assertStringEquals( + "after the string", + stream='"after the string"there is more stuff', + remaining_buffer='the', remaining_stream='re is more stuff', amount=20 + ) + self.assertRead(reader, f, '', remaining_buffer='the', remaining_stream='re is more stuff', amount=20) + + def test_remainder_when_string_ends_after_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the', stream=' string"there is more stuff', + remaining_buffer='there is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there is more stuff') + + def test_remainder_when_string_ends_within_initial_buffer(self): + reader, f = self.assertStringEquals( + "after the string", + buffer='"after the string"there', stream=' is more stuff', + remaining_buffer='there', remaining_stream=' is more stuff', + ) + self.assertRead(reader, f, '', remaining_buffer='there', remaining_stream=' is more stuff') + + def test_read_part_shorter_initial_buffer(self): + reader, f = self.assertStringEquals( + "there", + buffer='"there will be ', stream='more string"', + remaining_buffer=' will be ', remaining_stream='more string"', amount=5, complete=False, + ) + self.assertRead(reader, f, ' will be more string') + + def test_read_part_longer_than_initial_buffer(self): + reader, f = self.assertStringEquals( + "there will be ", + buffer='"there will be ', stream='more string"', + remaining_buffer='', remaining_stream='more string"', amount=20, complete=False, + ) + self.assertRead(reader, f, 'more string') + + def test_read_over_split_escape(self): + json = r'"abcde\u00c4edcba"' + for i in range(len(json)): + buffer, stream = json[:i], json[i:] + self.assertStringEquals("abcdeÄedcba", buffer=buffer, stream=stream) + + def test_readable(self): + reader = JsonStringReader(StringIO()) + self.assertTrue(reader.readable()) + + def test_readline(self): + stream = StringIO(r'some\nlines\nof\ntext"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='some\n', + remaining_readline_buffer='lines\nof\ntext', + complete=False, + ) + self.assertReadline( + reader, stream, + result='lines\n', + remaining_readline_buffer='of\ntext', + complete=False, + ) + self.assertReadline( + reader, stream, + result='of\n', + remaining_readline_buffer='text', + complete=False, + ) + self.assertReadline( + reader, stream, + result='text', + ) + + @patch('json_stream.tokenizer.strings.DEFAULT_BUFFER_SIZE', 10) + def test_readline_needs_multiple_reads(self): + stream = StringIO(r'aaaaaaaaaabbbbb\ncccdddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaabbbbb\n', + remaining_readline_buffer='ccc', + remaining_stream='dddddddd"', + complete=False, + ) + self.assertReadline(reader, stream, 'cccdddddddd') + + def test_readline_eof_without_newline(self): + stream = StringIO(r'aaaaaaaaaabbbbbcccdddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaabbbbbcccdddddddd', + ) + self.assertReadline(reader, stream, '') + + @patch('json_stream.tokenizer.strings.DEFAULT_BUFFER_SIZE', 10) + def test_readline_then_read(self): + stream = StringIO(r'aaaaaaaaaabbbbbbbb\ndddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaabbbbbbbb\n', + remaining_stream='dddddddd"', + complete=False, + ) + self.assertRead(reader, stream, result='dddddddd') + + @patch('json_stream.tokenizer.strings.DEFAULT_BUFFER_SIZE', 10) + def test_readline_then_read_with_data_in_buffer(self): + stream = StringIO(r'aaaaaaaaaabbbbb\ncccdddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaabbbbb\n', + remaining_readline_buffer='ccc', + remaining_stream='dddddddd"', + complete=False, + ) + self.assertRead(reader, stream, result='cccdddddddd') + + def test_read_then_readline(self): + stream = StringIO(r'aaaaaaaaaabbbbb\ncccdddddddd"') + reader = JsonStringReader(stream) + self.assertRead( + reader, stream, + result='aaaaaaaaaa', + remaining_stream=r'bbbbb\ncccdddddddd"', + amount=10, + complete=False, + ) + self.assertReadline( + reader, stream, + result='bbbbb\n', + remaining_readline_buffer='cccdddddddd', + complete=False, + ) + self.assertReadline( + reader, stream, + result='cccdddddddd', + ) + + def test_readline_with_size_shorter_than_line(self): + stream = StringIO(r'aaaaaaaaaabbbbb\ncccdddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaa', + remaining_stream=r'bbbbb\ncccdddddddd"', + amount=10, + complete=False, + ) + self.assertReadline( + reader, stream, + result='bbbbb\n', + remaining_readline_buffer='cccdddddddd', + complete=False, + ) + self.assertReadline( + reader, stream, + result='cccdddddddd', + ) + + def test_readline_with_size_longer_than_line(self): + stream = StringIO(r'aaaaaaaaaabbbbb\ncccdddddddd"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='aaaaaaaaaabbbbb\n', + remaining_readline_buffer='ccc', + remaining_stream='dddddddd"', + amount=20, + complete=False, + ) + self.assertReadline(reader, stream, 'cccdddddddd') + + def test_readline_trailing_newline(self): + stream = StringIO(r'a\n"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='a\n', + ) + + def test_readline_no_trailing_newline(self): + stream = StringIO(r'a\nb"') + reader = JsonStringReader(stream) + self.assertReadline( + reader, stream, + result='a\n', + remaining_readline_buffer='b', + complete=False + ) + self.assertReadline(reader, stream, 'b') + + def test_readlines(self): + stream = StringIO(r'some\nlines\nof\ntext"') + reader = JsonStringReader(stream) + self.assertListEqual(["some\n", "lines\n", "of\n", "text"], reader.readlines()) + self.assertEqual('', reader.readline_buffer) + self.assertEqual('', reader.buffer) + self.assertEqual('', stream.read()) + self.assertTrue(reader.complete) + + def assertStringEquals(self, result, stream, buffer='', remaining_buffer='', remaining_stream='', amount=None, + complete=True): + if buffer: + buffer = buffer[1:] + else: + stream = stream[1:] + f = StringIO(stream) + reader = JsonStringReader(f, buffer) + self.assertRead(reader, f, result, remaining_buffer, remaining_stream, amount, complete) + return reader, f + + def assertRead(self, reader, stream, result, remaining_buffer='', remaining_stream='', amount=None, complete=True): + self.assertEqual(result, reader.read(amount)) + self.assertEqual(reader.readline_buffer, '') + self.assertEqual(remaining_buffer, reader.buffer) + pos = stream.tell() + self.assertEqual(remaining_stream, stream.read()) + stream.seek(pos) + self.assertEqual(complete, reader.complete) + + def assertReadline(self, reader, stream, result, remaining_readline_buffer='', remaining_buffer='', + remaining_stream='', amount=None, complete=True): + self.assertEqual(result, reader.readline(amount)) + self.assertEqual(remaining_readline_buffer, reader.readline_buffer) + self.assertEqual(remaining_buffer, reader.buffer) + pos = stream.tell() + self.assertEqual(remaining_stream, stream.read()) + stream.seek(pos) + self.assertEqual(complete, reader.complete) + + def assertStringRaises(self, s, error): + stream = StringIO(s[1:]) + f = JsonStringReader(stream) + with self.assertRaisesRegex(ValueError, error): + f.read() diff --git a/src/json_stream/tests/test_tokenizer.py b/src/json_stream/tokenizer/tests/test_tokenizer.py similarity index 94% rename from src/json_stream/tests/test_tokenizer.py rename to src/json_stream/tokenizer/tests/test_tokenizer.py index e27b4a7..141755f 100644 --- a/src/json_stream/tests/test_tokenizer.py +++ b/src/json_stream/tokenizer/tests/test_tokenizer.py @@ -90,7 +90,7 @@ def test_string_parsing(self): self.tokenize_sequence(r'"\2"') with self.assertRaisesRegex(ValueError, "Invalid string escape: ! at index 2"): self.tokenize_sequence(r'"\!"') - with self.assertRaisesRegex(ValueError, "Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file at index 4"): self.tokenize_sequence(r'"\u!"') def test_unterminated_strings(self): @@ -172,11 +172,11 @@ def test_unicode_surrogate_pair_literal(self): def test_unicode_surrogate_pair_unpaired(self): with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 7"): list(tokenize(StringIO(r'"\ud834"'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834'))) with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at index 8"): list(tokenize(StringIO(r'"\ud834\x'))) - with self.assertRaisesRegex(ValueError, "Unpaired UTF-16 surrogate at end of file"): + with self.assertRaisesRegex(ValueError, "Unterminated string at end of file"): list(tokenize(StringIO(r'"\ud834' + '\\'))) def test_unicode_surrogate_pair_non_surrogate(self): @@ -197,5 +197,9 @@ def test_unicode_surrogate_pair_literal_invalid(self): list(tokenize(StringIO(r'"\ud834\ud834"'))) def test_unicode_surrogate_pair_literal_unterminated(self): - with self.assertRaisesRegex(ValueError, r"Unterminated unicode literal at end of file"): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file at index 11"): list(tokenize(StringIO(r'"\ud834\ud83'))) + + def test_unicode_surrogate_pair_literal_unterminated_first_half(self): + with self.assertRaisesRegex(ValueError, r"Unterminated string at end of file"): + list(tokenize(StringIO(r'"\ud83'))) diff --git a/src/json_stream/visitor.py b/src/json_stream/visitor.py index 99edd38..fc15a3d 100644 --- a/src/json_stream/visitor.py +++ b/src/json_stream/visitor.py @@ -19,9 +19,9 @@ def _visit(obj, visitor, path): visitor(obj, path) -def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer): +def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer, buffering=-1, strings_as_files=False): fp = ensure_file(fp_or_iterator) - token_stream = tokenizer(fp) + token_stream = tokenizer(fp, buffering=buffering, strings_as_files=strings_as_files) _, token = next(token_stream) obj = StreamingJSONBase.factory(token, token_stream, persistent=False) _visit(obj, visitor, ())