Skip to content

Commit 61ef0f9

Browse files
committed
added json string stream
1 parent 524ceaf commit 61ef0f9

File tree

5 files changed

+358
-139
lines changed

5 files changed

+358
-139
lines changed

src/json_stream/tokenizer.py renamed to src/json_stream/tokenizer/__init__.py

Lines changed: 54 additions & 135 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,9 @@
66
Copyright (c) 2019 Daniel Yule
77
"""
88
import io
9-
import unicodedata
9+
from typing import Optional, Tuple
1010

11-
SURROGATE = 'Cs'
11+
from json_stream.tokenizer.strings import JsonStringReader
1212

1313

1414
class TokenType:
@@ -29,7 +29,6 @@ class State:
2929
FLOATING_POINT_0 = 6
3030
FLOATING_POINT = 8
3131
STRING = 9
32-
STRING_ESCAPE = 10
3332
STRING_END = 11
3433
TRUE_1 = 12
3534
TRUE_2 = 13
@@ -41,10 +40,6 @@ class State:
4140
NULL_1 = 19
4241
NULL_2 = 20
4342
NULL_3 = 21
44-
UNICODE = 22
45-
UNICODE_SURROGATE_START = 23
46-
UNICODE_SURROGATE_STRING_ESCAPE = 24
47-
UNICODE_SURROGATE = 25
4843

4944

5045
class SpecialChar:
@@ -78,22 +73,20 @@ def _ensure_text(stream):
7873
return stream
7974

8075

81-
def tokenize(stream, *, buffering=-1, **_):
76+
def tokenize(stream, *, buffering=-1, strings_as_files, **_):
8277
stream = _ensure_text(stream)
8378

8479
def is_delimiter(char):
8580
return char.isspace() or char in "{}[]:," or char == SpecialChar.EOF
8681

8782
token = []
88-
unicode_buffer = ""
8983
completed = False
90-
now_token = ""
84+
now_token: Optional[Tuple] = None
9185

9286
def process_char(char):
93-
nonlocal token, completed, now_token, unicode_buffer
87+
nonlocal completed, now_token, state, buffer, index
9488
advance = True
9589
add_char = False
96-
next_state = state
9790
if state == State.WHITESPACE:
9891
if char == "{":
9992
completed = True
@@ -113,67 +106,71 @@ def process_char(char):
113106
elif char == ":":
114107
completed = True
115108
now_token = (TokenType.OPERATOR, ":")
116-
elif char == "\"":
117-
next_state = State.STRING
109+
elif char == '"':
110+
state = State.STRING
111+
now_token = (TokenType.STRING, JsonStringReader(stream, buffer))
112+
if strings_as_files:
113+
completed = True
114+
advance = False
118115
elif char in "123456789":
119-
next_state = State.INTEGER
116+
state = State.INTEGER
120117
add_char = True
121118
elif char == "0":
122-
next_state = State.INTEGER_0
119+
state = State.INTEGER_0
123120
add_char = True
124121
elif char == "-":
125-
next_state = State.INTEGER_SIGN
122+
state = State.INTEGER_SIGN
126123
add_char = True
127124
elif char == "f":
128-
next_state = State.FALSE_1
125+
state = State.FALSE_1
129126
elif char == "t":
130-
next_state = State.TRUE_1
127+
state = State.TRUE_1
131128
elif char == "n":
132-
next_state = State.NULL_1
129+
state = State.NULL_1
133130
elif not char.isspace() and not char == SpecialChar.EOF:
134131
raise ValueError("Invalid JSON character: '{0}'".format(char))
135132
elif state == State.INTEGER:
136133
if char in "0123456789":
137134
add_char = True
138135
elif char == ".":
139-
next_state = State.FLOATING_POINT_0
136+
state = State.FLOATING_POINT_0
140137
add_char = True
141138
elif char == "e" or char == 'E':
142-
next_state = State.INTEGER_EXP_0
139+
state = State.INTEGER_EXP_0
143140
add_char = True
144141
elif is_delimiter(char):
145-
next_state = State.WHITESPACE
142+
state = State.WHITESPACE
146143
completed = True
147144
now_token = (TokenType.NUMBER, int("".join(token)))
148145
advance = False
149146
else:
150147
raise ValueError("A number must contain only digits. Got '{}'".format(char))
151148
elif state == State.INTEGER_0:
152149
if char == ".":
153-
next_state = State.FLOATING_POINT_0
150+
state = State.FLOATING_POINT_0
154151
add_char = True
155152
elif char == "e" or char == 'E':
156-
next_state = State.INTEGER_EXP_0
153+
state = State.INTEGER_EXP_0
157154
add_char = True
158155
elif is_delimiter(char):
159-
next_state = State.WHITESPACE
156+
state = State.WHITESPACE
160157
completed = True
161158
now_token = (TokenType.NUMBER, 0)
162159
advance = False
163160
else:
164161
raise ValueError("A 0 must be followed by a '.' or a 'e'. Got '{0}'".format(char))
165162
elif state == State.INTEGER_SIGN:
166163
if char == "0":
167-
next_state = State.INTEGER_0
164+
state = State.INTEGER_0
168165
add_char = True
169166
elif char in "123456789":
170-
next_state = State.INTEGER
167+
state = State.INTEGER
171168
add_char = True
172169
else:
173170
raise ValueError("A - must be followed by a digit. Got '{0}'".format(char))
174171
elif state == State.INTEGER_EXP_0:
175172
if char == "+" or char == "-" or char in "0123456789":
176-
next_state = State.INTEGER_EXP
173+
state = State.INTEGER_EXP
177174
add_char = True
178175
else:
179176
raise ValueError("An e in a number must be followed by a '+', '-' or digit. Got '{0}'".format(char))
@@ -183,187 +180,108 @@ def process_char(char):
183180
elif is_delimiter(char):
184181
completed = True
185182
now_token = (TokenType.NUMBER, float("".join(token)))
186-
next_state = State.WHITESPACE
183+
state = State.WHITESPACE
187184
advance = False
188185
else:
189186
raise ValueError("A number exponent must consist only of digits. Got '{}'".format(char))
190187
elif state == State.FLOATING_POINT:
191188
if char in "0123456789":
192189
add_char = True
193190
elif char == "e" or char == "E":
194-
next_state = State.INTEGER_EXP_0
191+
state = State.INTEGER_EXP_0
195192
add_char = True
196193
elif is_delimiter(char):
197194
completed = True
198195
now_token = (TokenType.NUMBER, float("".join(token)))
199-
next_state = State.WHITESPACE
196+
state = State.WHITESPACE
200197
advance = False
201198
else:
202199
raise ValueError("A number must include only digits")
203200
elif state == State.FLOATING_POINT_0:
204201
if char in "0123456789":
205-
next_state = State.FLOATING_POINT
202+
state = State.FLOATING_POINT
206203
add_char = True
207204
else:
208205
raise ValueError("A number with a decimal point must be followed by a fractional part")
209206
elif state == State.FALSE_1:
210207
if char == "a":
211-
next_state = State.FALSE_2
208+
state = State.FALSE_2
212209
else:
213210
raise ValueError("Invalid JSON character: '{0}'".format(char))
214211
elif state == State.FALSE_2:
215212
if char == "l":
216-
next_state = State.FALSE_3
213+
state = State.FALSE_3
217214
else:
218215
raise ValueError("Invalid JSON character: '{0}'".format(char))
219216
elif state == State.FALSE_3:
220217
if char == "s":
221-
next_state = State.FALSE_4
218+
state = State.FALSE_4
222219
else:
223220
raise ValueError("Invalid JSON character: '{0}'".format(char))
224221
elif state == State.FALSE_4:
225222
if char == "e":
226-
next_state = State.WHITESPACE
223+
state = State.WHITESPACE
227224
completed = True
228225
now_token = (TokenType.BOOLEAN, False)
229226
else:
230227
raise ValueError("Invalid JSON character: '{0}'".format(char))
231228
elif state == State.TRUE_1:
232229
if char == "r":
233-
next_state = State.TRUE_2
230+
state = State.TRUE_2
234231
else:
235232
raise ValueError("Invalid JSON character: '{0}'".format(char))
236233
elif state == State.TRUE_2:
237234
if char == "u":
238-
next_state = State.TRUE_3
235+
state = State.TRUE_3
239236
else:
240237
raise ValueError("Invalid JSON character: '{0}'".format(char))
241238
elif state == State.TRUE_3:
242239
if char == "e":
243-
next_state = State.WHITESPACE
240+
state = State.WHITESPACE
244241
completed = True
245242
now_token = (TokenType.BOOLEAN, True)
246243
else:
247244
raise ValueError("Invalid JSON character: '{0}'".format(char))
248245
elif state == State.NULL_1:
249246
if char == "u":
250-
next_state = State.NULL_2
247+
state = State.NULL_2
251248
else:
252249
raise ValueError("Invalid JSON character: '{0}'".format(char))
253250
elif state == State.NULL_2:
254251
if char == "l":
255-
next_state = State.NULL_3
252+
state = State.NULL_3
256253
else:
257254
raise ValueError("Invalid JSON character: '{0}'".format(char))
258255
elif state == State.NULL_3:
259256
if char == "l":
260-
next_state = State.WHITESPACE
257+
state = State.WHITESPACE
261258
completed = True
262259
now_token = (TokenType.NULL, None)
263260
else:
264261
raise ValueError("Invalid JSON character: '{0}'".format(char))
265262
elif state == State.STRING:
266-
if char == "\"":
263+
reader: JsonStringReader = now_token[1]
264+
try:
265+
s = reader.read()
266+
finally:
267+
index += reader.index
268+
if not strings_as_files:
269+
now_token = (TokenType.STRING, s)
267270
completed = True
268-
now_token = (TokenType.STRING, "".join(token))
269-
next_state = State.STRING_END
270-
elif char == "\\":
271-
next_state = State.STRING_ESCAPE
272-
elif char == SpecialChar.EOF:
273-
raise ValueError("Unterminated string at end of file")
274-
else:
275-
add_char = True
271+
buffer = reader.buffer
272+
state = State.STRING_END
276273
elif state == State.STRING_END:
277274
if is_delimiter(char):
278275
advance = False
279-
next_state = State.WHITESPACE
276+
state = State.WHITESPACE
280277
else:
281278
raise ValueError("Expected whitespace or an operator after string. Got '{}'".format(char))
282-
elif state == State.STRING_ESCAPE:
283-
next_state = State.STRING
284-
if char == "\\" or char == "\"":
285-
add_char = True
286-
elif char == "b":
287-
char = "\b"
288-
add_char = True
289-
elif char == "f":
290-
char = "\f"
291-
add_char = True
292-
elif char == "n":
293-
char = "\n"
294-
add_char = True
295-
elif char == "t":
296-
char = "\t"
297-
add_char = True
298-
elif char == "r":
299-
char = "\r"
300-
add_char = True
301-
elif char == "/":
302-
char = "/"
303-
add_char = True
304-
elif char == "u":
305-
next_state = State.UNICODE
306-
unicode_buffer = ""
307-
else:
308-
raise ValueError("Invalid string escape: {}".format(char))
309-
elif state == State.UNICODE:
310-
if char == SpecialChar.EOF:
311-
raise ValueError('Unterminated unicode literal at end of file')
312-
unicode_buffer += char
313-
if len(unicode_buffer) == 4:
314-
try:
315-
code_point = int(unicode_buffer, 16)
316-
except ValueError:
317-
raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer}")
318-
char = chr(code_point)
319-
if unicodedata.category(char) == SURROGATE:
320-
next_state = State.UNICODE_SURROGATE_START
321-
else:
322-
next_state = State.STRING
323-
add_char = True
324-
elif state == State.UNICODE_SURROGATE_START:
325-
if char == "\\":
326-
next_state = State.UNICODE_SURROGATE_STRING_ESCAPE
327-
elif char == SpecialChar.EOF:
328-
raise ValueError("Unpaired UTF-16 surrogate at end of file")
329-
else:
330-
raise ValueError(f"Unpaired UTF-16 surrogate")
331-
332-
elif state == State.UNICODE_SURROGATE_STRING_ESCAPE:
333-
if char == "u":
334-
next_state = State.UNICODE_SURROGATE
335-
elif char == SpecialChar.EOF:
336-
raise ValueError("Unpaired UTF-16 surrogate at end of file")
337-
else:
338-
raise ValueError(f"Unpaired UTF-16 surrogate")
339-
340-
elif state == State.UNICODE_SURROGATE:
341-
if char == SpecialChar.EOF:
342-
raise ValueError('Unterminated unicode literal at end of file')
343-
unicode_buffer += char
344-
if len(unicode_buffer) == 8:
345-
code_point_1 = int(unicode_buffer[:4], 16)
346-
try:
347-
code_point_2 = int(unicode_buffer[4:], 16)
348-
except ValueError:
349-
raise ValueError(f"Invalid unicode literal: \\u{unicode_buffer[4:]}")
350-
char = chr(code_point_2)
351-
if unicodedata.category(char) != SURROGATE:
352-
raise ValueError(f"Second half of UTF-16 surrogate pair is not a surrogate!")
353-
try:
354-
pair = int.to_bytes(code_point_1, 2, 'little') + int.to_bytes(code_point_2, 2, 'little')
355-
char = pair.decode('utf-16-le')
356-
except ValueError:
357-
raise ValueError(
358-
f"Error decoding UTF-16 surrogate pair \\u{unicode_buffer[:4]}\\u{unicode_buffer[4:]}"
359-
)
360-
next_state = State.STRING
361-
add_char = True
362279

363280
if add_char:
364281
token.append(char)
365282

366-
return advance, next_state
283+
return advance
284+
367285
state = State.WHITESPACE
368286
if not buffering:
369287
buffering = 1
@@ -379,13 +297,14 @@ def process_char(char):
379297
c, buffer = buffer[0], buffer[1:] or stream.read(buffering)
380298
index += 1
381299
try:
382-
advance, state = process_char(c)
300+
advance = process_char(c)
383301
except ValueError as e:
384302
raise ValueError("".join([e.args[0], " at index {}".format(index)]))
385303
if completed:
386304
completed = False
387305
token = []
388306
yield now_token
307+
389308
process_char(SpecialChar.EOF)
390309
if completed:
391310
yield now_token

0 commit comments

Comments
 (0)