66Copyright (c) 2019 Daniel Yule
77"""
88import io
9- import unicodedata
9+ from typing import Optional , Tuple
1010
11- SURROGATE = 'Cs'
11+ from json_stream . tokenizer . strings import JsonStringReader
1212
1313
1414class TokenType :
@@ -29,7 +29,6 @@ class State:
2929 FLOATING_POINT_0 = 6
3030 FLOATING_POINT = 8
3131 STRING = 9
32- STRING_ESCAPE = 10
3332 STRING_END = 11
3433 TRUE_1 = 12
3534 TRUE_2 = 13
@@ -41,10 +40,6 @@ class State:
4140 NULL_1 = 19
4241 NULL_2 = 20
4342 NULL_3 = 21
44- UNICODE = 22
45- UNICODE_SURROGATE_START = 23
46- UNICODE_SURROGATE_STRING_ESCAPE = 24
47- UNICODE_SURROGATE = 25
4843
4944
5045class SpecialChar :
@@ -78,22 +73,20 @@ def _ensure_text(stream):
7873 return stream
7974
8075
81- def tokenize (stream , * , buffering = - 1 , ** _ ):
76+ def tokenize (stream , * , buffering = - 1 , strings_as_files , ** _ ):
8277 stream = _ensure_text (stream )
8378
8479 def is_delimiter (char ):
8580 return char .isspace () or char in "{}[]:," or char == SpecialChar .EOF
8681
8782 token = []
88- unicode_buffer = ""
8983 completed = False
90- now_token = ""
84+ now_token : Optional [ Tuple ] = None
9185
9286 def process_char (char ):
93- nonlocal token , completed , now_token , unicode_buffer
87+ nonlocal completed , now_token , state , buffer , index
9488 advance = True
9589 add_char = False
96- next_state = state
9790 if state == State .WHITESPACE :
9891 if char == "{" :
9992 completed = True
@@ -113,67 +106,71 @@ def process_char(char):
113106 elif char == ":" :
114107 completed = True
115108 now_token = (TokenType .OPERATOR , ":" )
116- elif char == "\" " :
117- next_state = State .STRING
109+ elif char == '"' :
110+ state = State .STRING
111+ now_token = (TokenType .STRING , JsonStringReader (stream , buffer ))
112+ if strings_as_files :
113+ completed = True
114+ advance = False
118115 elif char in "123456789" :
119- next_state = State .INTEGER
116+ state = State .INTEGER
120117 add_char = True
121118 elif char == "0" :
122- next_state = State .INTEGER_0
119+ state = State .INTEGER_0
123120 add_char = True
124121 elif char == "-" :
125- next_state = State .INTEGER_SIGN
122+ state = State .INTEGER_SIGN
126123 add_char = True
127124 elif char == "f" :
128- next_state = State .FALSE_1
125+ state = State .FALSE_1
129126 elif char == "t" :
130- next_state = State .TRUE_1
127+ state = State .TRUE_1
131128 elif char == "n" :
132- next_state = State .NULL_1
129+ state = State .NULL_1
133130 elif not char .isspace () and not char == SpecialChar .EOF :
134131 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
135132 elif state == State .INTEGER :
136133 if char in "0123456789" :
137134 add_char = True
138135 elif char == "." :
139- next_state = State .FLOATING_POINT_0
136+ state = State .FLOATING_POINT_0
140137 add_char = True
141138 elif char == "e" or char == 'E' :
142- next_state = State .INTEGER_EXP_0
139+ state = State .INTEGER_EXP_0
143140 add_char = True
144141 elif is_delimiter (char ):
145- next_state = State .WHITESPACE
142+ state = State .WHITESPACE
146143 completed = True
147144 now_token = (TokenType .NUMBER , int ("" .join (token )))
148145 advance = False
149146 else :
150147 raise ValueError ("A number must contain only digits. Got '{}'" .format (char ))
151148 elif state == State .INTEGER_0 :
152149 if char == "." :
153- next_state = State .FLOATING_POINT_0
150+ state = State .FLOATING_POINT_0
154151 add_char = True
155152 elif char == "e" or char == 'E' :
156- next_state = State .INTEGER_EXP_0
153+ state = State .INTEGER_EXP_0
157154 add_char = True
158155 elif is_delimiter (char ):
159- next_state = State .WHITESPACE
156+ state = State .WHITESPACE
160157 completed = True
161158 now_token = (TokenType .NUMBER , 0 )
162159 advance = False
163160 else :
164161 raise ValueError ("A 0 must be followed by a '.' or a 'e'. Got '{0}'" .format (char ))
165162 elif state == State .INTEGER_SIGN :
166163 if char == "0" :
167- next_state = State .INTEGER_0
164+ state = State .INTEGER_0
168165 add_char = True
169166 elif char in "123456789" :
170- next_state = State .INTEGER
167+ state = State .INTEGER
171168 add_char = True
172169 else :
173170 raise ValueError ("A - must be followed by a digit. Got '{0}'" .format (char ))
174171 elif state == State .INTEGER_EXP_0 :
175172 if char == "+" or char == "-" or char in "0123456789" :
176- next_state = State .INTEGER_EXP
173+ state = State .INTEGER_EXP
177174 add_char = True
178175 else :
179176 raise ValueError ("An e in a number must be followed by a '+', '-' or digit. Got '{0}'" .format (char ))
@@ -183,187 +180,108 @@ def process_char(char):
183180 elif is_delimiter (char ):
184181 completed = True
185182 now_token = (TokenType .NUMBER , float ("" .join (token )))
186- next_state = State .WHITESPACE
183+ state = State .WHITESPACE
187184 advance = False
188185 else :
189186 raise ValueError ("A number exponent must consist only of digits. Got '{}'" .format (char ))
190187 elif state == State .FLOATING_POINT :
191188 if char in "0123456789" :
192189 add_char = True
193190 elif char == "e" or char == "E" :
194- next_state = State .INTEGER_EXP_0
191+ state = State .INTEGER_EXP_0
195192 add_char = True
196193 elif is_delimiter (char ):
197194 completed = True
198195 now_token = (TokenType .NUMBER , float ("" .join (token )))
199- next_state = State .WHITESPACE
196+ state = State .WHITESPACE
200197 advance = False
201198 else :
202199 raise ValueError ("A number must include only digits" )
203200 elif state == State .FLOATING_POINT_0 :
204201 if char in "0123456789" :
205- next_state = State .FLOATING_POINT
202+ state = State .FLOATING_POINT
206203 add_char = True
207204 else :
208205 raise ValueError ("A number with a decimal point must be followed by a fractional part" )
209206 elif state == State .FALSE_1 :
210207 if char == "a" :
211- next_state = State .FALSE_2
208+ state = State .FALSE_2
212209 else :
213210 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
214211 elif state == State .FALSE_2 :
215212 if char == "l" :
216- next_state = State .FALSE_3
213+ state = State .FALSE_3
217214 else :
218215 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
219216 elif state == State .FALSE_3 :
220217 if char == "s" :
221- next_state = State .FALSE_4
218+ state = State .FALSE_4
222219 else :
223220 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
224221 elif state == State .FALSE_4 :
225222 if char == "e" :
226- next_state = State .WHITESPACE
223+ state = State .WHITESPACE
227224 completed = True
228225 now_token = (TokenType .BOOLEAN , False )
229226 else :
230227 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
231228 elif state == State .TRUE_1 :
232229 if char == "r" :
233- next_state = State .TRUE_2
230+ state = State .TRUE_2
234231 else :
235232 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
236233 elif state == State .TRUE_2 :
237234 if char == "u" :
238- next_state = State .TRUE_3
235+ state = State .TRUE_3
239236 else :
240237 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
241238 elif state == State .TRUE_3 :
242239 if char == "e" :
243- next_state = State .WHITESPACE
240+ state = State .WHITESPACE
244241 completed = True
245242 now_token = (TokenType .BOOLEAN , True )
246243 else :
247244 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
248245 elif state == State .NULL_1 :
249246 if char == "u" :
250- next_state = State .NULL_2
247+ state = State .NULL_2
251248 else :
252249 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
253250 elif state == State .NULL_2 :
254251 if char == "l" :
255- next_state = State .NULL_3
252+ state = State .NULL_3
256253 else :
257254 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
258255 elif state == State .NULL_3 :
259256 if char == "l" :
260- next_state = State .WHITESPACE
257+ state = State .WHITESPACE
261258 completed = True
262259 now_token = (TokenType .NULL , None )
263260 else :
264261 raise ValueError ("Invalid JSON character: '{0}'" .format (char ))
265262 elif state == State .STRING :
266- if char == "\" " :
263+ reader : JsonStringReader = now_token [1 ]
264+ try :
265+ s = reader .read ()
266+ finally :
267+ index += reader .index
268+ if not strings_as_files :
269+ now_token = (TokenType .STRING , s )
267270 completed = True
268- now_token = (TokenType .STRING , "" .join (token ))
269- next_state = State .STRING_END
270- elif char == "\\ " :
271- next_state = State .STRING_ESCAPE
272- elif char == SpecialChar .EOF :
273- raise ValueError ("Unterminated string at end of file" )
274- else :
275- add_char = True
271+ buffer = reader .buffer
272+ state = State .STRING_END
276273 elif state == State .STRING_END :
277274 if is_delimiter (char ):
278275 advance = False
279- next_state = State .WHITESPACE
276+ state = State .WHITESPACE
280277 else :
281278 raise ValueError ("Expected whitespace or an operator after string. Got '{}'" .format (char ))
282- elif state == State .STRING_ESCAPE :
283- next_state = State .STRING
284- if char == "\\ " or char == "\" " :
285- add_char = True
286- elif char == "b" :
287- char = "\b "
288- add_char = True
289- elif char == "f" :
290- char = "\f "
291- add_char = True
292- elif char == "n" :
293- char = "\n "
294- add_char = True
295- elif char == "t" :
296- char = "\t "
297- add_char = True
298- elif char == "r" :
299- char = "\r "
300- add_char = True
301- elif char == "/" :
302- char = "/"
303- add_char = True
304- elif char == "u" :
305- next_state = State .UNICODE
306- unicode_buffer = ""
307- else :
308- raise ValueError ("Invalid string escape: {}" .format (char ))
309- elif state == State .UNICODE :
310- if char == SpecialChar .EOF :
311- raise ValueError ('Unterminated unicode literal at end of file' )
312- unicode_buffer += char
313- if len (unicode_buffer ) == 4 :
314- try :
315- code_point = int (unicode_buffer , 16 )
316- except ValueError :
317- raise ValueError (f"Invalid unicode literal: \\ u{ unicode_buffer } " )
318- char = chr (code_point )
319- if unicodedata .category (char ) == SURROGATE :
320- next_state = State .UNICODE_SURROGATE_START
321- else :
322- next_state = State .STRING
323- add_char = True
324- elif state == State .UNICODE_SURROGATE_START :
325- if char == "\\ " :
326- next_state = State .UNICODE_SURROGATE_STRING_ESCAPE
327- elif char == SpecialChar .EOF :
328- raise ValueError ("Unpaired UTF-16 surrogate at end of file" )
329- else :
330- raise ValueError (f"Unpaired UTF-16 surrogate" )
331-
332- elif state == State .UNICODE_SURROGATE_STRING_ESCAPE :
333- if char == "u" :
334- next_state = State .UNICODE_SURROGATE
335- elif char == SpecialChar .EOF :
336- raise ValueError ("Unpaired UTF-16 surrogate at end of file" )
337- else :
338- raise ValueError (f"Unpaired UTF-16 surrogate" )
339-
340- elif state == State .UNICODE_SURROGATE :
341- if char == SpecialChar .EOF :
342- raise ValueError ('Unterminated unicode literal at end of file' )
343- unicode_buffer += char
344- if len (unicode_buffer ) == 8 :
345- code_point_1 = int (unicode_buffer [:4 ], 16 )
346- try :
347- code_point_2 = int (unicode_buffer [4 :], 16 )
348- except ValueError :
349- raise ValueError (f"Invalid unicode literal: \\ u{ unicode_buffer [4 :]} " )
350- char = chr (code_point_2 )
351- if unicodedata .category (char ) != SURROGATE :
352- raise ValueError (f"Second half of UTF-16 surrogate pair is not a surrogate!" )
353- try :
354- pair = int .to_bytes (code_point_1 , 2 , 'little' ) + int .to_bytes (code_point_2 , 2 , 'little' )
355- char = pair .decode ('utf-16-le' )
356- except ValueError :
357- raise ValueError (
358- f"Error decoding UTF-16 surrogate pair \\ u{ unicode_buffer [:4 ]} \\ u{ unicode_buffer [4 :]} "
359- )
360- next_state = State .STRING
361- add_char = True
362279
363280 if add_char :
364281 token .append (char )
365282
366- return advance , next_state
283+ return advance
284+
367285 state = State .WHITESPACE
368286 if not buffering :
369287 buffering = 1
@@ -379,13 +297,14 @@ def process_char(char):
379297 c , buffer = buffer [0 ], buffer [1 :] or stream .read (buffering )
380298 index += 1
381299 try :
382- advance , state = process_char (c )
300+ advance = process_char (c )
383301 except ValueError as e :
384302 raise ValueError ("" .join ([e .args [0 ], " at index {}" .format (index )]))
385303 if completed :
386304 completed = False
387305 token = []
388306 yield now_token
307+
389308 process_char (SpecialChar .EOF )
390309 if completed :
391310 yield now_token
0 commit comments