77use crate :: int:: { AppropriateInt , ParseIntError } ;
88use crate :: remainder:: StreamData ;
99use crate :: suitable_stream:: { make_suitable_stream, SuitableStream } ;
10+ use crate :: user_facing_json_string_reader:: UserFacingJsonStringReader ;
1011use compact_str:: CompactString ;
1112use pyo3:: exceptions:: { PyIOError , PyValueError } ;
1213use pyo3:: prelude:: * ;
1314use std:: borrow:: BorrowMut ;
15+ use std:: io;
16+ use std:: mem:: swap;
1417use std:: num:: ParseFloatError ;
1518use std:: str:: FromStr ;
1619use thiserror:: Error ;
@@ -30,6 +33,7 @@ mod suitable_unbuffered_bytes_stream;
3033mod suitable_unbuffered_text_stream;
3134mod suitable_unseekable_buffered_bytes_stream;
3235mod suitable_unseekable_buffered_text_stream;
36+ mod user_facing_json_string_reader;
3337mod utf8_char_source;
3438
3539mod char_or_eof;
@@ -87,6 +91,7 @@ enum State {
8791/// UTF-8).
8892/// buffering: Internal buffer size. -1 (the default) means to let the
8993/// implementation choose a buffer size. Can conflict with `correct_cursor`.
94+ /// strings_as_files: Whether to return strings as file-like objects instead.
9095/// correct_cursor: *(not part of API yet, may be removed at any point)*
9196/// Whether it is required that the cursor is left in the correct position
9297/// (behind the last processed character) after park_cursor() has been
@@ -95,9 +100,10 @@ enum State {
95100/// unrelated to the actual tokenization progress. For seekable streams, the
96101/// improvement shouldn't be noticeable.
97102#[ pyclass]
98- #[ pyo3( text_signature = "(stream, *, buffering=-1, correct_cursor=True)" ) ]
99- struct RustTokenizer {
103+ #[ pyo3( text_signature = "(stream, *, buffering=-1, strings_as_files=False, correct_cursor=True)" ) ]
104+ pub struct RustTokenizer {
100105 stream : Box < dyn SuitableStream + Send > ,
106+ strings_as_files : bool ,
101107 completed : bool ,
102108 advance : bool ,
103109 token : String ,
@@ -144,9 +150,28 @@ impl From<UnicodeError> for ParsingError {
144150 }
145151}
146152
153+ pub enum JsonStreamingError {
154+ ParsingError ( ParsingError ) ,
155+ IOError ( io:: Error ) ,
156+ }
157+
158+ impl From < ParsingError > for JsonStreamingError {
159+ fn from ( e : ParsingError ) -> JsonStreamingError {
160+ JsonStreamingError :: ParsingError ( e)
161+ }
162+ }
163+
164+ impl From < io:: Error > for JsonStreamingError {
165+ fn from ( e : io:: Error ) -> JsonStreamingError {
166+ JsonStreamingError :: IOError ( e)
167+ }
168+ }
169+
170+ #[ derive( Clone ) ]
147171enum Token {
148172 Operator ( String ) ,
149173 String_ ( String ) ,
174+ StringAsFile , // handled specially
150175 Integer ( AppropriateInt ) ,
151176 Float ( f64 ) ,
152177 Boolean ( bool ) ,
@@ -156,8 +181,13 @@ enum Token {
156181#[ pymethods]
157182impl RustTokenizer {
158183 #[ new]
159- #[ args( "*" , buffering = -1 , correct_cursor = "true" ) ]
160- fn new ( stream : PyObject , buffering : i64 , correct_cursor : bool ) -> PyResult < Self > {
184+ #[ args( "*" , buffering = -1 , strings_as_files = "false" , correct_cursor = "true" ) ]
185+ fn new (
186+ stream : PyObject ,
187+ buffering : i64 ,
188+ strings_as_files : bool ,
189+ correct_cursor : bool ,
190+ ) -> PyResult < Self > {
161191 let buffering_mode = if buffering < 0 {
162192 BufferingMode :: DontCare
163193 } else if buffering == 0 || buffering == 1 {
@@ -168,6 +198,7 @@ impl RustTokenizer {
168198 let stream = make_suitable_stream ( stream, buffering_mode, correct_cursor) ?;
169199 Ok ( RustTokenizer {
170200 stream,
201+ strings_as_files,
171202 completed : false ,
172203 advance : true ,
173204 token : String :: new ( ) ,
@@ -179,81 +210,34 @@ impl RustTokenizer {
179210 prev_charcode : None ,
180211 } )
181212 }
213+
182214 fn __iter__ ( slf : PyRef < ' _ , Self > ) -> PyRef < ' _ , Self > {
183215 slf
184216 }
217+
185218 fn __next__ (
186219 mut slf : PyRefMut < ' _ , Self > ,
187220 py : Python < ' _ > ,
188221 ) -> PyResult < Option < ( TokenType , Option < PyObject > ) > > {
189- let mut now_token;
190- loop {
191- if slf. advance {
192- match slf. stream . read_char ( ) {
193- Ok ( r) => match r {
194- Some ( r) => slf. c = Some ( r) ,
195- None => slf. c = None ,
196- } ,
197- Err ( e) => {
198- let index = slf. index ;
199- return Err ( PyIOError :: new_err ( format ! (
200- "I/O error while parsing (index {index}): {e:?}"
201- ) ) ) ;
202- }
203- }
204- slf. index += 1 ;
205- }
206- match slf. c {
207- Some ( c) => {
208- match RustTokenizer :: process_char_py ( slf. borrow_mut ( ) , py, Char ( c) ) {
209- Ok ( tok) => {
210- now_token = tok;
211- slf. state = slf. next_state . clone ( ) ;
212- }
213- Err ( e) => {
214- let index = slf. index ;
215- return Err ( PyValueError :: new_err ( format ! ( "{e} at index {index}" ) ) ) ;
216- }
217- }
218- if slf. completed {
219- slf. completed = false ;
220- slf. token = String :: new ( ) ;
221- return Ok ( now_token. clone ( ) ) ;
222- }
223- }
224- None => {
225- slf. advance = false ;
226- break ;
227- }
228- }
229- }
230- match RustTokenizer :: process_char_py ( slf. borrow_mut ( ) , py, Eof ) {
231- Ok ( tok) => {
232- now_token = tok;
233- }
234- Err ( e) => {
222+ match RustTokenizer :: read_next_token ( & mut slf) {
223+ Ok ( maybe_tok) => Ok ( match maybe_tok {
224+ Some ( tok) => Some ( RustTokenizer :: token_to_py_tuple ( slf, tok, py) ) ,
225+ None => None ,
226+ } ) ,
227+ Err ( e) => Err ( {
235228 let index = slf. index ;
236- return Err ( PyValueError :: new_err ( format ! ( "{e} at index {index}" ) ) ) ;
237- }
238- }
239- if slf. completed {
240- match now_token {
241- Some ( now_token) => {
242- // these are just to ensure in the next iteration we'll end
243- // up in the slf.completed = false branch and quit:
244- slf. completed = false ;
245- slf. state = State :: Whitespace ;
246- // final token
247- return Ok ( Some ( now_token) ) ;
248- }
249- None => {
250- return Ok ( None ) ;
229+ match e {
230+ JsonStreamingError :: ParsingError ( e) => {
231+ PyValueError :: new_err ( format ! ( "{e} at index {index}" ) )
232+ }
233+ JsonStreamingError :: IOError ( e) => PyIOError :: new_err ( format ! (
234+ "I/O error while parsing (index {index}): {e:?}"
235+ ) ) ,
251236 }
252- }
253- } else {
254- return Ok ( None ) ;
237+ } ) ,
255238 }
256239 }
240+
257241 /// Rewind the inner Python stream/file to undo readahead buffering.
258242 ///
259243 /// Required because reading char-by-char without buffering is
@@ -292,20 +276,68 @@ impl RustTokenizer {
292276}
293277
294278impl RustTokenizer {
295- fn process_char_py < ' a > (
296- slf : & mut Self ,
279+ fn read_next_token ( slf : & mut Self ) -> Result < Option < Token > , JsonStreamingError > {
280+ let mut now_token;
281+ loop {
282+ if slf. advance {
283+ match slf. stream . read_char ( ) ? {
284+ Some ( r) => slf. c = Some ( r) ,
285+ None => slf. c = None ,
286+ }
287+ slf. index += 1 ;
288+ }
289+ match slf. c {
290+ Some ( c) => {
291+ now_token = RustTokenizer :: process_char ( slf. borrow_mut ( ) , Char ( c) ) ?;
292+ slf. state = slf. next_state . clone ( ) ;
293+ if slf. completed {
294+ slf. completed = false ;
295+ slf. token = String :: new ( ) ;
296+ return Ok ( now_token. clone ( ) ) ;
297+ }
298+ }
299+ None => {
300+ slf. advance = false ;
301+ break ;
302+ }
303+ }
304+ }
305+ now_token = RustTokenizer :: process_char ( slf. borrow_mut ( ) , Eof ) ?;
306+ if slf. completed {
307+ match now_token {
308+ Some ( now_token) => {
309+ // these are just to ensure in the next iteration we'll end
310+ // up in the slf.completed = false branch and quit:
311+ slf. completed = false ;
312+ slf. state = State :: Whitespace ;
313+ // final token
314+ return Ok ( Some ( now_token) ) ;
315+ }
316+ None => {
317+ return Ok ( None ) ;
318+ }
319+ }
320+ } else {
321+ return Ok ( None ) ;
322+ }
323+ }
324+
325+ fn token_to_py_tuple < ' a > (
326+ slf : PyRefMut < ' _ , Self > ,
327+ tok : Token ,
297328 py : Python < ' _ > ,
298- c : CharOrEof ,
299- ) -> Result < Option < ( TokenType , Option < PyObject > ) > , ParsingError > {
300- match RustTokenizer :: process_char ( slf. borrow_mut ( ) , c) {
301- Ok ( Some ( Token :: Operator ( s) ) ) => Ok ( Some ( ( TokenType :: Operator , Some ( s. into_py ( py) ) ) ) ) ,
302- Ok ( Some ( Token :: String_ ( s) ) ) => Ok ( Some ( ( TokenType :: String_ , Some ( s. into_py ( py) ) ) ) ) ,
303- Ok ( Some ( Token :: Integer ( n) ) ) => Ok ( Some ( ( TokenType :: Number , Some ( n. into_py ( py) ) ) ) ) ,
304- Ok ( Some ( Token :: Float ( f) ) ) => Ok ( Some ( ( TokenType :: Number , Some ( f. into_py ( py) ) ) ) ) ,
305- Ok ( Some ( Token :: Boolean ( b) ) ) => Ok ( Some ( ( TokenType :: Boolean , Some ( b. into_py ( py) ) ) ) ) ,
306- Ok ( Some ( Token :: Null ) ) => Ok ( Some ( ( TokenType :: Null , None ) ) ) ,
307- Ok ( None ) => Ok ( None ) ,
308- Err ( e) => Err ( e) ,
329+ ) -> ( TokenType , Option < PyObject > ) {
330+ match tok {
331+ Token :: Operator ( s) => ( TokenType :: Operator , Some ( s. into_py ( py) ) ) ,
332+ Token :: String_ ( s) => ( TokenType :: String_ , Some ( s. into_py ( py) ) ) ,
333+ Token :: StringAsFile => (
334+ TokenType :: String_ ,
335+ Some ( UserFacingJsonStringReader :: new ( slf. into ( ) ) . into_py ( py) ) ,
336+ ) ,
337+ Token :: Integer ( n) => ( TokenType :: Number , Some ( n. into_py ( py) ) ) ,
338+ Token :: Float ( f) => ( TokenType :: Number , Some ( f. into_py ( py) ) ) ,
339+ Token :: Boolean ( b) => ( TokenType :: Boolean , Some ( b. into_py ( py) ) ) ,
340+ Token :: Null => ( TokenType :: Null , None ) ,
309341 }
310342 }
311343
@@ -344,6 +376,10 @@ impl RustTokenizer {
344376 }
345377 Char ( '"' ) => {
346378 slf. next_state = State :: String_ ;
379+ if slf. strings_as_files {
380+ slf. completed = true ;
381+ now_token = Some ( Token :: StringAsFile ) ;
382+ }
347383 }
348384 Char ( '1' ..='9' ) => {
349385 slf. next_state = State :: Integer ;
@@ -803,6 +839,31 @@ impl RustTokenizer {
803839
804840 Ok ( now_token)
805841 }
842+
843+ fn parse_string_contents < ' a > (
844+ & mut self ,
845+ max_n_chars : Option < usize > ,
846+ ) -> Result < Option < String > , JsonStreamingError > {
847+ while max_n_chars. map_or ( true , |n| self . token . len ( ) < n) {
848+ let c = match self
849+ . stream
850+ . read_char ( )
851+ . map_err ( |e| <io:: Error as Into < JsonStreamingError > >:: into ( e) ) ?
852+ {
853+ Some ( c) => Char ( c) ,
854+ None => Eof ,
855+ } ;
856+ self . index += 1 ; // TODO DRY => pull into new read_char() method on this cls?
857+ RustTokenizer :: process_char ( self , c) ?;
858+ if let State :: StringEnd = self . next_state {
859+ self . completed = false ;
860+ self . advance = true ;
861+ }
862+ }
863+ let mut s = String :: new ( ) ;
864+ swap ( & mut s, & mut self . token ) ;
865+ Ok ( Some ( s) )
866+ }
806867}
807868
808869/// supports_bigint()
0 commit comments