@@ -11,6 +11,8 @@ use compact_str::CompactString;
 use pyo3::exceptions::{PyIOError, PyValueError};
 use pyo3::prelude::*;
 use std::borrow::BorrowMut;
+use std::io;
+use std::mem::swap;
 use std::num::ParseFloatError;
 use std::str::FromStr;
 use thiserror::Error;
@@ -25,19 +27,20 @@ mod read_string;
 mod remainder;
 mod suitable_seekable_buffered_bytes_stream;
 mod suitable_seekable_buffered_text_stream;
-mod suitable_unseekable_buffered_bytes_stream;
-mod suitable_unseekable_buffered_text_stream;
 mod suitable_stream;
 mod suitable_unbuffered_bytes_stream;
 mod suitable_unbuffered_text_stream;
+mod suitable_unseekable_buffered_bytes_stream;
+mod suitable_unseekable_buffered_text_stream;
+mod user_facing_json_string_reader;
 mod utf8_char_source;

 mod char_or_eof;
 use crate::char_or_eof::CharOrEof;
 use CharOrEof::{Char, Eof};

 mod unicode_utils;
-use crate::unicode_utils::{is_surrogate, decode_surrogate_pair, UnicodeError};
+use crate::unicode_utils::{decode_surrogate_pair, is_surrogate, UnicodeError};

 use crate::suitable_stream::BufferingMode;

@@ -123,7 +126,7 @@ impl IntoPy<PyObject> for TokenType {
 }

 #[derive(Error, Debug)]
-pub enum ParsingError {
+pub enum ParsingError {
     #[error("{0}")]
     InvalidJson(String),
     #[error("Error due to limitation: {0}")]
@@ -144,6 +147,24 @@ impl From<UnicodeError> for ParsingError {
     }
 }

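+/// Either a JSON parsing error or an I/O error from the underlying stream, so
+/// the tokenizer internals can return a single error type and let the Python
+/// boundary decide which exception to raise.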
+pub enum JsonStreamingError {
+    ParsingError(ParsingError),
+    IOError(io::Error),
+}
+
+impl From<ParsingError> for JsonStreamingError {
+    fn from(e: ParsingError) -> JsonStreamingError {
+        JsonStreamingError::ParsingError(e)
+    }
+}
+
+impl From<io::Error> for JsonStreamingError {
+    fn from(e: io::Error) -> JsonStreamingError {
+        JsonStreamingError::IOError(e)
+    }
+}
+
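+// Clone is needed because read_next_token returns a copy of the pending token.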
+#[derive(Clone)]
 enum Token {
     Operator(String),
     String_(String),
@@ -179,81 +200,30 @@ impl RustTokenizer {
             prev_charcode: None,
         })
     }
+
     fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
         slf
     }
+
     fn __next__(
         mut slf: PyRefMut<'_, Self>,
         py: Python<'_>,
     ) -> PyResult<Option<(TokenType, Option<PyObject>)>> {
-        let mut now_token;
-        loop {
-            if slf.advance {
-                match slf.stream.read_char() {
-                    Ok(r) => match r {
-                        Some(r) => slf.c = Some(r),
-                        None => slf.c = None,
-                    },
-                    Err(e) => {
-                        let index = slf.index;
-                        return Err(PyIOError::new_err(format!(
-                            "I/O error while parsing (index {index}): {e:?}"
-                        )));
-                    }
-                }
-                slf.index += 1;
-            }
-            match slf.c {
-                Some(c) => {
-                    match RustTokenizer::process_char_py(slf.borrow_mut(), py, Char(c)) {
-                        Ok(tok) => {
-                            now_token = tok;
-                            slf.state = slf.next_state.clone();
-                        }
-                        Err(e) => {
-                            let index = slf.index;
-                            return Err(PyValueError::new_err(format!("{e} at index {index}")));
-                        }
-                    }
-                    if slf.completed {
-                        slf.completed = false;
-                        slf.token = String::new();
-                        return Ok(now_token.clone());
-                    }
-                }
-                None => {
-                    slf.advance = false;
-                    break;
-                }
-            }
-        }
-        match RustTokenizer::process_char_py(slf.borrow_mut(), py, Eof) {
-            Ok(tok) => {
-                now_token = tok;
-            }
-            Err(e) => {
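+        // Delegate the tokenization loop to read_next_token and translate its
+        // error variants into the matching Python exceptions.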
+        RustTokenizer::read_next_token(&mut slf)
+            .map(|maybe_tok| maybe_tok.map(|tok| RustTokenizer::token_to_py_tuple(tok, py)))
+            .map_err(|e| -> PyErr {
                 let index = slf.index;
-                return Err(PyValueError::new_err(format!("{e} at index {index}")));
-            }
-        }
-        if slf.completed {
-            match now_token {
-                Some(now_token) => {
-                    // these are just to ensure in the next iteration we'll end
-                    // up in the slf.completed = false branch and quit:
-                    slf.completed = false;
-                    slf.state = State::Whitespace;
-                    // final token
-                    return Ok(Some(now_token));
-                }
-                None => {
-                    return Ok(None);
+                match e {
+                    JsonStreamingError::ParsingError(e) => {
+                        PyValueError::new_err(format!("{e} at index {index}"))
+                    }
+                    JsonStreamingError::IOError(e) => PyIOError::new_err(format!(
+                        "I/O error while parsing (index {index}): {e:?}"
+                    )),
                 }
-            }
-        } else {
-            return Ok(None);
-        }
+            })
     }
+
     /// Rewind the inner Python stream/file to undo readahead buffering.
     ///
     /// Required because reading char-by-char without buffering is
@@ -292,20 +262,60 @@ impl RustTokenizer {
 }

 impl RustTokenizer {
-    fn process_char_py<'a>(
-        slf: &mut Self,
-        py: Python<'_>,
-        c: CharOrEof,
-    ) -> Result<Option<(TokenType, Option<PyObject>)>, ParsingError> {
-        match RustTokenizer::process_char(slf.borrow_mut(), c) {
-            Ok(Some(Token::Operator(s))) => Ok(Some((TokenType::Operator, Some(s.into_py(py))))),
-            Ok(Some(Token::String_(s))) => Ok(Some((TokenType::String_, Some(s.into_py(py))))),
-            Ok(Some(Token::Integer(n))) => Ok(Some((TokenType::Number, Some(n.into_py(py))))),
-            Ok(Some(Token::Float(f))) => Ok(Some((TokenType::Number, Some(f.into_py(py))))),
-            Ok(Some(Token::Boolean(b))) => Ok(Some((TokenType::Boolean, Some(b.into_py(py))))),
-            Ok(Some(Token::Null)) => Ok(Some((TokenType::Null, None))),
-            Ok(None) => Ok(None),
-            Err(e) => Err(e),
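+    /// Drive the state machine until a complete token is available, returning
+    /// None once the end of the input has been fully processed.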
+    fn read_next_token(slf: &mut Self) -> Result<Option<Token>, JsonStreamingError> {
+        let mut now_token;
+        loop {
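+            // Only read a new character if the previous one was fully consumed.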
+            if slf.advance {
+                match slf.stream.read_char()? {
+                    Some(r) => slf.c = Some(r),
+                    None => slf.c = None,
+                }
+                slf.index += 1;
+            }
+            match slf.c {
+                Some(c) => {
+                    now_token = RustTokenizer::process_char(slf.borrow_mut(), Char(c))?;
+                    slf.state = slf.next_state.clone();
+                    if slf.completed {
+                        slf.completed = false;
+                        slf.token = String::new();
+                        return Ok(now_token.clone());
+                    }
+                }
+                None => {
+                    slf.advance = false;
+                    break;
+                }
+            }
+        }
+        now_token = RustTokenizer::process_char(slf.borrow_mut(), Eof)?;
+        if slf.completed {
+            match now_token {
+                Some(now_token) => {
+                    // these are just to ensure in the next iteration we'll end
+                    // up in the slf.completed = false branch and quit:
+                    slf.completed = false;
+                    slf.state = State::Whitespace;
+                    // final token
+                    return Ok(Some(now_token));
+                }
+                None => {
+                    return Ok(None);
+                }
+            }
+        } else {
+            return Ok(None);
+        }
+    }
+
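+    /// Map a Rust Token to the (TokenType, value) tuple handed to Python.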
+    fn token_to_py_tuple<'a>(tok: Token, py: Python<'_>) -> (TokenType, Option<PyObject>) {
+        match tok {
+            Token::Operator(s) => (TokenType::Operator, Some(s.into_py(py))),
+            Token::String_(s) => (TokenType::String_, Some(s.into_py(py))),
+            Token::Integer(n) => (TokenType::Number, Some(n.into_py(py))),
+            Token::Float(f) => (TokenType::Number, Some(f.into_py(py))),
+            Token::Boolean(b) => (TokenType::Boolean, Some(b.into_py(py))),
+            Token::Null => (TokenType::Null, None),
         }
     }

@@ -372,7 +382,7 @@ impl RustTokenizer {
372382 "Invalid JSON character: {c:?}"
373383 ) ) ) ;
374384 }
375- } ,
385+ }
376386 Eof => ( ) ,
377387 } ,
378388 State :: Integer => match c {
@@ -721,41 +731,37 @@ impl RustTokenizer {
                     }
                 }
             }
-            State::UnicodeSurrogateStart => {
-                match c {
-                    Char('\\') => {
-                        slf.next_state = State::UnicodeSurrogateStringEscape;
-                    }
-                    Char(_) => {
-                        return Err(ParsingError::InvalidJson(format!(
-                            "Unpaired UTF-16 surrogate"
-                        )));
-                    }
-                    Eof => {
-                        return Err(ParsingError::InvalidJson(format!(
-                            "Unpaired UTF-16 surrogate at end of file"
-                        )));
-                    }
+            State::UnicodeSurrogateStart => match c {
+                Char('\\') => {
+                    slf.next_state = State::UnicodeSurrogateStringEscape;
                 }
-            }
-            State::UnicodeSurrogateStringEscape => {
-                match c {
-                    Char('u') => {
-                        slf.unicode_buffer = CompactString::with_capacity(4);
-                        slf.next_state = State::UnicodeSurrogate;
-                    }
-                    Char(_) => {
-                        return Err(ParsingError::InvalidJson(format!(
-                            "Unpaired UTF-16 surrogate"
-                        )));
-                    }
-                    Eof => {
-                        return Err(ParsingError::InvalidJson(format!(
-                            "Unpaired UTF-16 surrogate at end of file"
-                        )));
-                    }
+                Char(_) => {
+                    return Err(ParsingError::InvalidJson(format!(
+                        "Unpaired UTF-16 surrogate"
+                    )));
                 }
-            }
+                Eof => {
+                    return Err(ParsingError::InvalidJson(format!(
+                        "Unpaired UTF-16 surrogate at end of file"
+                    )));
+                }
+            },
+            State::UnicodeSurrogateStringEscape => match c {
+                Char('u') => {
+                    slf.unicode_buffer = CompactString::with_capacity(4);
+                    slf.next_state = State::UnicodeSurrogate;
+                }
+                Char(_) => {
+                    return Err(ParsingError::InvalidJson(format!(
+                        "Unpaired UTF-16 surrogate"
+                    )));
+                }
+                Eof => {
+                    return Err(ParsingError::InvalidJson(format!(
+                        "Unpaired UTF-16 surrogate at end of file"
+                    )));
+                }
+            },
             State::UnicodeSurrogate => {
                 match c {
                     Char(c) => {
@@ -786,13 +792,12 @@ impl RustTokenizer {
786792 "This should never happen, please report it as a bug..."
787793 ) ) ) ;
788794 } ;
789- c = Char (
790- decode_surrogate_pair ( prev_charcode, charcode)
791- . map_err ( |_| ParsingError :: InvalidJson ( format ! (
795+ c = Char ( decode_surrogate_pair ( prev_charcode, charcode) . map_err ( |_| {
796+ ParsingError :: InvalidJson ( format ! (
792797 "Error decoding UTF-16 surrogate pair \
793798 \\ u{prev_charcode:x}\\ u{charcode:x}"
794- ) ) ) ?
795- ) ;
799+ ) )
800+ } ) ? ) ;
796801 slf. prev_charcode = None ;
797802 slf. next_state = State :: String_ ;
798803 add_char = true ;
@@ -808,6 +813,30 @@ impl RustTokenizer {

         Ok(now_token)
     }
+
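+    /// Accumulate string characters from the stream into self.token, stopping
+    /// after max_n_chars characters when a limit is given.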
+    fn parse_string_contents<'a>(
+        &mut self,
+        max_n_chars: Option<usize>,
+    ) -> Result<Option<String>, JsonStreamingError> {
+        while max_n_chars.map_or(true, |n| self.token.len() < n) {
+            let c = match self
+                .stream
+                .read_char()
+                .map_err(|e| <io::Error as Into<JsonStreamingError>>::into(e))?
+            {
+                Some(c) => Char(c),
+                None => Eof,
+            };
+            RustTokenizer::process_char(self, c)?;
+            if let State::StringEnd = self.next_state {
+                self.completed = false;
+                self.advance = true;
+            }
+        }
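+        // Hand back the accumulated text, leaving an empty buffer in its place.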
+        let mut s = String::new();
+        swap(&mut s, &mut self.token);
+        Ok(Some(s))
+    }
 }

 /// supports_bigint()