Skip to content

Commit d12aaae

Browse files
committed
WIP: String streaming (mostly w/in main class)
Real mess...
1 parent ac16188 commit d12aaae

File tree

3 files changed: +198 additions, −122 deletions

src/int.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@ pub enum ParseIntError {
1818
use num_bigint::BigInt;
1919

2020
#[cfg(not(any(Py_LIMITED_API, PyPy)))]
21+
#[derive(Clone)]
2122
pub enum AppropriateInt {
2223
Normal(i64),
2324
Big(BigInt),
2425
}
2526

2627
#[cfg(all(any(Py_LIMITED_API, PyPy)))]
28+
#[derive(Clone)]
2729
pub enum AppropriateInt {
2830
Normal(i64),
2931
Big(String), // to be converted into int on the Python side

src/lib.rs

Lines changed: 151 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ use compact_str::CompactString;
1111
use pyo3::exceptions::{PyIOError, PyValueError};
1212
use pyo3::prelude::*;
1313
use std::borrow::BorrowMut;
14+
use std::io;
15+
use std::mem::swap;
1416
use std::num::ParseFloatError;
1517
use std::str::FromStr;
1618
use thiserror::Error;
@@ -25,19 +27,20 @@ mod read_string;
2527
mod remainder;
2628
mod suitable_seekable_buffered_bytes_stream;
2729
mod suitable_seekable_buffered_text_stream;
28-
mod suitable_unseekable_buffered_bytes_stream;
29-
mod suitable_unseekable_buffered_text_stream;
3030
mod suitable_stream;
3131
mod suitable_unbuffered_bytes_stream;
3232
mod suitable_unbuffered_text_stream;
33+
mod suitable_unseekable_buffered_bytes_stream;
34+
mod suitable_unseekable_buffered_text_stream;
35+
mod user_facing_json_string_reader;
3336
mod utf8_char_source;
3437

3538
mod char_or_eof;
3639
use crate::char_or_eof::CharOrEof;
3740
use CharOrEof::{Char, Eof};
3841

3942
mod unicode_utils;
40-
use crate::unicode_utils::{is_surrogate, decode_surrogate_pair, UnicodeError};
43+
use crate::unicode_utils::{decode_surrogate_pair, is_surrogate, UnicodeError};
4144

4245
use crate::suitable_stream::BufferingMode;
4346

@@ -123,7 +126,7 @@ impl IntoPy<PyObject> for TokenType {
123126
}
124127

125128
#[derive(Error, Debug)]
126-
pub enum ParsingError {
129+
pub enum ParsingError {
127130
#[error("{0}")]
128131
InvalidJson(String),
129132
#[error("Error due to limitation: {0}")]
@@ -144,6 +147,24 @@ impl From<UnicodeError> for ParsingError {
144147
}
145148
}
146149

150+
pub enum JsonStreamingError {
151+
ParsingError(ParsingError),
152+
IOError(io::Error),
153+
}
154+
155+
impl From<ParsingError> for JsonStreamingError {
156+
fn from(e: ParsingError) -> JsonStreamingError {
157+
JsonStreamingError::ParsingError(e)
158+
}
159+
}
160+
161+
impl From<io::Error> for JsonStreamingError {
162+
fn from(e: io::Error) -> JsonStreamingError {
163+
JsonStreamingError::IOError(e)
164+
}
165+
}
166+
167+
#[derive(Clone)]
147168
enum Token {
148169
Operator(String),
149170
String_(String),
@@ -179,81 +200,30 @@ impl RustTokenizer {
179200
prev_charcode: None,
180201
})
181202
}
203+
182204
fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
183205
slf
184206
}
207+
185208
fn __next__(
186209
mut slf: PyRefMut<'_, Self>,
187210
py: Python<'_>,
188211
) -> PyResult<Option<(TokenType, Option<PyObject>)>> {
189-
let mut now_token;
190-
loop {
191-
if slf.advance {
192-
match slf.stream.read_char() {
193-
Ok(r) => match r {
194-
Some(r) => slf.c = Some(r),
195-
None => slf.c = None,
196-
},
197-
Err(e) => {
198-
let index = slf.index;
199-
return Err(PyIOError::new_err(format!(
200-
"I/O error while parsing (index {index}): {e:?}"
201-
)));
202-
}
203-
}
204-
slf.index += 1;
205-
}
206-
match slf.c {
207-
Some(c) => {
208-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Char(c)) {
209-
Ok(tok) => {
210-
now_token = tok;
211-
slf.state = slf.next_state.clone();
212-
}
213-
Err(e) => {
214-
let index = slf.index;
215-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
216-
}
217-
}
218-
if slf.completed {
219-
slf.completed = false;
220-
slf.token = String::new();
221-
return Ok(now_token.clone());
222-
}
223-
}
224-
None => {
225-
slf.advance = false;
226-
break;
227-
}
228-
}
229-
}
230-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Eof) {
231-
Ok(tok) => {
232-
now_token = tok;
233-
}
234-
Err(e) => {
212+
RustTokenizer::read_next_token(&mut slf)
213+
.map(|maybe_tok| maybe_tok.map(|tok| RustTokenizer::token_to_py_tuple(tok, py)))
214+
.map_err(|e| -> PyErr {
235215
let index = slf.index;
236-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
237-
}
238-
}
239-
if slf.completed {
240-
match now_token {
241-
Some(now_token) => {
242-
// these are just to ensure in the next iteration we'll end
243-
// up in the slf.completed = false branch and quit:
244-
slf.completed = false;
245-
slf.state = State::Whitespace;
246-
// final token
247-
return Ok(Some(now_token));
248-
}
249-
None => {
250-
return Ok(None);
216+
match e {
217+
JsonStreamingError::ParsingError(e) => {
218+
PyValueError::new_err(format!("{e} at index {index}"))
219+
}
220+
JsonStreamingError::IOError(e) => PyIOError::new_err(format!(
221+
"I/O error while parsing (index {index}): {e:?}"
222+
)),
251223
}
252-
}
253-
} else {
254-
return Ok(None);
255-
}
224+
})
256225
}
226+
257227
/// Rewind the inner Python stream/file to undo readahead buffering.
258228
///
259229
/// Required because reading char-by-char without buffering is
@@ -292,20 +262,60 @@ impl RustTokenizer {
292262
}
293263

294264
impl RustTokenizer {
295-
fn process_char_py<'a>(
296-
slf: &mut Self,
297-
py: Python<'_>,
298-
c: CharOrEof,
299-
) -> Result<Option<(TokenType, Option<PyObject>)>, ParsingError> {
300-
match RustTokenizer::process_char(slf.borrow_mut(), c) {
301-
Ok(Some(Token::Operator(s))) => Ok(Some((TokenType::Operator, Some(s.into_py(py))))),
302-
Ok(Some(Token::String_(s))) => Ok(Some((TokenType::String_, Some(s.into_py(py))))),
303-
Ok(Some(Token::Integer(n))) => Ok(Some((TokenType::Number, Some(n.into_py(py))))),
304-
Ok(Some(Token::Float(f))) => Ok(Some((TokenType::Number, Some(f.into_py(py))))),
305-
Ok(Some(Token::Boolean(b))) => Ok(Some((TokenType::Boolean, Some(b.into_py(py))))),
306-
Ok(Some(Token::Null)) => Ok(Some((TokenType::Null, None))),
307-
Ok(None) => Ok(None),
308-
Err(e) => Err(e),
265+
fn read_next_token(slf: &mut Self) -> Result<Option<Token>, JsonStreamingError> {
266+
let mut now_token;
267+
loop {
268+
if slf.advance {
269+
match slf.stream.read_char()? {
270+
Some(r) => slf.c = Some(r),
271+
None => slf.c = None,
272+
}
273+
slf.index += 1;
274+
}
275+
match slf.c {
276+
Some(c) => {
277+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Char(c))?;
278+
slf.state = slf.next_state.clone();
279+
if slf.completed {
280+
slf.completed = false;
281+
slf.token = String::new();
282+
return Ok(now_token.clone());
283+
}
284+
}
285+
None => {
286+
slf.advance = false;
287+
break;
288+
}
289+
}
290+
}
291+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Eof)?;
292+
if slf.completed {
293+
match now_token {
294+
Some(now_token) => {
295+
// these are just to ensure in the next iteration we'll end
296+
// up in the slf.completed = false branch and quit:
297+
slf.completed = false;
298+
slf.state = State::Whitespace;
299+
// final token
300+
return Ok(Some(now_token));
301+
}
302+
None => {
303+
return Ok(None);
304+
}
305+
}
306+
} else {
307+
return Ok(None);
308+
}
309+
}
310+
311+
fn token_to_py_tuple<'a>(tok: Token, py: Python<'_>) -> (TokenType, Option<PyObject>) {
312+
match tok {
313+
Token::Operator(s) => (TokenType::Operator, Some(s.into_py(py))),
314+
Token::String_(s) => (TokenType::String_, Some(s.into_py(py))),
315+
Token::Integer(n) => (TokenType::Number, Some(n.into_py(py))),
316+
Token::Float(f) => (TokenType::Number, Some(f.into_py(py))),
317+
Token::Boolean(b) => (TokenType::Boolean, Some(b.into_py(py))),
318+
Token::Null => (TokenType::Null, None),
309319
}
310320
}
311321

@@ -372,7 +382,7 @@ impl RustTokenizer {
372382
"Invalid JSON character: {c:?}"
373383
)));
374384
}
375-
},
385+
}
376386
Eof => (),
377387
},
378388
State::Integer => match c {
@@ -721,41 +731,37 @@ impl RustTokenizer {
721731
}
722732
}
723733
}
724-
State::UnicodeSurrogateStart => {
725-
match c {
726-
Char('\\') => {
727-
slf.next_state = State::UnicodeSurrogateStringEscape;
728-
}
729-
Char(_) => {
730-
return Err(ParsingError::InvalidJson(format!(
731-
"Unpaired UTF-16 surrogate"
732-
)));
733-
}
734-
Eof => {
735-
return Err(ParsingError::InvalidJson(format!(
736-
"Unpaired UTF-16 surrogate at end of file"
737-
)));
738-
}
734+
State::UnicodeSurrogateStart => match c {
735+
Char('\\') => {
736+
slf.next_state = State::UnicodeSurrogateStringEscape;
739737
}
740-
}
741-
State::UnicodeSurrogateStringEscape => {
742-
match c {
743-
Char('u') => {
744-
slf.unicode_buffer = CompactString::with_capacity(4);
745-
slf.next_state = State::UnicodeSurrogate;
746-
}
747-
Char(_) => {
748-
return Err(ParsingError::InvalidJson(format!(
749-
"Unpaired UTF-16 surrogate"
750-
)));
751-
}
752-
Eof => {
753-
return Err(ParsingError::InvalidJson(format!(
754-
"Unpaired UTF-16 surrogate at end of file"
755-
)));
756-
}
738+
Char(_) => {
739+
return Err(ParsingError::InvalidJson(format!(
740+
"Unpaired UTF-16 surrogate"
741+
)));
757742
}
758-
}
743+
Eof => {
744+
return Err(ParsingError::InvalidJson(format!(
745+
"Unpaired UTF-16 surrogate at end of file"
746+
)));
747+
}
748+
},
749+
State::UnicodeSurrogateStringEscape => match c {
750+
Char('u') => {
751+
slf.unicode_buffer = CompactString::with_capacity(4);
752+
slf.next_state = State::UnicodeSurrogate;
753+
}
754+
Char(_) => {
755+
return Err(ParsingError::InvalidJson(format!(
756+
"Unpaired UTF-16 surrogate"
757+
)));
758+
}
759+
Eof => {
760+
return Err(ParsingError::InvalidJson(format!(
761+
"Unpaired UTF-16 surrogate at end of file"
762+
)));
763+
}
764+
},
759765
State::UnicodeSurrogate => {
760766
match c {
761767
Char(c) => {
@@ -786,13 +792,12 @@ impl RustTokenizer {
786792
"This should never happen, please report it as a bug..."
787793
)));
788794
};
789-
c = Char(
790-
decode_surrogate_pair(prev_charcode, charcode)
791-
.map_err(|_| ParsingError::InvalidJson(format!(
795+
c = Char(decode_surrogate_pair(prev_charcode, charcode).map_err(|_| {
796+
ParsingError::InvalidJson(format!(
792797
"Error decoding UTF-16 surrogate pair \
793798
\\u{prev_charcode:x}\\u{charcode:x}"
794-
)))?
795-
);
799+
))
800+
})?);
796801
slf.prev_charcode = None;
797802
slf.next_state = State::String_;
798803
add_char = true;
@@ -808,6 +813,30 @@ impl RustTokenizer {
808813

809814
Ok(now_token)
810815
}
816+
817+
fn parse_string_contents<'a>(
818+
&mut self,
819+
max_n_chars: Option<usize>,
820+
) -> Result<Option<String>, JsonStreamingError> {
821+
while max_n_chars.map_or(true, |n| self.token.len() < n) {
822+
let c = match self
823+
.stream
824+
.read_char()
825+
.map_err(|e| <io::Error as Into<JsonStreamingError>>::into(e))?
826+
{
827+
Some(c) => Char(c),
828+
None => Eof,
829+
};
830+
RustTokenizer::process_char(self, c)?;
831+
if let State::StringEnd = self.next_state {
832+
self.completed = false;
833+
self.advance = true;
834+
}
835+
}
836+
let mut s = String::new();
837+
swap(&mut s, &mut self.token);
838+
Ok(Some(s))
839+
}
811840
}
812841

813842
/// supports_bigint()

0 commit comments

Comments (0)