Skip to content

Commit d12aaae

Browse files
committed
WIP: String streaming (mostly w/in main class)
Real mess...
1 parent ac16188 commit d12aaae

File tree

3 files changed: +198 additions, −122 deletions

src/int.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@ pub enum ParseIntError {
1818
use num_bigint::BigInt;
1919

2020
#[cfg(not(any(Py_LIMITED_API, PyPy)))]
21+
#[derive(Clone)]
2122
pub enum AppropriateInt {
2223
Normal(i64),
2324
Big(BigInt),
2425
}
2526

2627
#[cfg(all(any(Py_LIMITED_API, PyPy)))]
28+
#[derive(Clone)]
2729
pub enum AppropriateInt {
2830
Normal(i64),
2931
Big(String), // to be converted into int on the Python side

src/lib.rs

Lines changed: 151 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ use compact_str::CompactString;
1111
use pyo3::exceptions::{PyIOError, PyValueError};
1212
use pyo3::prelude::*;
1313
use std::borrow::BorrowMut;
14+
use std::io;
15+
use std::mem::swap;
1416
use std::num::ParseFloatError;
1517
use std::str::FromStr;
1618
use thiserror::Error;
@@ -25,19 +27,20 @@ mod read_string;
2527
mod remainder;
2628
mod suitable_seekable_buffered_bytes_stream;
2729
mod suitable_seekable_buffered_text_stream;
28-
mod suitable_unseekable_buffered_bytes_stream;
29-
mod suitable_unseekable_buffered_text_stream;
3030
mod suitable_stream;
3131
mod suitable_unbuffered_bytes_stream;
3232
mod suitable_unbuffered_text_stream;
33+
mod suitable_unseekable_buffered_bytes_stream;
34+
mod suitable_unseekable_buffered_text_stream;
35+
mod user_facing_json_string_reader;
3336
mod utf8_char_source;
3437

3538
mod char_or_eof;
3639
use crate::char_or_eof::CharOrEof;
3740
use CharOrEof::{Char, Eof};
3841

3942
mod unicode_utils;
40-
use crate::unicode_utils::{is_surrogate, decode_surrogate_pair, UnicodeError};
43+
use crate::unicode_utils::{decode_surrogate_pair, is_surrogate, UnicodeError};
4144

4245
use crate::suitable_stream::BufferingMode;
4346

@@ -123,7 +126,7 @@ impl IntoPy<PyObject> for TokenType {
123126
}
124127

125128
#[derive(Error, Debug)]
126-
pub enum ParsingError {
129+
pub enum ParsingError {
127130
#[error("{0}")]
128131
InvalidJson(String),
129132
#[error("Error due to limitation: {0}")]
@@ -144,6 +147,24 @@ impl From<UnicodeError> for ParsingError {
144147
}
145148
}
146149

150+
pub enum JsonStreamingError {
151+
ParsingError(ParsingError),
152+
IOError(io::Error),
153+
}
154+
155+
impl From<ParsingError> for JsonStreamingError {
156+
fn from(e: ParsingError) -> JsonStreamingError {
157+
JsonStreamingError::ParsingError(e)
158+
}
159+
}
160+
161+
impl From<io::Error> for JsonStreamingError {
162+
fn from(e: io::Error) -> JsonStreamingError {
163+
JsonStreamingError::IOError(e)
164+
}
165+
}
166+
167+
#[derive(Clone)]
147168
enum Token {
148169
Operator(String),
149170
String_(String),
@@ -179,81 +200,30 @@ impl RustTokenizer {
179200
prev_charcode: None,
180201
})
181202
}
203+
182204
fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
183205
slf
184206
}
207+
185208
fn __next__(
186209
mut slf: PyRefMut<'_, Self>,
187210
py: Python<'_>,
188211
) -> PyResult<Option<(TokenType, Option<PyObject>)>> {
189-
let mut now_token;
190-
loop {
191-
if slf.advance {
192-
match slf.stream.read_char() {
193-
Ok(r) => match r {
194-
Some(r) => slf.c = Some(r),
195-
None => slf.c = None,
196-
},
197-
Err(e) => {
198-
let index = slf.index;
199-
return Err(PyIOError::new_err(format!(
200-
"I/O error while parsing (index {index}): {e:?}"
201-
)));
202-
}
203-
}
204-
slf.index += 1;
205-
}
206-
match slf.c {
207-
Some(c) => {
208-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Char(c)) {
209-
Ok(tok) => {
210-
now_token = tok;
211-
slf.state = slf.next_state.clone();
212-
}
213-
Err(e) => {
214-
let index = slf.index;
215-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
216-
}
217-
}
218-
if slf.completed {
219-
slf.completed = false;
220-
slf.token = String::new();
221-
return Ok(now_token.clone());
222-
}
223-
}
224-
None => {
225-
slf.advance = false;
226-
break;
227-
}
228-
}
229-
}
230-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Eof) {
231-
Ok(tok) => {
232-
now_token = tok;
233-
}
234-
Err(e) => {
212+
RustTokenizer::read_next_token(&mut slf)
213+
.map(|maybe_tok| maybe_tok.map(|tok| RustTokenizer::token_to_py_tuple(tok, py)))
214+
.map_err(|e| -> PyErr {
235215
let index = slf.index;
236-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
237-
}
238-
}
239-
if slf.completed {
240-
match now_token {
241-
Some(now_token) => {
242-
// these are just to ensure in the next iteration we'll end
243-
// up in the slf.completed = false branch and quit:
244-
slf.completed = false;
245-
slf.state = State::Whitespace;
246-
// final token
247-
return Ok(Some(now_token));
248-
}
249-
None => {
250-
return Ok(None);
216+
match e {
217+
JsonStreamingError::ParsingError(e) => {
218+
PyValueError::new_err(format!("{e} at index {index}"))
219+
}
220+
JsonStreamingError::IOError(e) => PyIOError::new_err(format!(
221+
"I/O error while parsing (index {index}): {e:?}"
222+
)),
251223
}
252-
}
253-
} else {
254-
return Ok(None);
255-
}
224+
})
256225
}
226+
257227
/// Rewind the inner Python stream/file to undo readahead buffering.
258228
///
259229
/// Required because reading char-by-char without buffering is
@@ -292,20 +262,60 @@ impl RustTokenizer {
292262
}
293263

294264
impl RustTokenizer {
295-
fn process_char_py<'a>(
296-
slf: &mut Self,
297-
py: Python<'_>,
298-
c: CharOrEof,
299-
) -> Result<Option<(TokenType, Option<PyObject>)>, ParsingError> {
300-
match RustTokenizer::process_char(slf.borrow_mut(), c) {
301-
Ok(Some(Token::Operator(s))) => Ok(Some((TokenType::Operator, Some(s.into_py(py))))),
302-
Ok(Some(Token::String_(s))) => Ok(Some((TokenType::String_, Some(s.into_py(py))))),
303-
Ok(Some(Token::Integer(n))) => Ok(Some((TokenType::Number, Some(n.into_py(py))))),
304-
Ok(Some(Token::Float(f))) => Ok(Some((TokenType::Number, Some(f.into_py(py))))),
305-
Ok(Some(Token::Boolean(b))) => Ok(Some((TokenType::Boolean, Some(b.into_py(py))))),
306-
Ok(Some(Token::Null)) => Ok(Some((TokenType::Null, None))),
307-
Ok(None) => Ok(None),
308-
Err(e) => Err(e),
265+
fn read_next_token(slf: &mut Self) -> Result<Option<Token>, JsonStreamingError> {
266+
let mut now_token;
267+
loop {
268+
if slf.advance {
269+
match slf.stream.read_char()? {
270+
Some(r) => slf.c = Some(r),
271+
None => slf.c = None,
272+
}
273+
slf.index += 1;
274+
}
275+
match slf.c {
276+
Some(c) => {
277+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Char(c))?;
278+
slf.state = slf.next_state.clone();
279+
if slf.completed {
280+
slf.completed = false;
281+
slf.token = String::new();
282+
return Ok(now_token.clone());
283+
}
284+
}
285+
None => {
286+
slf.advance = false;
287+
break;
288+
}
289+
}
290+
}
291+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Eof)?;
292+
if slf.completed {
293+
match now_token {
294+
Some(now_token) => {
295+
// these are just to ensure in the next iteration we'll end
296+
// up in the slf.completed = false branch and quit:
297+
slf.completed = false;
298+
slf.state = State::Whitespace;
299+
// final token
300+
return Ok(Some(now_token));
301+
}
302+
None => {
303+
return Ok(None);
304+
}
305+
}
306+
} else {
307+
return Ok(None);
308+
}
309+
}
310+
311+
fn token_to_py_tuple<'a>(tok: Token, py: Python<'_>) -> (TokenType, Option<PyObject>) {
312+
match tok {
313+
Token::Operator(s) => (TokenType::Operator, Some(s.into_py(py))),
314+
Token::String_(s) => (TokenType::String_, Some(s.into_py(py))),
315+
Token::Integer(n) => (TokenType::Number, Some(n.into_py(py))),
316+
Token::Float(f) => (TokenType::Number, Some(f.into_py(py))),
317+
Token::Boolean(b) => (TokenType::Boolean, Some(b.into_py(py))),
318+
Token::Null => (TokenType::Null, None),
309319
}
310320
}
311321

@@ -372,7 +382,7 @@ impl RustTokenizer {
372382
"Invalid JSON character: {c:?}"
373383
)));
374384
}
375-
},
385+
}
376386
Eof => (),
377387
},
378388
State::Integer => match c {
@@ -721,41 +731,37 @@ impl RustTokenizer {
721731
}
722732
}
723733
}
724-
State::UnicodeSurrogateStart => {
725-
match c {
726-
Char('\\') => {
727-
slf.next_state = State::UnicodeSurrogateStringEscape;
728-
}
729-
Char(_) => {
730-
return Err(ParsingError::InvalidJson(format!(
731-
"Unpaired UTF-16 surrogate"
732-
)));
733-
}
734-
Eof => {
735-
return Err(ParsingError::InvalidJson(format!(
736-
"Unpaired UTF-16 surrogate at end of file"
737-
)));
738-
}
734+
State::UnicodeSurrogateStart => match c {
735+
Char('\\') => {
736+
slf.next_state = State::UnicodeSurrogateStringEscape;
739737
}
740-
}
741-
State::UnicodeSurrogateStringEscape => {
742-
match c {
743-
Char('u') => {
744-
slf.unicode_buffer = CompactString::with_capacity(4);
745-
slf.next_state = State::UnicodeSurrogate;
746-
}
747-
Char(_) => {
748-
return Err(ParsingError::InvalidJson(format!(
749-
"Unpaired UTF-16 surrogate"
750-
)));
751-
}
752-
Eof => {
753-
return Err(ParsingError::InvalidJson(format!(
754-
"Unpaired UTF-16 surrogate at end of file"
755-
)));
756-
}
738+
Char(_) => {
739+
return Err(ParsingError::InvalidJson(format!(
740+
"Unpaired UTF-16 surrogate"
741+
)));
757742
}
758-
}
743+
Eof => {
744+
return Err(ParsingError::InvalidJson(format!(
745+
"Unpaired UTF-16 surrogate at end of file"
746+
)));
747+
}
748+
},
749+
State::UnicodeSurrogateStringEscape => match c {
750+
Char('u') => {
751+
slf.unicode_buffer = CompactString::with_capacity(4);
752+
slf.next_state = State::UnicodeSurrogate;
753+
}
754+
Char(_) => {
755+
return Err(ParsingError::InvalidJson(format!(
756+
"Unpaired UTF-16 surrogate"
757+
)));
758+
}
759+
Eof => {
760+
return Err(ParsingError::InvalidJson(format!(
761+
"Unpaired UTF-16 surrogate at end of file"
762+
)));
763+
}
764+
},
759765
State::UnicodeSurrogate => {
760766
match c {
761767
Char(c) => {
@@ -786,13 +792,12 @@ impl RustTokenizer {
786792
"This should never happen, please report it as a bug..."
787793
)));
788794
};
789-
c = Char(
790-
decode_surrogate_pair(prev_charcode, charcode)
791-
.map_err(|_| ParsingError::InvalidJson(format!(
795+
c = Char(decode_surrogate_pair(prev_charcode, charcode).map_err(|_| {
796+
ParsingError::InvalidJson(format!(
792797
"Error decoding UTF-16 surrogate pair \
793798
\\u{prev_charcode:x}\\u{charcode:x}"
794-
)))?
795-
);
799+
))
800+
})?);
796801
slf.prev_charcode = None;
797802
slf.next_state = State::String_;
798803
add_char = true;
@@ -808,6 +813,30 @@ impl RustTokenizer {
808813

809814
Ok(now_token)
810815
}
816+
817+
fn parse_string_contents<'a>(
818+
&mut self,
819+
max_n_chars: Option<usize>,
820+
) -> Result<Option<String>, JsonStreamingError> {
821+
while max_n_chars.map_or(true, |n| self.token.len() < n) {
822+
let c = match self
823+
.stream
824+
.read_char()
825+
.map_err(|e| <io::Error as Into<JsonStreamingError>>::into(e))?
826+
{
827+
Some(c) => Char(c),
828+
None => Eof,
829+
};
830+
RustTokenizer::process_char(self, c)?;
831+
if let State::StringEnd = self.next_state {
832+
self.completed = false;
833+
self.advance = true;
834+
}
835+
}
836+
let mut s = String::new();
837+
swap(&mut s, &mut self.token);
838+
Ok(Some(s))
839+
}
811840
}
812841

813842
/// supports_bigint()

0 commit comments

Comments (0)