Skip to content

Commit 8ccbf22

Browse files
committed
WIP: String streaming (mostly w/in main class)
Real mess...
1 parent 4c8cb3a commit 8ccbf22

File tree

3 files changed

+196
-81
lines changed

3 files changed

+196
-81
lines changed

src/int.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@ pub enum ParseIntError {
1818
use num_bigint::BigInt;
1919

2020
#[cfg(not(any(Py_LIMITED_API, PyPy)))]
21+
#[derive(Clone)]
2122
pub enum AppropriateInt {
2223
Normal(i64),
2324
Big(BigInt),
2425
}
2526

2627
#[cfg(all(any(Py_LIMITED_API, PyPy)))]
28+
#[derive(Clone)]
2729
pub enum AppropriateInt {
2830
Normal(i64),
2931
Big(String), // to be converted into int on the Python side

src/lib.rs

Lines changed: 142 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,13 @@
77
use crate::int::{AppropriateInt, ParseIntError};
88
use crate::remainder::StreamData;
99
use crate::suitable_stream::{make_suitable_stream, SuitableStream};
10+
use crate::user_facing_json_string_reader::UserFacingJsonStringReader;
1011
use compact_str::CompactString;
1112
use pyo3::exceptions::{PyIOError, PyValueError};
1213
use pyo3::prelude::*;
1314
use std::borrow::BorrowMut;
15+
use std::io;
16+
use std::mem::swap;
1417
use std::num::ParseFloatError;
1518
use std::str::FromStr;
1619
use thiserror::Error;
@@ -30,6 +33,7 @@ mod suitable_unbuffered_bytes_stream;
3033
mod suitable_unbuffered_text_stream;
3134
mod suitable_unseekable_buffered_bytes_stream;
3235
mod suitable_unseekable_buffered_text_stream;
36+
mod user_facing_json_string_reader;
3337
mod utf8_char_source;
3438

3539
mod char_or_eof;
@@ -87,6 +91,7 @@ enum State {
8791
/// UTF-8).
8892
/// buffering: Internal buffer size. -1 (the default) means to let the
8993
/// implementation choose a buffer size. Can conflict with `correct_cursor`.
94+
/// strings_as_files: Whether to return strings as file-like objects instead of plain strings.
9095
/// correct_cursor: *(not part of API yet, may be removed at any point)*
9196
/// Whether it is required that the cursor is left in the correct position
9297
/// (behind the last processed character) after park_cursor() has been
@@ -95,9 +100,10 @@ enum State {
95100
/// unrelated to the actual tokenization progress. For seekable streams, the
96101
/// improvement shouldn't be noticeable.
97102
#[pyclass]
98-
#[pyo3(text_signature = "(stream, *, buffering=-1, correct_cursor=True)")]
99-
struct RustTokenizer {
103+
#[pyo3(text_signature = "(stream, *, buffering=-1, strings_as_files=False, correct_cursor=True)")]
104+
pub struct RustTokenizer {
100105
stream: Box<dyn SuitableStream + Send>,
106+
strings_as_files: bool,
101107
completed: bool,
102108
advance: bool,
103109
token: String,
@@ -144,9 +150,28 @@ impl From<UnicodeError> for ParsingError {
144150
}
145151
}
146152

153+
pub enum JsonStreamingError {
154+
ParsingError(ParsingError),
155+
IOError(io::Error),
156+
}
157+
158+
impl From<ParsingError> for JsonStreamingError {
159+
fn from(e: ParsingError) -> JsonStreamingError {
160+
JsonStreamingError::ParsingError(e)
161+
}
162+
}
163+
164+
impl From<io::Error> for JsonStreamingError {
165+
fn from(e: io::Error) -> JsonStreamingError {
166+
JsonStreamingError::IOError(e)
167+
}
168+
}
169+
170+
#[derive(Clone)]
147171
enum Token {
148172
Operator(String),
149173
String_(String),
174+
StringAsFile, // handled specially
150175
Integer(AppropriateInt),
151176
Float(f64),
152177
Boolean(bool),
@@ -156,8 +181,13 @@ enum Token {
156181
#[pymethods]
157182
impl RustTokenizer {
158183
#[new]
159-
#[args("*", buffering = -1, correct_cursor = "true")]
160-
fn new(stream: PyObject, buffering: i64, correct_cursor: bool) -> PyResult<Self> {
184+
#[args("*", buffering = -1, strings_as_files = "false", correct_cursor = "true")]
185+
fn new(
186+
stream: PyObject,
187+
buffering: i64,
188+
strings_as_files: bool,
189+
correct_cursor: bool,
190+
) -> PyResult<Self> {
161191
let buffering_mode = if buffering < 0 {
162192
BufferingMode::DontCare
163193
} else if buffering == 0 || buffering == 1 {
@@ -168,6 +198,7 @@ impl RustTokenizer {
168198
let stream = make_suitable_stream(stream, buffering_mode, correct_cursor)?;
169199
Ok(RustTokenizer {
170200
stream,
201+
strings_as_files,
171202
completed: false,
172203
advance: true,
173204
token: String::new(),
@@ -179,81 +210,34 @@ impl RustTokenizer {
179210
prev_charcode: None,
180211
})
181212
}
213+
182214
fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
183215
slf
184216
}
217+
185218
fn __next__(
186219
mut slf: PyRefMut<'_, Self>,
187220
py: Python<'_>,
188221
) -> PyResult<Option<(TokenType, Option<PyObject>)>> {
189-
let mut now_token;
190-
loop {
191-
if slf.advance {
192-
match slf.stream.read_char() {
193-
Ok(r) => match r {
194-
Some(r) => slf.c = Some(r),
195-
None => slf.c = None,
196-
},
197-
Err(e) => {
198-
let index = slf.index;
199-
return Err(PyIOError::new_err(format!(
200-
"I/O error while parsing (index {index}): {e:?}"
201-
)));
202-
}
203-
}
204-
slf.index += 1;
205-
}
206-
match slf.c {
207-
Some(c) => {
208-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Char(c)) {
209-
Ok(tok) => {
210-
now_token = tok;
211-
slf.state = slf.next_state.clone();
212-
}
213-
Err(e) => {
214-
let index = slf.index;
215-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
216-
}
217-
}
218-
if slf.completed {
219-
slf.completed = false;
220-
slf.token = String::new();
221-
return Ok(now_token.clone());
222-
}
223-
}
224-
None => {
225-
slf.advance = false;
226-
break;
227-
}
228-
}
229-
}
230-
match RustTokenizer::process_char_py(slf.borrow_mut(), py, Eof) {
231-
Ok(tok) => {
232-
now_token = tok;
233-
}
234-
Err(e) => {
222+
match RustTokenizer::read_next_token(&mut slf) {
223+
Ok(maybe_tok) => Ok(match maybe_tok {
224+
Some(tok) => Some(RustTokenizer::token_to_py_tuple(slf, tok, py)),
225+
None => None,
226+
}),
227+
Err(e) => Err({
235228
let index = slf.index;
236-
return Err(PyValueError::new_err(format!("{e} at index {index}")));
237-
}
238-
}
239-
if slf.completed {
240-
match now_token {
241-
Some(now_token) => {
242-
// these are just to ensure in the next iteration we'll end
243-
// up in the slf.completed = false branch and quit:
244-
slf.completed = false;
245-
slf.state = State::Whitespace;
246-
// final token
247-
return Ok(Some(now_token));
248-
}
249-
None => {
250-
return Ok(None);
229+
match e {
230+
JsonStreamingError::ParsingError(e) => {
231+
PyValueError::new_err(format!("{e} at index {index}"))
232+
}
233+
JsonStreamingError::IOError(e) => PyIOError::new_err(format!(
234+
"I/O error while parsing (index {index}): {e:?}"
235+
)),
251236
}
252-
}
253-
} else {
254-
return Ok(None);
237+
}),
255238
}
256239
}
240+
257241
/// Rewind the inner Python stream/file to undo readahead buffering.
258242
///
259243
/// Required because reading char-by-char without buffering is
@@ -292,20 +276,68 @@ impl RustTokenizer {
292276
}
293277

294278
impl RustTokenizer {
295-
fn process_char_py<'a>(
296-
slf: &mut Self,
279+
fn read_next_token(slf: &mut Self) -> Result<Option<Token>, JsonStreamingError> {
280+
let mut now_token;
281+
loop {
282+
if slf.advance {
283+
match slf.stream.read_char()? {
284+
Some(r) => slf.c = Some(r),
285+
None => slf.c = None,
286+
}
287+
slf.index += 1;
288+
}
289+
match slf.c {
290+
Some(c) => {
291+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Char(c))?;
292+
slf.state = slf.next_state.clone();
293+
if slf.completed {
294+
slf.completed = false;
295+
slf.token = String::new();
296+
return Ok(now_token.clone());
297+
}
298+
}
299+
None => {
300+
slf.advance = false;
301+
break;
302+
}
303+
}
304+
}
305+
now_token = RustTokenizer::process_char(slf.borrow_mut(), Eof)?;
306+
if slf.completed {
307+
match now_token {
308+
Some(now_token) => {
309+
// these are just to ensure in the next iteration we'll end
310+
// up in the slf.completed = false branch and quit:
311+
slf.completed = false;
312+
slf.state = State::Whitespace;
313+
// final token
314+
return Ok(Some(now_token));
315+
}
316+
None => {
317+
return Ok(None);
318+
}
319+
}
320+
} else {
321+
return Ok(None);
322+
}
323+
}
324+
325+
fn token_to_py_tuple<'a>(
326+
slf: PyRefMut<'_, Self>,
327+
tok: Token,
297328
py: Python<'_>,
298-
c: CharOrEof,
299-
) -> Result<Option<(TokenType, Option<PyObject>)>, ParsingError> {
300-
match RustTokenizer::process_char(slf.borrow_mut(), c) {
301-
Ok(Some(Token::Operator(s))) => Ok(Some((TokenType::Operator, Some(s.into_py(py))))),
302-
Ok(Some(Token::String_(s))) => Ok(Some((TokenType::String_, Some(s.into_py(py))))),
303-
Ok(Some(Token::Integer(n))) => Ok(Some((TokenType::Number, Some(n.into_py(py))))),
304-
Ok(Some(Token::Float(f))) => Ok(Some((TokenType::Number, Some(f.into_py(py))))),
305-
Ok(Some(Token::Boolean(b))) => Ok(Some((TokenType::Boolean, Some(b.into_py(py))))),
306-
Ok(Some(Token::Null)) => Ok(Some((TokenType::Null, None))),
307-
Ok(None) => Ok(None),
308-
Err(e) => Err(e),
329+
) -> (TokenType, Option<PyObject>) {
330+
match tok {
331+
Token::Operator(s) => (TokenType::Operator, Some(s.into_py(py))),
332+
Token::String_(s) => (TokenType::String_, Some(s.into_py(py))),
333+
Token::StringAsFile => (
334+
TokenType::String_,
335+
Some(UserFacingJsonStringReader::new(slf.into()).into_py(py)),
336+
),
337+
Token::Integer(n) => (TokenType::Number, Some(n.into_py(py))),
338+
Token::Float(f) => (TokenType::Number, Some(f.into_py(py))),
339+
Token::Boolean(b) => (TokenType::Boolean, Some(b.into_py(py))),
340+
Token::Null => (TokenType::Null, None),
309341
}
310342
}
311343

@@ -344,6 +376,10 @@ impl RustTokenizer {
344376
}
345377
Char('"') => {
346378
slf.next_state = State::String_;
379+
if slf.strings_as_files {
380+
slf.completed = true;
381+
now_token = Some(Token::StringAsFile);
382+
}
347383
}
348384
Char('1'..='9') => {
349385
slf.next_state = State::Integer;
@@ -803,6 +839,31 @@ impl RustTokenizer {
803839

804840
Ok(now_token)
805841
}
842+
843+
fn parse_string_contents<'a>(
844+
&mut self,
845+
max_n_chars: Option<usize>,
846+
) -> Result<Option<String>, JsonStreamingError> {
847+
while max_n_chars.map_or(true, |n| self.token.len() < n) {
848+
let c = match self
849+
.stream
850+
.read_char()
851+
.map_err(|e| <io::Error as Into<JsonStreamingError>>::into(e))?
852+
{
853+
Some(c) => Char(c),
854+
None => Eof,
855+
};
856+
self.index += 1; // TODO: DRY — pull this into a new read_char() method on this class?
857+
RustTokenizer::process_char(self, c)?;
858+
if let State::StringEnd = self.next_state {
859+
self.completed = false;
860+
self.advance = true;
861+
}
862+
}
863+
let mut s = String::new();
864+
swap(&mut s, &mut self.token);
865+
Ok(Some(s))
866+
}
806867
}
807868

808869
/// supports_bigint()
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
use crate::JsonStreamingError;
2+
use pyo3::exceptions::{PyIOError, PyValueError};
3+
use pyo3::prelude::*;
4+
5+
use crate::RustTokenizer;
6+
7+
#[pyclass]
8+
#[derive(Clone)]
9+
pub struct UserFacingJsonStringReader {
10+
tokenizer: Py<RustTokenizer>,
11+
}
12+
13+
#[pymethods]
14+
impl UserFacingJsonStringReader {
15+
pub fn read(slf: PyRefMut<'_, Self>, size: Option<isize>, py: Python<'_>) -> PyResult<String> {
16+
// normalize size arg
17+
let max_n_chars: Option<usize> = match size {
18+
None => None,
19+
Some(size) if size < 0 => None,
20+
Some(size) if size == 0 => return Ok("".to_owned()),
21+
Some(size) => Some(size as usize),
22+
};
23+
// /normalize
24+
Ok(
25+
match RustTokenizer::parse_string_contents(
26+
&mut slf.tokenizer.borrow_mut(py),
27+
max_n_chars,
28+
)
29+
// TODO refactor (duplicate code in lib.rs)
30+
.map_err(|e| -> PyErr {
31+
let index = slf.tokenizer.borrow(py).index;
32+
match e {
33+
JsonStreamingError::ParsingError(e) => {
34+
PyValueError::new_err(format!("{e} at index {index}"))
35+
}
36+
JsonStreamingError::IOError(e) => PyIOError::new_err(format!(
37+
"I/O error while parsing (index {index}): {e:?}"
38+
)),
39+
}
40+
})? {
41+
Some(s) => s,
42+
None => "".to_owned(),
43+
},
44+
)
45+
}
46+
}
47+
48+
impl UserFacingJsonStringReader {
49+
pub fn new(tokenizer: Py<RustTokenizer>) -> Self {
50+
UserFacingJsonStringReader { tokenizer }
51+
}
52+
}

0 commit comments

Comments
 (0)