Commit ac16188

Prep. for string streaming: no-py process_char
1 parent d7296b4 commit ac16188

File tree (2 files changed: +41, -28 lines)

  src/lib.rs
  src/unicode_utils.rs

src/lib.rs

Lines changed: 39 additions & 23 deletions
@@ -144,6 +144,15 @@ impl From<UnicodeError> for ParsingError {
     }
 }
 
+enum Token {
+    Operator(String),
+    String_(String),
+    Integer(AppropriateInt),
+    Float(f64),
+    Boolean(bool),
+    Null,
+}
+
 #[pymethods]
 impl RustTokenizer {
     #[new]
@@ -196,7 +205,7 @@ impl RustTokenizer {
         }
         match slf.c {
             Some(c) => {
-                match RustTokenizer::process_char(slf.borrow_mut(), py, Char(c)) {
+                match RustTokenizer::process_char_py(slf.borrow_mut(), py, Char(c)) {
                     Ok(tok) => {
                         now_token = tok;
                         slf.state = slf.next_state.clone();
@@ -218,7 +227,7 @@ impl RustTokenizer {
                 }
             }
         }
-        match RustTokenizer::process_char(slf.borrow_mut(), py, Eof) {
+        match RustTokenizer::process_char_py(slf.borrow_mut(), py, Eof) {
             Ok(tok) => {
                 now_token = tok;
             }
@@ -283,11 +292,24 @@ impl RustTokenizer {
 }
 
 impl RustTokenizer {
-    fn process_char<'a>(
+    fn process_char_py<'a>(
         slf: &mut Self,
         py: Python<'_>,
         c: CharOrEof,
     ) -> Result<Option<(TokenType, Option<PyObject>)>, ParsingError> {
+        match RustTokenizer::process_char(slf.borrow_mut(), c) {
+            Ok(Some(Token::Operator(s))) => Ok(Some((TokenType::Operator, Some(s.into_py(py))))),
+            Ok(Some(Token::String_(s))) => Ok(Some((TokenType::String_, Some(s.into_py(py))))),
+            Ok(Some(Token::Integer(n))) => Ok(Some((TokenType::Number, Some(n.into_py(py))))),
+            Ok(Some(Token::Float(f))) => Ok(Some((TokenType::Number, Some(f.into_py(py))))),
+            Ok(Some(Token::Boolean(b))) => Ok(Some((TokenType::Boolean, Some(b.into_py(py))))),
+            Ok(Some(Token::Null)) => Ok(Some((TokenType::Null, None))),
+            Ok(None) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
+    fn process_char<'a>(slf: &mut Self, c: CharOrEof) -> Result<Option<Token>, ParsingError> {
         slf.advance = true;
         slf.next_state = slf.state.clone();
         let mut now_token = None;
@@ -298,27 +320,27 @@ impl RustTokenizer {
             State::Whitespace => match c {
                 Char('{') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some("{".into_py(py))));
+                    now_token = Some(Token::Operator("{".to_owned()));
                 }
                 Char('}') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some("}".into_py(py))));
+                    now_token = Some(Token::Operator("}".to_owned()));
                 }
                 Char('[') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some("[".into_py(py))));
+                    now_token = Some(Token::Operator("[".to_owned()));
                 }
                 Char(']') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some("]".into_py(py))));
+                    now_token = Some(Token::Operator("]".to_owned()));
                 }
                 Char(',') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some(",".into_py(py))));
+                    now_token = Some(Token::Operator(",".to_owned()));
                 }
                 Char(':') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::Operator, Some(":".into_py(py))));
+                    now_token = Some(Token::Operator(":".to_owned()));
                 }
                 Char('"') => {
                     slf.next_state = State::String_;
@@ -370,7 +392,7 @@ impl RustTokenizer {
                     slf.completed = true;
                     match AppropriateInt::from_str(&slf.token) {
                         Ok(parsed_num) => {
-                            now_token = Some((TokenType::Number, Some(parsed_num.into_py(py))));
+                            now_token = Some(Token::Integer(parsed_num));
                         }
                         Err(ParseIntError::General(e)) => {
                             return Err(ParsingError::InvalidJson(format!(
@@ -403,7 +425,7 @@ impl RustTokenizer {
                 _ if is_delimiter(c) => {
                     slf.next_state = State::Whitespace;
                     slf.completed = true;
-                    now_token = Some((TokenType::Number, Some(0.into_py(py))));
+                    now_token = Some(Token::Integer(AppropriateInt::Normal(0)));
                     slf.advance = false;
                 }
                 _ => {
@@ -444,10 +466,7 @@ impl RustTokenizer {
                 }
                 _ if is_delimiter(c) => {
                     slf.completed = true;
-                    now_token = Some((
-                        TokenType::Number,
-                        Some(slf.token.parse::<f64>()?.into_py(py)),
-                    ));
+                    now_token = Some(Token::Float(slf.token.parse::<f64>()?));
                     slf.next_state = State::Whitespace;
                     slf.advance = false;
                 }
@@ -467,10 +486,7 @@ impl RustTokenizer {
                 }
                 _ if is_delimiter(c) => {
                     slf.completed = true;
-                    now_token = Some((
-                        TokenType::Number,
-                        Some(slf.token.parse::<f64>()?.into_py(py)),
-                    ));
+                    now_token = Some(Token::Float(slf.token.parse::<f64>()?));
                     slf.next_state = State::Whitespace;
                     slf.advance = false;
                 }
@@ -525,7 +541,7 @@ impl RustTokenizer {
                 Char('e') => {
                     slf.next_state = State::Whitespace;
                     slf.completed = true;
-                    now_token = Some((TokenType::Boolean, Some(false.into_py(py))));
+                    now_token = Some(Token::Boolean(false));
                 }
                 _ => {
                     return Err(ParsingError::InvalidJson(format!(
@@ -557,7 +573,7 @@ impl RustTokenizer {
                 Char('e') => {
                     slf.next_state = State::Whitespace;
                     slf.completed = true;
-                    now_token = Some((TokenType::Boolean, Some(true.into_py(py))));
+                    now_token = Some(Token::Boolean(true));
                 }
                 _ => {
                     return Err(ParsingError::InvalidJson(format!(
@@ -589,7 +605,7 @@ impl RustTokenizer {
                 Char('l') => {
                     slf.next_state = State::Whitespace;
                     slf.completed = true;
-                    now_token = Some((TokenType::Null, None));
+                    now_token = Some(Token::Null);
                 }
                 _ => {
                     return Err(ParsingError::InvalidJson(format!(
@@ -600,7 +616,7 @@ impl RustTokenizer {
             State::String_ => match c {
                 Char('\"') => {
                     slf.completed = true;
-                    now_token = Some((TokenType::String_, Some(slf.token.clone().into_py(py))));
+                    now_token = Some(Token::String_(slf.token.clone()));
                     slf.next_state = State::StringEnd;
                 }
                 Char('\\') => {
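
In short, tokenization in lib.rs is now split into a Python-free core (process_char, returning the new Token enum) and a thin process_char_py wrapper that converts each token into Python objects via into_py. As a rough sketch of what this prepares for, a hypothetical pure-Rust driver (not part of this commit) could stream a whole string through process_char without touching Python at all; it assumes it lives in the same module and reuses the advance/state/next_state fields and the Char/Eof variants of CharOrEof exactly as they appear in the diff above.

    // Hypothetical helper, not in this commit: stream a &str through the
    // Python-free process_char and collect the resulting Tokens.
    fn tokenize_str(tok: &mut RustTokenizer, input: &str) -> Result<Vec<Token>, ParsingError> {
        let mut out = Vec::new();
        let mut chars = input.chars().peekable();
        while let Some(&c) = chars.peek() {
            if let Some(t) = RustTokenizer::process_char(tok, Char(c))? {
                out.push(t);
            }
            tok.state = tok.next_state.clone();
            // Mirror the driver in the #[pymethods] block: only consume the
            // character when the state machine asked to advance.
            if tok.advance {
                chars.next();
            }
        }
        // Flush whatever is still buffered once the input is exhausted.
        if let Some(t) = RustTokenizer::process_char(tok, Eof)? {
            out.push(t);
        }
        Ok(out)
    }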

src/unicode_utils.rs

Lines changed: 2 additions & 5 deletions
@@ -1,5 +1,5 @@
-use thiserror::Error;
 use std::char::DecodeUtf16Error;
+use thiserror::Error;
 
 #[derive(Error, Debug)]
 pub enum UnicodeError {
@@ -24,10 +24,7 @@ pub fn is_surrogate(codepoint: u16) -> bool {
     return codepoint >= 0xD800 && codepoint <= 0xDFFF;
 }
 
-pub fn decode_surrogate_pair(
-    first_half: u16,
-    second_half: u16,
-) -> Result<char, UnicodeError> {
+pub fn decode_surrogate_pair(first_half: u16, second_half: u16) -> Result<char, UnicodeError> {
     return match char::decode_utf16(vec![first_half, second_half]).next() {
         Some(result) => result.map_err(UnicodeError::from),
         None => Err(UnicodeError::Weirdness(format!(
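
The unicode_utils.rs change only reorders the two imports and collapses the decode_surrogate_pair signature onto one line; behaviour is unchanged. For reference, a small hypothetical usage sketch (the surrogate values are an illustrative example, not taken from the commit):

    // U+1F600 is encoded in UTF-16 as the surrogate pair 0xD83D, 0xDE00.
    fn surrogate_demo() -> Result<(), UnicodeError> {
        assert_eq!(decode_surrogate_pair(0xD83D, 0xDE00)?, '\u{1F600}');
        // A reversed pair is not valid UTF-16; the DecodeUtf16Error is
        // converted through UnicodeError::from, as in the function body above.
        assert!(decode_surrogate_pair(0xDE00, 0xD83D).is_err());
        Ok(())
    }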
