@@ -215,13 +215,12 @@ txt_token_array_from_lang_kind_string(Arena *arena, TXT_LangKind lang_kind, Stri
215215 next_escaped = 1 ;
216216 }
217217
218- // rjf: take starter, push new token tasks
218+ // rjf: take token starters
219+ B32 new_token_needed = (top_task == 0 );
220+ TXT_TokenizerRule * new_rule = nil_rule ;
221+ TXT_TokenKind new_token_kind = TXT_TokenKind_Null ;
219222 {
220- TXT_TokenizerRule * new_rule = nil_rule ;
221- TXT_TokenKind new_token_kind = TXT_TokenKind_Null ;
222-
223223 // rjf: use next bytes to look up a rule from the table
224- if (top_task == 0 )
225224 {
226225 TXT_TokenizerRule * active_rule = top_task ? top_task -> rule : nil_rule ;
227226 U64 hash_1byte = u64_hash_from_str8 (string_1byte );
@@ -292,37 +291,11 @@ txt_token_array_from_lang_kind_string(Arena *arena, TXT_LangKind lang_kind, Stri
292291 byte == '>' || byte == '/' ||
293292 byte == '?' || byte == '|' ) { new_token_kind = TXT_TokenKind_Symbol ; }
294293 }
295-
296- // rjf: start new token
297- if (new_token_kind != TXT_TokenKind_Null )
298- {
299- TokenTask * task = free_task ;
300- if (task != 0 )
301- {
302- SLLStackPop (free_task );
303- }
304- else
305- {
306- task = push_array (scratch .arena , TokenTask , 1 );
307- }
308- SLLStackPush (top_task , task );
309- top_task -> rule = new_rule ;
310- top_task -> kind = new_token_kind ;
311- top_task -> start_idx = idx ;
312- }
313-
314- // rjf: invalid token kind -> emit error
315- else if (top_task == 0 )
316- {
317- TXT_Token token = {TXT_TokenKind_Error , r1u64 (idx , idx + 1 )};
318- txt_token_chunk_list_push (scratch .arena , & tokens , chunk_size , & token );
319- }
320294 }
321295
322296 // rjf: look for ender based on rule's closing symbol
323297 U64 ender_pad = 0 ;
324298 B32 ender_found = 0 ;
325- B32 task_pop = 0 ;
326299 if (top_task != 0 && idx > top_task -> start_idx )
327300 {
328301 TXT_TokenKind active_token_kind = top_task -> kind ;
@@ -373,13 +346,33 @@ txt_token_array_from_lang_kind_string(Arena *arena, TXT_LangKind lang_kind, Stri
373346 }
374347 }
375348
376- // rjf: next byte is ender => emit token
349+ // rjf: if a new token starts here while another token is still active, then
350+ // end the active token at this byte; unless an ender was also found, keep its
350+ // task on the stack so it restarts after the emitted token
351+ B32 keep_top_task = 0 ;
352+ if (top_task != 0 && new_token_kind != TXT_TokenKind_Null && idx > top_task -> start_idx )
353+ {
354+ keep_top_task = (!ender_found );
355+ ender_found = 1 ;
356+ }
357+
358+ // rjf: if we have an ender => emit token(s) for current task
377359 if (ender_found )
378360 {
379361 TXT_Token token = {top_task -> kind , r1u64 (top_task -> start_idx , idx + ender_pad )};
380- TokenTask * popped = top_task ;
381- SLLStackPop (top_task );
382- SLLStackPush (free_task , popped );
362+ if (!keep_top_task )
363+ {
364+ TokenTask * popped = top_task ;
365+ SLLStackPop (top_task );
366+ SLLStackPush (free_task , popped );
367+ if (top_task != 0 )
368+ {
369+ top_task -> start_idx = idx + ender_pad ;
370+ }
371+ }
372+ else
373+ {
374+ top_task -> start_idx = idx + ender_pad ;
375+ }
383376
384377 // rjf: trim \r's off of end
385378 {
@@ -442,16 +435,43 @@ txt_token_array_from_lang_kind_string(Arena *arena, TXT_LangKind lang_kind, Stri
442435 {
443436 txt_token_chunk_list_push (scratch .arena , & tokens , chunk_size , & token );
444437 }
445-
446- // rjf: increment by ender padding
447- idx += ender_pad ;
438+ }
439+
440+ // rjf: start new token
441+ if (new_token_kind != TXT_TokenKind_Null )
442+ {
443+ TokenTask * task = free_task ;
444+ if (task != 0 )
445+ {
446+ SLLStackPop (free_task );
447+ }
448+ else
449+ {
450+ task = push_array (scratch .arena , TokenTask , 1 );
451+ }
452+ SLLStackPush (top_task , task );
453+ top_task -> rule = new_rule ;
454+ top_task -> kind = new_token_kind ;
455+ top_task -> start_idx = idx ;
456+ }
457+
458+ // rjf: no task was active at the start of this step & no starter matched -> emit 1-byte error token
459+ else if (new_token_needed )
460+ {
461+ TXT_Token token = {TXT_TokenKind_Error , r1u64 (idx , idx + 1 )};
462+ txt_token_chunk_list_push (scratch .arena , & tokens , chunk_size , & token );
448463 }
449464
450465 // rjf: advance by 1 byte if we haven't found an ender
451466 if (!ender_found )
452467 {
453468 idx += 1 ;
454469 }
470+
471+ // rjf: advance by ender padding
472+ idx += ender_pad ;
473+
474+ // rjf: advance escaping
455475 escaped = next_escaped ;
456476 }
457477 }
0 commit comments