@@ -47,6 +47,10 @@ txt_lang_kind_from_extension(String8 extension)
4747 {
4848 kind = TXT_LangKind_Zig ;
4949 }
50+ else if (str8_match (extension , str8_lit ("rs" ), StringMatchFlag_CaseInsensitive ))
51+ {
52+ kind = TXT_LangKind_Rust ;
53+ }
5054 return kind ;
5155}
5256
@@ -65,6 +69,7 @@ txt_extension_from_lang_kind(TXT_LangKind kind)
6569 case TXT_LangKind_Odin : {result = str8_lit ("odin" );}break ;
6670 case TXT_LangKind_Jai : {result = str8_lit ("jai" );}break ;
6771 case TXT_LangKind_Zig : {result = str8_lit ("zig" );}break ;
72+ case TXT_LangKind_Rust : {result = str8_lit ("rs" );}break ;
6873 }
6974 return result ;
7075}
@@ -93,6 +98,7 @@ txt_lex_function_from_lang_kind(TXT_LangKind kind)
9398 case TXT_LangKind_Odin : {fn = txt_token_array_from_string__odin ;}break ;
9499 case TXT_LangKind_Jai : {fn = txt_token_array_from_string__jai ;}break ;
95100 case TXT_LangKind_Zig : {fn = txt_token_array_from_string__zig ;}break ;
101+ case TXT_LangKind_Rust : {fn = txt_token_array_from_string__rust ;}break ;
96102 case TXT_LangKind_DisasmX64Intel :{fn = txt_token_array_from_string__disasm_x64_intel ;}break ;
97103 }
98104 return fn ;
@@ -1368,6 +1374,370 @@ txt_token_array_from_string__zig(Arena *arena, U64 *bytes_processed_counter, Str
13681374 return result ;
13691375}
13701376
1377+ internal TXT_TokenArray
1378+ txt_token_array_from_string__rust (Arena * arena , U64 * bytes_processed_counter , String8 string )
1379+ {
1380+ // NOTE(spey): Rust supports unicode identifiers. They are not handled in any way here,
1381+ // but it might be worth looking into in the future.
1382+
1383+ Temp scratch = scratch_begin (& arena , 1 );
1384+
1385+ //- rjf: generate token list
1386+ TXT_TokenChunkList tokens = {0 };
1387+ {
1388+ S32 multiline_comment_nesting_level = 0 ;
1389+ S32 raw_string_nesting_level = 0 ;
1390+ S32 raw_string_ender_nesting_level = 0 ;
1391+
1392+ // NOTE(spey): Rust's syntax is designed in such a way that we can't be sure what a token
1393+ // is immediately from the first character, so we have to keep track of some possibilities.
1394+ B32 token_may_be_char = 0 ;
1395+ B32 token_may_be_lifetime = 0 ;
1396+ B32 token_may_be_string = 0 ;
1397+
1398+ TXT_TokenKind active_token_kind = TXT_TokenKind_Null ;
1399+ U64 active_token_start_idx = 0 ;
1400+ B32 escaped = 0 ;
1401+ B32 next_escaped = 0 ;
1402+ U64 byte_process_start_idx = 0 ;
1403+ for (U64 idx = 0 ; idx <= string .size ;)
1404+ {
1405+ U8 byte = (idx + 0 < string .size ) ? (string .str [idx + 0 ]) : 0 ;
1406+ U8 next_byte = (idx + 1 < string .size ) ? (string .str [idx + 1 ]) : 0 ;
1407+
1408+ // rjf: update counter
1409+ if (bytes_processed_counter != 0 && ((idx - byte_process_start_idx ) >= 1000 || idx == string .size ))
1410+ {
1411+ ins_atomic_u64_add_eval (bytes_processed_counter , (idx - byte_process_start_idx ));
1412+ byte_process_start_idx = idx ;
1413+ }
1414+
1415+ // rjf: escaping
1416+ if (escaped && (byte != '\r' && byte != '\n' ))
1417+ {
1418+ next_escaped = 0 ;
1419+ }
1420+ else if (!escaped && byte == '\\' )
1421+ {
1422+ next_escaped = 1 ;
1423+ }
1424+
1425+ // rjf: take starter, determine active token kind
1426+ U64 starter_pad = 0 ;
1427+
1428+ // spey: special case of starter for nested comments
1429+ if (active_token_kind == TXT_TokenKind_Comment )
1430+ {
1431+ if (byte == '/' && next_byte == '*' ) { active_token_kind = TXT_TokenKind_Comment ; multiline_comment_nesting_level ++ ; starter_pad = 1 ; }
1432+ }
1433+ // spey: special case of starter for raw string literals
1434+ else if (active_token_kind == TXT_TokenKind_Identifier && token_may_be_string )
1435+ {
1436+ if (0 ){}
1437+ else if (byte == 'r' && next_byte == '#' ) {} // spey: still an identifier that may be a string (this branch triggers for raw byte/C string literals)
1438+ else if (byte == '#' && next_byte == '"' ) { active_token_kind = TXT_TokenKind_String ; token_may_be_string = 0 ; token_may_be_char = 0 ; raw_string_nesting_level ++ ; starter_pad = 2 ; }
1439+ else if (byte == '#' && next_byte == '#' ) { raw_string_nesting_level ++ ; }
1440+ else { token_may_be_string = 0 ; token_may_be_char = 0 ; raw_string_nesting_level = 0 ; } // spey: confirmed raw identifier
1441+ }
1442+ // spey: regular cases
1443+ else if (active_token_kind == TXT_TokenKind_Null )
1444+ {
1445+ // rjf: use next bytes to start a new token
1446+ if (0 ){}
1447+ else if (char_is_space (byte )) { active_token_kind = TXT_TokenKind_Whitespace ; }
1448+ else if (byte == 'r' && next_byte == '#' ) { active_token_kind = TXT_TokenKind_Identifier ; token_may_be_string = 1 ; } // spey: either raw identifiers or raw string literals
1449+ else if (char_is_digit (byte , 10 ) ||
1450+ (byte == '.' &&
1451+ char_is_digit (next_byte , 10 ))) { active_token_kind = TXT_TokenKind_Numeric ; }
1452+ else if (byte == '"' ) { active_token_kind = TXT_TokenKind_String ; token_may_be_char = 0 ; }
1453+ else if ((byte == 'c' || byte == 'b' ) &&
1454+ next_byte == '"' ) { active_token_kind = TXT_TokenKind_String ; token_may_be_char = 0 ; starter_pad = 1 ; }
1455+ else if ((byte == 'c' || byte == 'b' ) &&
1456+ next_byte == 'r' ) { active_token_kind = TXT_TokenKind_Identifier ; token_may_be_string = 1 ; }
1457+ else if (byte == '_' ||
1458+ char_is_alpha (byte )) { active_token_kind = TXT_TokenKind_Identifier ; }
1459+ else if (byte == '/' && next_byte == '/' ) { active_token_kind = TXT_TokenKind_Comment ; starter_pad = 1 ; }
1460+ else if (byte == '/' && next_byte == '*' ) { active_token_kind = TXT_TokenKind_Comment ; starter_pad = 1 ; multiline_comment_nesting_level ++ ; }
1461+ else if (byte == '~' || byte == '!' ||
1462+ byte == '%' || byte == '^' ||
1463+ byte == '&' || byte == '*' ||
1464+ byte == '(' || byte == ')' ||
1465+ byte == '-' || byte == '=' ||
1466+ byte == '+' || byte == '[' ||
1467+ byte == ']' || byte == '{' ||
1468+ byte == '}' || byte == ':' ||
1469+ byte == ';' || byte == ',' ||
1470+ byte == '.' || byte == '<' ||
1471+ byte == '>' || byte == '/' ||
1472+ byte == '?' || byte == '|' ) { active_token_kind = TXT_TokenKind_Symbol ; }
1473+ else if (byte == '\'' ) { active_token_kind = TXT_TokenKind_String ; token_may_be_char = 1 ; token_may_be_lifetime = 1 ; }
1474+ else if ((byte == 'c' || byte == 'b' ) &&
1475+ next_byte == '\'' ) { active_token_kind = TXT_TokenKind_String ; token_may_be_char = 1 ; starter_pad = 1 ; }
1476+
1477+ // rjf: start new token
1478+ if (active_token_kind != TXT_TokenKind_Null )
1479+ {
1480+ active_token_start_idx = idx ;
1481+ }
1482+
1483+ // rjf: invalid token kind -> emit error
1484+ else
1485+ {
1486+ TXT_Token token = {TXT_TokenKind_Error , r1u64 (idx , idx + 1 )};
1487+ txt_token_chunk_list_push (scratch .arena , & tokens , 4096 , & token );
1488+ }
1489+ }
1490+
1491+ B32 is_on_starter = idx <= active_token_start_idx || token_may_be_string ;
1492+
1493+ // spey: advance by starter padding byte(s) and reset byte/next_byte values
1494+ idx += starter_pad ;
1495+ byte = (idx + 0 < string .size ) ? (string .str [idx + 0 ]) : 0 ;
1496+ next_byte = (idx + 1 < string .size ) ? (string .str [idx + 1 ]) : 0 ;
1497+
1498+ // rjf: look for ender
1499+ U64 ender_pad = 0 ;
1500+ B32 ender_found = 0 ;
1501+ if (active_token_kind != TXT_TokenKind_Null && !is_on_starter )
1502+ {
1503+ if (idx == string .size )
1504+ {
1505+ ender_pad = 0 ;
1506+ ender_found = 1 ;
1507+ }
1508+ else switch (active_token_kind )
1509+ {
1510+ default :break ;
1511+ case TXT_TokenKind_Whitespace :
1512+ {
1513+ ender_found = !char_is_space (byte );
1514+ }break ;
1515+ case TXT_TokenKind_Identifier :
1516+ {
1517+ ender_found = (!char_is_alpha (byte ) && !char_is_digit (byte , 10 ) && byte != '_' && byte != '$' && byte != '#' && byte != '!' && byte < 128 );
1518+ }break ;
1519+ case TXT_TokenKind_Numeric :
1520+ {
1521+ ender_found = (!char_is_alpha (byte ) && !char_is_digit (byte , 10 ) && byte != '_' && byte != '.' && byte != '\'' );
1522+ }break ;
1523+ case TXT_TokenKind_String :
1524+ {
1525+ if (!escaped )
1526+ {
1527+ if (token_may_be_char )
1528+ {
1529+ if (byte == '\'' )
1530+ {
1531+ // spey: char ending
1532+ ender_found = 1 ;
1533+ }
1534+ else if (token_may_be_lifetime && !char_is_alpha (byte ) && !char_is_digit (byte , 10 ) && byte != '_' && byte < 128 )
1535+ {
1536+ // spey: lifetime ending
1537+ ender_found = 1 ;
1538+ }
1539+ }
1540+ else
1541+ {
1542+ if (0 ){}
1543+
1544+ // spey: regular string
1545+ else if (raw_string_nesting_level == 0 ) { ender_found = byte == '"' ; }
1546+
1547+ // spey: raw string
1548+ else if (byte == '"' && next_byte == '#' &&
1549+ raw_string_ender_nesting_level == 0 ) { raw_string_ender_nesting_level ++ ; }
1550+ else if (byte == '#' && next_byte != '#' &&
1551+ raw_string_ender_nesting_level == raw_string_nesting_level &&
1552+ raw_string_ender_nesting_level >= 0 ) { ender_found = 1 ; raw_string_nesting_level = 0 ; raw_string_ender_nesting_level = 0 ; }
1553+ else if (byte == '#' && next_byte != '#' &&
1554+ raw_string_ender_nesting_level >= 0 ) { raw_string_ender_nesting_level = 0 ; }
1555+ else if (byte == '#' &&
1556+ raw_string_ender_nesting_level >= 0 ) { raw_string_ender_nesting_level ++ ; }
1557+ }
1558+ }
1559+
1560+ ender_pad += 1 ;
1561+ }break ;
1562+ case TXT_TokenKind_Symbol :
1563+ {
1564+ ender_found = (byte != '~' && byte != '!' &&
1565+ byte != '%' && byte != '^' &&
1566+ byte != '&' && byte != '*' &&
1567+ byte != '(' && byte != ')' &&
1568+ byte != '-' && byte != '=' &&
1569+ byte != '+' && byte != '[' &&
1570+ byte != ']' && byte != '{' &&
1571+ byte != '}' && byte != ':' &&
1572+ byte != ';' && byte != ',' &&
1573+ byte != '.' && byte != '<' &&
1574+ byte != '>' && byte != '/' &&
1575+ byte != '?' && byte != '|' );
1576+ }break ;
1577+ case TXT_TokenKind_Comment :
1578+ {
1579+ if (multiline_comment_nesting_level == 0 )
1580+ {
1581+ ender_found = (byte == '\r' || byte == '\n' );
1582+ }
1583+ else
1584+ {
1585+ if (byte == '*' && next_byte == '/' )
1586+ multiline_comment_nesting_level -- ;
1587+
1588+ ender_found = (active_token_start_idx + 1 < idx && multiline_comment_nesting_level == 0 );
1589+ ender_pad += 2 ;
1590+ }
1591+ }break ;
1592+ }
1593+ }
1594+
1595+ // rjf: next byte is ender => emit token
1596+ if (ender_found )
1597+ {
1598+ TXT_Token token = {active_token_kind , r1u64 (active_token_start_idx , idx + ender_pad )};
1599+ active_token_kind = TXT_TokenKind_Null ;
1600+
1601+ // rjf: identifier -> keyword in special cases
1602+ if (token .kind == TXT_TokenKind_Identifier )
1603+ {
1604+ read_only local_persist String8 rust_keywords [] =
1605+ {
1606+ str8_lit_comp ("as" ),
1607+ str8_lit_comp ("break" ),
1608+ str8_lit_comp ("const" ),
1609+ str8_lit_comp ("continue" ),
1610+ str8_lit_comp ("crate" ),
1611+ str8_lit_comp ("else" ),
1612+ str8_lit_comp ("enum" ),
1613+ str8_lit_comp ("extern" ),
1614+ str8_lit_comp ("false" ),
1615+ str8_lit_comp ("fn" ),
1616+ str8_lit_comp ("for" ),
1617+ str8_lit_comp ("if" ),
1618+ str8_lit_comp ("impl" ),
1619+ str8_lit_comp ("in" ),
1620+ str8_lit_comp ("let" ),
1621+ str8_lit_comp ("loop" ),
1622+ str8_lit_comp ("match" ),
1623+ str8_lit_comp ("mod" ),
1624+ str8_lit_comp ("move" ),
1625+ str8_lit_comp ("mut" ),
1626+ str8_lit_comp ("pub" ),
1627+ str8_lit_comp ("ref" ),
1628+ str8_lit_comp ("return" ),
1629+ str8_lit_comp ("self" ),
1630+ str8_lit_comp ("Self" ),
1631+ str8_lit_comp ("static" ),
1632+ str8_lit_comp ("struct" ),
1633+ str8_lit_comp ("super" ),
1634+ str8_lit_comp ("trait" ),
1635+ str8_lit_comp ("true" ),
1636+ str8_lit_comp ("type" ),
1637+ str8_lit_comp ("unsafe" ),
1638+ str8_lit_comp ("use" ),
1639+ str8_lit_comp ("where" ),
1640+ str8_lit_comp ("while" ),
1641+ str8_lit_comp ("yield" ),
1642+ str8_lit_comp ("async" ),
1643+ str8_lit_comp ("await" ),
1644+ str8_lit_comp ("dyn" ),
1645+
1646+ // weak keywords
1647+ str8_lit_comp ("macro_rules" ),
1648+ str8_lit_comp ("raw" ),
1649+ str8_lit_comp ("safe" ),
1650+ str8_lit_comp ("union" ),
1651+ };
1652+ String8 token_string = str8_substr (string , r1u64 (active_token_start_idx , idx + ender_pad ));
1653+ for (U64 keyword_idx = 0 ; keyword_idx < ArrayCount (rust_keywords ); keyword_idx += 1 )
1654+ {
1655+ if (str8_match (rust_keywords [keyword_idx ], token_string , 0 ))
1656+ {
1657+ token .kind = TXT_TokenKind_Keyword ;
1658+ break ;
1659+ }
1660+ }
1661+ txt_token_chunk_list_push (scratch .arena , & tokens , 4096 , & token );
1662+ }
1663+
1664+ // rjf: split symbols by maximum-munch-rule
1665+ else if (token .kind == TXT_TokenKind_Symbol )
1666+ {
1667+ read_only local_persist String8 rust_multichar_symbol_strings [] =
1668+ {
1669+ str8_lit_comp ("<<" ),
1670+ str8_lit_comp (">>" ),
1671+ str8_lit_comp ("<=" ),
1672+ str8_lit_comp (">=" ),
1673+ str8_lit_comp ("==" ),
1674+ str8_lit_comp ("!=" ),
1675+ str8_lit_comp ("&&" ),
1676+ str8_lit_comp ("||" ),
1677+ str8_lit_comp ("|=" ),
1678+ str8_lit_comp ("&=" ),
1679+ str8_lit_comp ("^=" ),
1680+ str8_lit_comp ("~=" ),
1681+ str8_lit_comp ("+=" ),
1682+ str8_lit_comp ("-=" ),
1683+ str8_lit_comp ("*=" ),
1684+ str8_lit_comp ("/=" ),
1685+ str8_lit_comp ("%=" ),
1686+ str8_lit_comp ("<<=" ),
1687+ str8_lit_comp (">>=" ),
1688+ str8_lit_comp ("->" ),
1689+ };
1690+ String8 token_string = str8_substr (string , r1u64 (active_token_start_idx , idx + ender_pad ));
1691+ for (U64 off = 0 , next_off = token_string .size ; off < token_string .size ; off = next_off )
1692+ {
1693+ B32 found = 0 ;
1694+ for (U64 idx = 0 ; idx < ArrayCount (rust_multichar_symbol_strings ); idx += 1 )
1695+ {
1696+ if (str8_match (str8_substr (token_string , r1u64 (off , off + rust_multichar_symbol_strings [idx ].size )),
1697+ rust_multichar_symbol_strings [idx ],
1698+ 0 ))
1699+ {
1700+ found = 1 ;
1701+ next_off = off + rust_multichar_symbol_strings [idx ].size ;
1702+ TXT_Token token = {TXT_TokenKind_Symbol , r1u64 (active_token_start_idx + off , active_token_start_idx + next_off )};
1703+ txt_token_chunk_list_push (scratch .arena , & tokens , 4096 , & token );
1704+ break ;
1705+ }
1706+ }
1707+ if (!found )
1708+ {
1709+ next_off = off + 1 ;
1710+ TXT_Token token = {TXT_TokenKind_Symbol , r1u64 (active_token_start_idx + off , active_token_start_idx + next_off )};
1711+ txt_token_chunk_list_push (scratch .arena , & tokens , 4096 , & token );
1712+ }
1713+ }
1714+ }
1715+
1716+ // rjf: all other tokens
1717+ else
1718+ {
1719+ txt_token_chunk_list_push (scratch .arena , & tokens , 4096 , & token );
1720+ }
1721+
1722+ // rjf: increment by starter and ender padding
1723+ idx += ender_pad ;
1724+ }
1725+
1726+ // rjf: advance by 1 byte if we haven't found an ender
1727+ if (!ender_found )
1728+ {
1729+ idx += 1 ;
1730+ }
1731+ escaped = next_escaped ;
1732+ }
1733+ }
1734+
1735+ //- rjf: token list -> token array
1736+ TXT_TokenArray result = txt_token_array_from_chunk_list (arena , & tokens );
1737+ scratch_end (scratch );
1738+ return result ;
1739+ }
1740+
13711741internal TXT_TokenArray
13721742txt_token_array_from_string__disasm_x64_intel (Arena * arena , U64 * bytes_processed_counter , String8 string )
13731743{
0 commit comments