Skip to content

Commit 27d01bc

Browse files
Speykious authored and ryanfleury committed
Implement Rust syntax highlighting
1 parent 1ffd662 commit 27d01bc

File tree

2 files changed

+372
-0
lines changed

2 files changed

+372
-0
lines changed

src/text/text.c

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ txt_lang_kind_from_extension(String8 extension)
4747
{
4848
kind = TXT_LangKind_Zig;
4949
}
50+
else if(str8_match(extension, str8_lit("rs"), StringMatchFlag_CaseInsensitive))
51+
{
52+
kind = TXT_LangKind_Rust;
53+
}
5054
return kind;
5155
}
5256

@@ -65,6 +69,7 @@ txt_extension_from_lang_kind(TXT_LangKind kind)
6569
case TXT_LangKind_Odin: {result = str8_lit("odin");}break;
6670
case TXT_LangKind_Jai: {result = str8_lit("jai");}break;
6771
case TXT_LangKind_Zig: {result = str8_lit("zig");}break;
72+
case TXT_LangKind_Rust: {result = str8_lit("rs");}break;
6873
}
6974
return result;
7075
}
@@ -93,6 +98,7 @@ txt_lex_function_from_lang_kind(TXT_LangKind kind)
9398
case TXT_LangKind_Odin: {fn = txt_token_array_from_string__odin;}break;
9499
case TXT_LangKind_Jai: {fn = txt_token_array_from_string__jai;}break;
95100
case TXT_LangKind_Zig: {fn = txt_token_array_from_string__zig;}break;
101+
case TXT_LangKind_Rust: {fn = txt_token_array_from_string__rust;}break;
96102
case TXT_LangKind_DisasmX64Intel:{fn = txt_token_array_from_string__disasm_x64_intel;}break;
97103
}
98104
return fn;
@@ -1368,6 +1374,370 @@ txt_token_array_from_string__zig(Arena *arena, U64 *bytes_processed_counter, Str
13681374
return result;
13691375
}
13701376

1377+
internal TXT_TokenArray
1378+
txt_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, String8 string)
1379+
{
1380+
// NOTE(spey): Rust supports unicode identifiers. They are not handled in any way here,
1381+
// but it might be worth looking into in the future.
1382+
1383+
Temp scratch = scratch_begin(&arena, 1);
1384+
1385+
//- rjf: generate token list
1386+
TXT_TokenChunkList tokens = {0};
1387+
{
1388+
S32 multiline_comment_nesting_level = 0;
1389+
S32 raw_string_nesting_level = 0;
1390+
S32 raw_string_ender_nesting_level = 0;
1391+
1392+
// NOTE(spey): Rust's syntax is designed in such a way that we can't be sure what a token
1393+
// is immediately from the first character, so we have to keep track of some possibilities.
1394+
B32 token_may_be_char = 0;
1395+
B32 token_may_be_lifetime = 0;
1396+
B32 token_may_be_string = 0;
1397+
1398+
TXT_TokenKind active_token_kind = TXT_TokenKind_Null;
1399+
U64 active_token_start_idx = 0;
1400+
B32 escaped = 0;
1401+
B32 next_escaped = 0;
1402+
U64 byte_process_start_idx = 0;
1403+
for(U64 idx = 0; idx <= string.size;)
1404+
{
1405+
U8 byte = (idx+0 < string.size) ? (string.str[idx+0]) : 0;
1406+
U8 next_byte = (idx+1 < string.size) ? (string.str[idx+1]) : 0;
1407+
1408+
// rjf: update counter
1409+
if(bytes_processed_counter != 0 && ((idx-byte_process_start_idx) >= 1000 || idx == string.size))
1410+
{
1411+
ins_atomic_u64_add_eval(bytes_processed_counter, (idx-byte_process_start_idx));
1412+
byte_process_start_idx = idx;
1413+
}
1414+
1415+
// rjf: escaping
1416+
if(escaped && (byte != '\r' && byte != '\n'))
1417+
{
1418+
next_escaped = 0;
1419+
}
1420+
else if(!escaped && byte == '\\')
1421+
{
1422+
next_escaped = 1;
1423+
}
1424+
1425+
// rjf: take starter, determine active token kind
1426+
U64 starter_pad = 0;
1427+
1428+
// spey: special case of starter for nested comments
1429+
if(active_token_kind == TXT_TokenKind_Comment)
1430+
{
1431+
if(byte == '/' && next_byte == '*') { active_token_kind = TXT_TokenKind_Comment; multiline_comment_nesting_level++; starter_pad = 1; }
1432+
}
1433+
// spey: special case of starter for raw string literals
1434+
else if(active_token_kind == TXT_TokenKind_Identifier && token_may_be_string)
1435+
{
1436+
if(0){}
1437+
else if(byte == 'r' && next_byte == '#') {} // spey: still an identifier that may be a string (this branch triggers for raw byte/C string literals)
1438+
else if(byte == '#' && next_byte == '"') { active_token_kind = TXT_TokenKind_String; token_may_be_string = 0; token_may_be_char = 0; raw_string_nesting_level++; starter_pad = 2; }
1439+
else if(byte == '#' && next_byte == '#') { raw_string_nesting_level++; }
1440+
else { token_may_be_string = 0; token_may_be_char = 0; raw_string_nesting_level = 0; } // spey: confirmed raw identifier
1441+
}
1442+
// spey: regular cases
1443+
else if(active_token_kind == TXT_TokenKind_Null)
1444+
{
1445+
// rjf: use next bytes to start a new token
1446+
if(0){}
1447+
else if(char_is_space(byte)) { active_token_kind = TXT_TokenKind_Whitespace; }
1448+
else if(byte == 'r' && next_byte == '#') { active_token_kind = TXT_TokenKind_Identifier; token_may_be_string = 1; } // spey: either raw identifiers or raw string literals
1449+
else if(char_is_digit(byte, 10) ||
1450+
(byte == '.' &&
1451+
char_is_digit(next_byte, 10))) { active_token_kind = TXT_TokenKind_Numeric; }
1452+
else if(byte == '"') { active_token_kind = TXT_TokenKind_String; token_may_be_char = 0; }
1453+
else if((byte == 'c' || byte == 'b') &&
1454+
next_byte == '"') { active_token_kind = TXT_TokenKind_String; token_may_be_char = 0; starter_pad = 1; }
1455+
else if((byte == 'c' || byte == 'b') &&
1456+
next_byte == 'r') { active_token_kind = TXT_TokenKind_Identifier; token_may_be_string = 1; }
1457+
else if(byte == '_' ||
1458+
char_is_alpha(byte)) { active_token_kind = TXT_TokenKind_Identifier; }
1459+
else if(byte == '/' && next_byte == '/') { active_token_kind = TXT_TokenKind_Comment; starter_pad = 1; }
1460+
else if(byte == '/' && next_byte == '*') { active_token_kind = TXT_TokenKind_Comment; starter_pad = 1; multiline_comment_nesting_level++; }
1461+
else if(byte == '~' || byte == '!' ||
1462+
byte == '%' || byte == '^' ||
1463+
byte == '&' || byte == '*' ||
1464+
byte == '(' || byte == ')' ||
1465+
byte == '-' || byte == '=' ||
1466+
byte == '+' || byte == '[' ||
1467+
byte == ']' || byte == '{' ||
1468+
byte == '}' || byte == ':' ||
1469+
byte == ';' || byte == ',' ||
1470+
byte == '.' || byte == '<' ||
1471+
byte == '>' || byte == '/' ||
1472+
byte == '?' || byte == '|') { active_token_kind = TXT_TokenKind_Symbol; }
1473+
else if(byte == '\'') { active_token_kind = TXT_TokenKind_String; token_may_be_char = 1; token_may_be_lifetime = 1; }
1474+
else if((byte == 'c' || byte == 'b') &&
1475+
next_byte == '\'') { active_token_kind = TXT_TokenKind_String; token_may_be_char = 1; starter_pad = 1; }
1476+
1477+
// rjf: start new token
1478+
if(active_token_kind != TXT_TokenKind_Null)
1479+
{
1480+
active_token_start_idx = idx;
1481+
}
1482+
1483+
// rjf: invalid token kind -> emit error
1484+
else
1485+
{
1486+
TXT_Token token = {TXT_TokenKind_Error, r1u64(idx, idx+1)};
1487+
txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token);
1488+
}
1489+
}
1490+
1491+
B32 is_on_starter = idx <= active_token_start_idx || token_may_be_string;
1492+
1493+
// spey: advance by starter padding byte(s) and reset byte/next_byte values
1494+
idx += starter_pad;
1495+
byte = (idx+0 < string.size) ? (string.str[idx+0]) : 0;
1496+
next_byte = (idx+1 < string.size) ? (string.str[idx+1]) : 0;
1497+
1498+
// rjf: look for ender
1499+
U64 ender_pad = 0;
1500+
B32 ender_found = 0;
1501+
if(active_token_kind != TXT_TokenKind_Null && !is_on_starter)
1502+
{
1503+
if(idx == string.size)
1504+
{
1505+
ender_pad = 0;
1506+
ender_found = 1;
1507+
}
1508+
else switch(active_token_kind)
1509+
{
1510+
default:break;
1511+
case TXT_TokenKind_Whitespace:
1512+
{
1513+
ender_found = !char_is_space(byte);
1514+
}break;
1515+
case TXT_TokenKind_Identifier:
1516+
{
1517+
ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '$' && byte != '#' && byte != '!' && byte < 128);
1518+
}break;
1519+
case TXT_TokenKind_Numeric:
1520+
{
1521+
ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '.' && byte != '\'');
1522+
}break;
1523+
case TXT_TokenKind_String:
1524+
{
1525+
if(!escaped)
1526+
{
1527+
if(token_may_be_char)
1528+
{
1529+
if(byte == '\'')
1530+
{
1531+
// spey: char ending
1532+
ender_found = 1;
1533+
}
1534+
else if(token_may_be_lifetime && !char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte < 128)
1535+
{
1536+
// spey: lifetime ending
1537+
ender_found = 1;
1538+
}
1539+
}
1540+
else
1541+
{
1542+
if(0){}
1543+
1544+
// spey: regular string
1545+
else if(raw_string_nesting_level == 0) { ender_found = byte == '"'; }
1546+
1547+
// spey: raw string
1548+
else if(byte == '"' && next_byte == '#' &&
1549+
raw_string_ender_nesting_level == 0) { raw_string_ender_nesting_level++; }
1550+
else if(byte == '#' && next_byte != '#' &&
1551+
raw_string_ender_nesting_level == raw_string_nesting_level &&
1552+
raw_string_ender_nesting_level >= 0) { ender_found = 1; raw_string_nesting_level = 0; raw_string_ender_nesting_level = 0; }
1553+
else if(byte == '#' && next_byte != '#' &&
1554+
raw_string_ender_nesting_level >= 0) { raw_string_ender_nesting_level = 0; }
1555+
else if(byte == '#' &&
1556+
raw_string_ender_nesting_level >= 0) { raw_string_ender_nesting_level++; }
1557+
}
1558+
}
1559+
1560+
ender_pad += 1;
1561+
}break;
1562+
case TXT_TokenKind_Symbol:
1563+
{
1564+
ender_found = (byte != '~' && byte != '!' &&
1565+
byte != '%' && byte != '^' &&
1566+
byte != '&' && byte != '*' &&
1567+
byte != '(' && byte != ')' &&
1568+
byte != '-' && byte != '=' &&
1569+
byte != '+' && byte != '[' &&
1570+
byte != ']' && byte != '{' &&
1571+
byte != '}' && byte != ':' &&
1572+
byte != ';' && byte != ',' &&
1573+
byte != '.' && byte != '<' &&
1574+
byte != '>' && byte != '/' &&
1575+
byte != '?' && byte != '|');
1576+
}break;
1577+
case TXT_TokenKind_Comment:
1578+
{
1579+
if(multiline_comment_nesting_level == 0)
1580+
{
1581+
ender_found = (byte == '\r' || byte == '\n');
1582+
}
1583+
else
1584+
{
1585+
if (byte == '*' && next_byte == '/')
1586+
multiline_comment_nesting_level--;
1587+
1588+
ender_found = (active_token_start_idx+1 < idx && multiline_comment_nesting_level == 0);
1589+
ender_pad += 2;
1590+
}
1591+
}break;
1592+
}
1593+
}
1594+
1595+
// rjf: next byte is ender => emit token
1596+
if(ender_found)
1597+
{
1598+
TXT_Token token = {active_token_kind, r1u64(active_token_start_idx, idx+ender_pad)};
1599+
active_token_kind = TXT_TokenKind_Null;
1600+
1601+
// rjf: identifier -> keyword in special cases
1602+
if(token.kind == TXT_TokenKind_Identifier)
1603+
{
1604+
read_only local_persist String8 rust_keywords[] =
1605+
{
1606+
str8_lit_comp("as"),
1607+
str8_lit_comp("break"),
1608+
str8_lit_comp("const"),
1609+
str8_lit_comp("continue"),
1610+
str8_lit_comp("crate"),
1611+
str8_lit_comp("else"),
1612+
str8_lit_comp("enum"),
1613+
str8_lit_comp("extern"),
1614+
str8_lit_comp("false"),
1615+
str8_lit_comp("fn"),
1616+
str8_lit_comp("for"),
1617+
str8_lit_comp("if"),
1618+
str8_lit_comp("impl"),
1619+
str8_lit_comp("in"),
1620+
str8_lit_comp("let"),
1621+
str8_lit_comp("loop"),
1622+
str8_lit_comp("match"),
1623+
str8_lit_comp("mod"),
1624+
str8_lit_comp("move"),
1625+
str8_lit_comp("mut"),
1626+
str8_lit_comp("pub"),
1627+
str8_lit_comp("ref"),
1628+
str8_lit_comp("return"),
1629+
str8_lit_comp("self"),
1630+
str8_lit_comp("Self"),
1631+
str8_lit_comp("static"),
1632+
str8_lit_comp("struct"),
1633+
str8_lit_comp("super"),
1634+
str8_lit_comp("trait"),
1635+
str8_lit_comp("true"),
1636+
str8_lit_comp("type"),
1637+
str8_lit_comp("unsafe"),
1638+
str8_lit_comp("use"),
1639+
str8_lit_comp("where"),
1640+
str8_lit_comp("while"),
1641+
str8_lit_comp("yield"),
1642+
str8_lit_comp("async"),
1643+
str8_lit_comp("await"),
1644+
str8_lit_comp("dyn"),
1645+
1646+
// weak keywords
1647+
str8_lit_comp("macro_rules"),
1648+
str8_lit_comp("raw"),
1649+
str8_lit_comp("safe"),
1650+
str8_lit_comp("union"),
1651+
};
1652+
String8 token_string = str8_substr(string, r1u64(active_token_start_idx, idx+ender_pad));
1653+
for(U64 keyword_idx = 0; keyword_idx < ArrayCount(rust_keywords); keyword_idx += 1)
1654+
{
1655+
if(str8_match(rust_keywords[keyword_idx], token_string, 0))
1656+
{
1657+
token.kind = TXT_TokenKind_Keyword;
1658+
break;
1659+
}
1660+
}
1661+
txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token);
1662+
}
1663+
1664+
// rjf: split symbols by maximum-munch-rule
1665+
else if(token.kind == TXT_TokenKind_Symbol)
1666+
{
1667+
read_only local_persist String8 rust_multichar_symbol_strings[] =
1668+
{
1669+
str8_lit_comp("<<"),
1670+
str8_lit_comp(">>"),
1671+
str8_lit_comp("<="),
1672+
str8_lit_comp(">="),
1673+
str8_lit_comp("=="),
1674+
str8_lit_comp("!="),
1675+
str8_lit_comp("&&"),
1676+
str8_lit_comp("||"),
1677+
str8_lit_comp("|="),
1678+
str8_lit_comp("&="),
1679+
str8_lit_comp("^="),
1680+
str8_lit_comp("~="),
1681+
str8_lit_comp("+="),
1682+
str8_lit_comp("-="),
1683+
str8_lit_comp("*="),
1684+
str8_lit_comp("/="),
1685+
str8_lit_comp("%="),
1686+
str8_lit_comp("<<="),
1687+
str8_lit_comp(">>="),
1688+
str8_lit_comp("->"),
1689+
};
1690+
String8 token_string = str8_substr(string, r1u64(active_token_start_idx, idx+ender_pad));
1691+
for(U64 off = 0, next_off = token_string.size; off < token_string.size; off = next_off)
1692+
{
1693+
B32 found = 0;
1694+
for(U64 idx = 0; idx < ArrayCount(rust_multichar_symbol_strings); idx += 1)
1695+
{
1696+
if(str8_match(str8_substr(token_string, r1u64(off, off+rust_multichar_symbol_strings[idx].size)),
1697+
rust_multichar_symbol_strings[idx],
1698+
0))
1699+
{
1700+
found = 1;
1701+
next_off = off + rust_multichar_symbol_strings[idx].size;
1702+
TXT_Token token = {TXT_TokenKind_Symbol, r1u64(active_token_start_idx+off, active_token_start_idx+next_off)};
1703+
txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token);
1704+
break;
1705+
}
1706+
}
1707+
if(!found)
1708+
{
1709+
next_off = off+1;
1710+
TXT_Token token = {TXT_TokenKind_Symbol, r1u64(active_token_start_idx+off, active_token_start_idx+next_off)};
1711+
txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token);
1712+
}
1713+
}
1714+
}
1715+
1716+
// rjf: all other tokens
1717+
else
1718+
{
1719+
txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token);
1720+
}
1721+
1722+
// rjf: increment by starter and ender padding
1723+
idx += ender_pad;
1724+
}
1725+
1726+
// rjf: advance by 1 byte if we haven't found an ender
1727+
if(!ender_found)
1728+
{
1729+
idx += 1;
1730+
}
1731+
escaped = next_escaped;
1732+
}
1733+
}
1734+
1735+
//- rjf: token list -> token array
1736+
TXT_TokenArray result = txt_token_array_from_chunk_list(arena, &tokens);
1737+
scratch_end(scratch);
1738+
return result;
1739+
}
1740+
13711741
internal TXT_TokenArray
13721742
txt_token_array_from_string__disasm_x64_intel(Arena *arena, U64 *bytes_processed_counter, String8 string)
13731743
{

0 commit comments

Comments
 (0)