This repository was archived by the owner on Jan 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathTokenizer.cpp
More file actions
169 lines (147 loc) · 3.8 KB
/
Tokenizer.cpp
File metadata and controls
169 lines (147 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#include "Tokenizer.h"
// Builds a tokenizer that reads from `s`, starting at offset 0 with an
// empty token buffer.
//
// The delimiter set treats '(' and ')' exactly like blanks, tabs, carriage
// returns and newlines.
//
// NOTE(review): if `input` is declared as a reference in Tokenizer.h, `s`
// must outlive this object - confirm against the header.
Tokenizer::Tokenizer(string& s)
    : input(s)
{
    position = 0;
    token_string.clear();
    whitespace = " \r\t\n()";
}
// Nothing is released explicitly here; the destructor body is empty.
Tokenizer::~Tokenizer() = default;
// Returns true when `c` is one of the characters stored in `whitespace`
// (the delimiter set assigned in the constructor), false otherwise.
bool Tokenizer::is_whitespace(char c)
{
    return whitespace.find(c) != std::string::npos;
}
// Returns true when `c` is a double quote or a single quote.
bool Tokenizer::is_quote(char c)
{
    switch (c)
    {
    case '"':
    case '\'':
        return true;
    default:
        return false;
    }
}
// Reports whether the two characters following `index` in `input` both
// equal `c`, where `c` must itself be a quote character.
//
// Parameters:
//   index - offset of the candidate opening/closing quote in `input`.
//   c     - the character found at that offset (not re-read from `input`).
//
// Returns false when `c` is not a quote, when `index` is negative, or when
// fewer than two characters follow `index`.
bool Tokenizer::IsTripleQuote(int index, char c)
{
    if (!is_quote(c)) return false;

    // A negative offset can never address a character; rejecting it up front
    // also keeps the arithmetic below free of signed/unsigned mixing.
    if (index < 0) return false;

    const auto start = static_cast<std::string::size_type>(index);
    const auto length = input.length();

    // Need start + 2 to be a valid offset, i.e. start <= length - 3. Written
    // this way so neither side of the comparison can wrap around.
    if (length < 3 || start > length - 3) return false;

    return input[start + 1] == c && input[start + 2] == c;
}
// Produces the next token from `input`, continuing from the current
// `position`. The token buffer is emptied first so that text gathered for a
// previous token does not carry over.
Token Tokenizer::NextToken()
{
    token_string.clear();
    return transition_from_START();
}
// START state: skips delimiter characters, then dispatches on the first
// significant character.
//
//   '#'            -> comment (rest of the line)
//   ':'            -> start of a definition
//   ';'            -> END_DEFINITION
//   '[' / ']'      -> START_ARRAY / END_ARRAY
//   '{'            -> start of a module
//   '}'            -> END_MODULE
//   three quotes   -> triple-quoted string
//   one quote      -> ordinary string
//   anything else  -> word
//
// Returns an EOS token when the input is exhausted.
Token Tokenizer::transition_from_START()
{
    while (position < input.length())
    {
        const char ch = input[position++];

        if (is_whitespace(ch))
        {
            continue;
        }

        switch (ch)
        {
        case '#': return transition_from_COMMENT();
        case ':': return transition_from_START_DEFINITION();
        case ';': return Token(TokenType::END_DEFINITION);
        case '[': return Token(TokenType::START_ARRAY);
        case ']': return Token(TokenType::END_ARRAY);
        case '{': return transition_from_GATHER_MODULE();
        case '}': return Token(TokenType::END_MODULE);
        default:  break;
        }

        // `ch` has already been consumed, so it sits at position - 1.
        if (IsTripleQuote(position - 1, ch))
        {
            position += 2; // Skip 2nd and 3rd quote chars
            return transition_from_GATHER_TRIPLE_QUOTE_STRING(ch);
        }

        if (is_quote(ch))
        {
            return transition_from_GATHER_STRING(ch);
        }

        return transition_from_GATHER_WORD(ch);
    }

    return Token(TokenType::EOS);
}
// COMMENT state: collects everything up to the next '\n' or the end of the
// input, whichever comes first. The newline is consumed but not stored, and
// the leading '#' was already consumed by the caller.
Token Tokenizer::transition_from_COMMENT()
{
    bool line_ended = false;
    while (!line_ended && position < input.length())
    {
        const char ch = input[position];
        ++position;

        line_ended = (ch == '\n');
        if (!line_ended)
        {
            token_string.push_back(ch);
        }
    }
    return Token(TokenType::COMMENT, token_string);
}
// START_DEFINITION state (entered after ':'): skips delimiter characters and
// hands over to the name-gathering state at the first other character.
//
// Throws a C string literal (const char*) when the first significant
// character is a quote, or when the input ends before any name is found.
Token Tokenizer::transition_from_START_DEFINITION()
{
    while (position < input.length())
    {
        const char ch = input[position];

        if (is_whitespace(ch))
        {
            ++position;
            continue;
        }

        if (ch == '"' || ch == '\'')
        {
            // The offending quote counts as consumed when the error is raised.
            ++position;
            throw "Definition cannot start with a quote";
        }

        // `ch` is left unconsumed so it becomes the first character of the name.
        return transition_from_GATHER_DEFINITION_NAME();
    }

    throw "Got EOS in START_DEFINITION";
}
// GATHER_DEFINITION_NAME state: accumulates characters into the token buffer
// until a delimiter character or the end of the input. The terminating
// delimiter is consumed but not stored.
Token Tokenizer::transition_from_GATHER_DEFINITION_NAME()
{
    while (position < input.length())
    {
        const char ch = input[position];
        ++position;

        if (!is_whitespace(ch))
        {
            token_string.push_back(ch);
            continue;
        }
        break;
    }
    return Token(TokenType::START_DEFINITION, token_string);
}
// GATHER_MODULE state (entered after '{'): accumulates the module name until
// a delimiter character, a '}', or the end of the input.
//
// A terminating delimiter is consumed; a terminating '}' is left in place so
// the next call from the START state sees it.
Token Tokenizer::transition_from_GATHER_MODULE()
{
    while (position < input.length())
    {
        const char ch = input[position];

        if (is_whitespace(ch))
        {
            ++position;
            break;
        }

        if (ch == '}')
        {
            break;
        }

        token_string.push_back(ch);
        ++position;
    }
    return Token(TokenType::START_MODULE, token_string);
}
// GATHER_TRIPLE_QUOTE_STRING state: accumulates characters until three
// consecutive `delim` characters are found, then consumes all three and
// returns the gathered text as a STRING token.
//
// Throws a C string literal (const char*) when the input ends first.
Token Tokenizer::transition_from_GATHER_TRIPLE_QUOTE_STRING(char delim)
{
    for (; position < input.length(); ++position)
    {
        const char ch = input[position];
        const bool closes_string = (ch == delim) && IsTripleQuote(position, ch);

        if (closes_string)
        {
            position += 3;
            return Token(TokenType::STRING, token_string);
        }

        token_string.push_back(ch);
    }

    throw "Unterminated triple quote string";
}
// GATHER_STRING state: accumulates characters until the next `delim`, which
// is consumed but not stored, and returns the text as a STRING token.
//
// Throws a C string literal (const char*) when the input ends first.
Token Tokenizer::transition_from_GATHER_STRING(char delim)
{
    while (position < input.length())
    {
        const char ch = input[position++];

        if (ch != delim)
        {
            token_string.push_back(ch);
            continue;
        }

        return Token(TokenType::STRING, token_string);
    }

    throw "Unterminated string";
}
// GATHER_WORD state: starts the token with `first_char` (already consumed by
// the caller) and appends characters until a delimiter character or the end
// of the input. The terminating delimiter is consumed but not stored.
//
// Only characters in the delimiter set end a word here; characters such as
// ';', '[' or ']' are appended like any other.
Token Tokenizer::transition_from_GATHER_WORD(char first_char)
{
    token_string.push_back(first_char);

    bool hit_delimiter = false;
    while (!hit_delimiter && position < input.length())
    {
        const char ch = input[position++];

        hit_delimiter = is_whitespace(ch);
        if (!hit_delimiter)
        {
            token_string.push_back(ch);
        }
    }

    return Token(TokenType::WORD, token_string);
}