-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.cpp
298 lines (263 loc) · 9.15 KB
/
lexer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#include <set>
#include "lexer.h"
#include "logger.h"
/**
 * Lexer class constructor
 * Stores the given input pointer; the other member functions read the text to scan from input->data.
 * NOTE(review): only the pointer is kept, so the Input presumably has to outlive the lexer - confirm with callers.
 * @param input pointer to the Input object to tokenize
 */
Lexer::Lexer(Input *input) : input(input) { }
/**
 * Reports whether the whole input has been consumed
 * @return true once the cursor has reached (or passed) the length of the input data, false while characters remain
 */
bool Lexer::is_at_end() {
    const bool has_remaining = current < input->data.length();
    return !has_remaining;
}
/**
 * Consumes the character under the cursor
 * Moves both the cursor and the column counter one position forward.
 * @return the character that was just consumed
 */
char Lexer::advance() {
    const char consumed = input->data[current];
    ++current;
    ++column;
    return consumed;
}
/**
 * Builds a token whose lexeme is the input substring [start, end)
 * The token is stamped with the current line; its column is the current column
 * moved back by the distance between the cursor and the start index.
 * @param token_type the type of token to create
 * @param start index of the first character of the lexeme
 * @param end index one past the last character of the lexeme
 * @return the created token
 */
Token Lexer::create_token(TokenType token_type, int start, int end) {
    const auto lexeme_length = end - start;
    const auto token_column = column - current + start;
    return Token(token_type, input->data.substr(start, lexeme_length), line, token_column);
}
/**
 * Builds a token from an explicitly supplied lexeme
 * The token is stamped with the current line; its column is derived from the
 * start index of the token currently being scanned.
 * @param token_type the type of token to create
 * @param lexeme the text to store in the token
 * @return the created token
 */
Token Lexer::create_token(TokenType token_type, std::string lexeme) {
    const auto token_column = column - current + start;
    return Token(token_type, lexeme, line, token_column);
}
/**
* Check if the next character in the input matches the expected character
* @param expected character to match
* @return true if the next character matches the expected character, false otherwise
*/
bool Lexer::match(char expected) {
if (peek() != expected)
return false;
advance();
return true;
}
/**
 * Chooses between two token types depending on the next character
 * When the next character equals the expected one it is consumed (via match)
 * and the "matched" type is selected; otherwise nothing is consumed and the
 * "unmatched" type is selected.
 * @param expected the character that is expected to follow
 * @param matched the token type to return if the character follows
 * @param unmatched the token type to return if it does not
 * @return the selected token type
 */
TokenType Lexer::either(char expected, TokenType matched, TokenType unmatched) {
    if (match(expected))
        return matched;
    return unmatched;
}
/**
* Returns the next character in the input without advancing the current position
* @return the next character in the input
*/
char Lexer::peek() {
return input->data.at(current);
}
/**
 * Infers if a semicolon token should be inserted
 * A semicolon is inferred only when the most recently emitted token is one of:
 * Identifier, Integer, String, Break, Return, RightParen or RightBracket.
 * The inferred token carries an empty lexeme.
 * @return the semicolon token if the last token allows it, otherwise nullopt
 *         (also nullopt when no token has been emitted yet)
 */
std::optional<Token> Lexer::infer_semicolon() {
    // Built once and reused: this runs for every newline in the input
    static const std::set<TokenType> valid = {
        Identifier, Integer, String, Break, Return, RightParen, RightBracket
    };

    if (tokens.empty())
        return std::nullopt;
    if (valid.count(tokens.back().type))
        return create_token(Semicolon, "");
    return std::nullopt;
}
/**
 * Handles a line break
 * First asks infer_semicolon() whether the line that just ended needs a
 * semicolon, then moves the position bookkeeping to the next line: the line
 * counter is incremented and the column is reset to 1.
 * @return the inferred semicolon token, or nullopt if none should be inserted
 */
std::optional<Token> Lexer::newline() {
    std::optional<Token> inferred = infer_semicolon();
    column = 1;
    ++line;
    return inferred;
}
/**
 * Scans the remaining digits of an integer literal
 * Consumes characters for as long as they are digits, then builds the token
 * from everything between the token start and the cursor.
 * @return a token of type Integer
 */
Token Lexer::number() {
    for (;;) {
        if (is_at_end() || !is_digit(peek()))
            break;
        advance();
    }
    return create_token(Integer);
}
/**
 * Scans the remaining characters of an identifier or keyword
 * Consumes alphanumeric characters, then looks the resulting lexeme up in
 * Keywords: a hit yields the token type stored there, a miss yields Identifier.
 * @return a token of the keyword's type, or of type Identifier
 */
Token Lexer::identifier() {
    for (;;) {
        if (is_at_end() || !is_alphanumeric(peek()))
            break;
        advance();
    }

    const auto word = input->data.substr(start, current - start);
    const auto keyword = Keywords.find(word);
    if (keyword != Keywords.end())
        return create_token(keyword->second);
    return create_token(Identifier);
}
/**
 * Builds a token covering the text scanned so far
 * The lexeme spans from the start of the current token up to the cursor.
 * @param token_type the type of token to create
 * @return the created token
 */
Token Lexer::create_token(TokenType token_type) {
    const auto lexeme_begin = start;
    const auto lexeme_end = current;
    return create_token(token_type, lexeme_begin, lexeme_end);
}
/**
 * Tests whether a character is a decimal digit ('0' through '9')
 * @param c the character to check
 * @return true if the character is a digit, false otherwise
 */
bool Lexer::is_digit(char c) {
    if (c < '0')
        return false;
    return c <= '9';
}
/**
 * Tests whether a character may appear in an identifier as a letter
 * Accepts 'a'-'z', 'A'-'Z' and the underscore.
 * @param c the character to check
 * @return true if the character is a letter or underscore, false otherwise
 */
bool Lexer::is_alpha(char c) {
    if (c == '_')
        return true;
    const bool lowercase = 'a' <= c && c <= 'z';
    const bool uppercase = 'A' <= c && c <= 'Z';
    return lowercase || uppercase;
}
/**
 * Tests whether a character is a letter, an underscore or a digit
 * @param c the character to check
 * @return true if is_alpha or is_digit accepts the character, false otherwise
 */
bool Lexer::is_alphanumeric(char c) {
    if (is_alpha(c))
        return true;
    return is_digit(c);
}
/**
 * Tokenizes the input from the current position to its end
 * Each iteration marks the start of a new token and tries to match one; matches
 * are appended to the tokens vector. Once the input is exhausted a closing
 * semicolon is appended if infer_semicolon() produces one, followed by an Eof token.
 * @param verbose when true, every matched token is printed to standard output
 * @return a copy of the tokens vector, ending with the Eof token
 */
std::vector<Token> Lexer::match_tokens(bool verbose) {
    while (!is_at_end()) {
        start = current;
        auto matched = match_token();
        if (!matched.has_value())
            continue;

        tokens.push_back(*matched);
        if (verbose)
            std::cout << *matched << std::endl;
    }

    if (auto closing_semicolon = infer_semicolon())
        tokens.push_back(*closing_semicolon);

    tokens.push_back(create_token(Eof, current, current));
    return tokens;
}
/**
 * Matches a single token from the input and returns it
 * Consumes at least one character, except when a semicolon is inferred in
 * front of a '}' - in that case the '}' is left in place and lexed by the next call.
 * @return the matched token, or nullopt if nothing was produced (whitespace, comments,
 *         newlines without an inferred semicolon, rejected or skipped characters)
 */
std::optional<Token> Lexer::match_token() {
    char c = advance();

    // Ugly approach to allow complex statements to occupy a single line:
    // a '}' that does not directly follow a semicolon first yields an inferred
    // (empty) Semicolon token.
    // TODO: Refactor this
    // The emptiness check matters: back() on an empty vector is undefined, and
    // a '}' with nothing before it has no statement to terminate.
    if (c == '}' && !tokens.empty() && tokens.back().type != Semicolon) {
        // Undo advance() completely - cursor and column - so the '}' is
        // re-scanned at its real column on the next call
        current--;
        column--;
        return create_token(Semicolon);
    }

    switch (c) {
        // Whitespace
        case ' ':
        case '\r':
        case '\t': return std::nullopt;

        // Newline (with semicolon inference)
        case '\n': return newline();

        // Single character
        case '{': return create_token(LeftBracket);
        case '}': return create_token(RightBracket);
        case '(': return create_token(LeftParen);
        case ')': return create_token(RightParen);
        case ';': return create_token(Semicolon);
        case ':': return create_token(Colon);
        case ',': return create_token(Comma);
        case '+': return create_token(Add);
        case '-': return create_token(Subtract);
        case '*': return create_token(Multiply);
        case '%': return create_token(Modulo);

        // Either
        case '!': return create_token(either('=', NotEqual, Not));
        case '=': return create_token(either('=', EqualEqual, Equal));
        case '>': return create_token(either('=', GreaterEqual, Greater));
        case '<': return create_token(either('=', LessEqual, Less));

        // Binary
        case '&':
            if (match('&'))
                return create_token(And);
            Logger::error(input, line, column, 1, "bitwise AND not supported");
            // Stop here: without this the error path would continue into the '|' case
            return std::nullopt;
        case '|':
            if (match('|'))
                return create_token(Or);
            Logger::error(input, line, column, 1, "bitwise OR not supported");
            // Stop here: without this the error path would continue into the '/' case
            return std::nullopt;

        // Comment
        case '/':
            if (match('/')) {
                // Skip to the end of the line; the '\n' itself is left for the next call
                while (!is_at_end() && peek() != '\n')
                    advance();
                return std::nullopt;
            }
            return create_token(Divide);

        // String literal
        case '"':
            while (!is_at_end() && peek() != '"') {
                if (match('\\')) {
                    // A backslash as the very last character has nothing to escape;
                    // leave the loop and let the unterminated-string check report it
                    if (is_at_end())
                        break;
                    if (match('b') || match('f') || match('n') || match('r') ||
                        match('t') || match('\\') || match('\"'))
                        continue;
                    Logger::error(input, line, column, 2,
                                  "bad string escape '\\" + std::string(1, peek()) + "'");
                }
                if (peek() == '\n')
                    Logger::error(input, line, column - current + start + 1, current - start + 1,
                                  "string contains newline");
                advance();
            }
            if (is_at_end()) {
                Logger::error(input, line, column - current + start + 1, current - start + 1, "unterminated string");
                // No closing quote to consume: keep the cursor at the end of the
                // input instead of stepping past it
                return create_token(String, start + 1, current);
            }
            // Consume the closing quote; the lexeme excludes both quotes
            advance();
            return create_token(String, start + 1, current - 1);

        // Non-trivial
        default:
            // Integer literal
            if (is_digit(c))
                return number();
            // Identifier
            if (is_alpha(c))
                return identifier();
            // Non-ascii character (any byte with the high bit set)
            if (static_cast<unsigned char>(c) > 0x7F)
                Logger::warning(input, line, column, 1, "skipping non-ascii character");
            // Unknown
            else
                Logger::warning(input, line, column, 1, "skipping unknown character '" + std::string(1, c) + "'");
            break;
    }

    // Couldn't match any character, ignore
    return std::nullopt;
}