CIRCT 20.0.0git
Loading...
Searching...
No Matches
FIRLexer.cpp
Go to the documentation of this file.
1//===- FIRLexer.cpp - .fir file lexer implementation ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This implements a .fir file lexer.
10//
11//===----------------------------------------------------------------------===//
12
13#include "FIRLexer.h"
14#include "mlir/IR/Diagnostics.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/ADT/StringSwitch.h"
17#include "llvm/Support/SourceMgr.h"
18#include "llvm/Support/raw_ostream.h"
19
20using namespace circt;
21using namespace firrtl;
22using llvm::SMLoc;
23using llvm::SMRange;
24using llvm::SourceMgr;
25
26#define isdigit(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
27#define isalpha(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
28
29//===----------------------------------------------------------------------===//
30// FIRToken
31//===----------------------------------------------------------------------===//
32
33SMLoc FIRToken::getLoc() const {
34 return SMLoc::getFromPointer(spelling.data());
35}
36
37SMLoc FIRToken::getEndLoc() const {
38 return SMLoc::getFromPointer(spelling.data() + spelling.size());
39}
40
41SMRange FIRToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
42
43/// Return true if this is one of the keyword token kinds (e.g. kw_wire).
44bool FIRToken::isKeyword() const {
45 switch (kind) {
46 default:
47 return false;
48#define TOK_KEYWORD(SPELLING) \
49 case kw_##SPELLING: \
50 return true;
51#include "FIRTokenKinds.def"
52 }
53}
54
55/// Given a token containing a string literal, return its value, including
56/// removing the quote characters and unescaping the contents of the string. The
57/// lexer has already verified that this token is valid.
58std::string FIRToken::getStringValue() const {
59 assert(getKind() == string);
61}
62
63std::string FIRToken::getStringValue(StringRef spelling) {
64 // Start by dropping the quotes.
65 StringRef bytes = spelling.drop_front().drop_back();
66
67 std::string result;
68 result.reserve(bytes.size());
69 for (size_t i = 0, e = bytes.size(); i != e;) {
70 auto c = bytes[i++];
71 if (c != '\\') {
72 result.push_back(c);
73 continue;
74 }
75
76 assert(i + 1 <= e && "invalid string should be caught by lexer");
77 auto c1 = bytes[i++];
78 switch (c1) {
79 case '\\':
80 case '"':
81 case '\'':
82 result.push_back(c1);
83 continue;
84 case 'b':
85 result.push_back('\b');
86 continue;
87 case 'n':
88 result.push_back('\n');
89 continue;
90 case 't':
91 result.push_back('\t');
92 continue;
93 case 'f':
94 result.push_back('\f');
95 continue;
96 case 'r':
97 result.push_back('\r');
98 continue;
99 // TODO: Handle the rest of the escapes (octal and unicode).
100 default:
101 break;
102 }
103
104 assert(i + 1 <= e && "invalid string should be caught by lexer");
105 auto c2 = bytes[i++];
106
107 assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");
108 result.push_back((llvm::hexDigitValue(c1) << 4) | llvm::hexDigitValue(c2));
109 }
110
111 return result;
112}
113
114/// Given a token containing a verbatim string, return its value, including
115/// removing the quote characters and unescaping the quotes of the string. The
116/// lexer has already verified that this token is valid.
118 assert(getKind() == verbatim_string);
120}
121
122std::string FIRToken::getVerbatimStringValue(StringRef spelling) {
123 // Start by dropping the quotes.
124 StringRef bytes = spelling.drop_front().drop_back();
125
126 std::string result;
127 result.reserve(bytes.size());
128 for (size_t i = 0, e = bytes.size(); i != e;) {
129 auto c = bytes[i++];
130 if (c != '\\') {
131 result.push_back(c);
132 continue;
133 }
134
135 assert(i + 1 <= e && "invalid string should be caught by lexer");
136 auto c1 = bytes[i++];
137 if (c1 != '\'') {
138 result.push_back(c);
139 }
140 result.push_back(c1);
141 }
142
143 return result;
144}
145
146//===----------------------------------------------------------------------===//
147// FIRLexer
148//===----------------------------------------------------------------------===//
149
150static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr,
151 MLIRContext *context) {
152 auto mainBuffer = sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID());
153 StringRef bufferName = mainBuffer->getBufferIdentifier();
154 if (bufferName.empty())
155 bufferName = "<unknown>";
156 return StringAttr::get(context, bufferName);
157}
158
159FIRLexer::FIRLexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
160 : sourceMgr(sourceMgr),
161 bufferNameIdentifier(getMainBufferNameIdentifier(sourceMgr, context)),
162 curBuffer(
163 sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID())->getBuffer()),
164 curPtr(curBuffer.begin()),
165 // Prime the first token.
166 curToken(lexTokenImpl()) {}
167
168/// Encode the specified source location information into a Location object
169/// for attachment to the IR or error reporting.
170Location FIRLexer::translateLocation(llvm::SMLoc loc) {
171 assert(loc.isValid());
172 unsigned mainFileID = sourceMgr.getMainFileID();
173 auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
174 return FileLineColLoc::get(bufferNameIdentifier, lineAndColumn.first,
175 lineAndColumn.second);
176}
177
178/// Emit an error message and return a FIRToken::error token.
179FIRToken FIRLexer::emitError(const char *loc, const Twine &message) {
180 mlir::emitError(translateLocation(SMLoc::getFromPointer(loc)), message);
181 return formToken(FIRToken::error, loc);
182}
183
184/// Return the indentation level of the specified token.
185std::optional<unsigned> FIRLexer::getIndentation(const FIRToken &tok) const {
186 // Count the number of horizontal whitespace characters before the token.
187 auto *bufStart = curBuffer.begin();
188
189 auto isHorizontalWS = [](char c) -> bool { return c == ' ' || c == '\t'; };
190 auto isVerticalWS = [](char c) -> bool {
191 return c == '\n' || c == '\r' || c == '\f' || c == '\v';
192 };
193
194 unsigned indent = 0;
195 const auto *ptr = (const char *)tok.getSpelling().data();
196 while (ptr != bufStart && isHorizontalWS(ptr[-1]))
197 --ptr, ++indent;
198
199 // If the character we stopped at isn't the start of line, then return none.
200 if (ptr != bufStart && !isVerticalWS(ptr[-1]))
201 return std::nullopt;
202
203 return indent;
204}
205
206//===----------------------------------------------------------------------===//
207// Lexer Implementation Methods
208//===----------------------------------------------------------------------===//
209
211 while (true) {
212 const char *tokStart = curPtr;
213 switch (*curPtr++) {
214 default:
215 // Handle identifiers.
216 if (llvm::isAlpha(curPtr[-1]))
217 return lexIdentifierOrKeyword(tokStart);
218
219 // Unknown character, emit an error.
220 return emitError(tokStart, "unexpected character");
221
222 case 0:
223 // This may either be a nul character in the source file or may be the EOF
224 // marker that llvm::MemoryBuffer guarantees will be there.
225 if (curPtr - 1 == curBuffer.end())
226 return formToken(FIRToken::eof, tokStart);
227
228 [[fallthrough]]; // Treat as whitespace.
229
230 case ' ':
231 case '\t':
232 case '\n':
233 case '\r':
234 // Handle whitespace.
235 continue;
236
237 case '`':
238 case '_':
239 // Handle identifiers.
240 return lexIdentifierOrKeyword(tokStart);
241
242 case '.':
243 return formToken(FIRToken::period, tokStart);
244 case ',':
245 return formToken(FIRToken::comma, tokStart);
246 case ':':
247 return formToken(FIRToken::colon, tokStart);
248 case '(':
249 return formToken(FIRToken::l_paren, tokStart);
250 case ')':
251 return formToken(FIRToken::r_paren, tokStart);
252 case '{':
253 if (*curPtr == '|')
254 return ++curPtr, formToken(FIRToken::l_brace_bar, tokStart);
255 return formToken(FIRToken::l_brace, tokStart);
256 case '}':
257 return formToken(FIRToken::r_brace, tokStart);
258 case '[':
259 return formToken(FIRToken::l_square, tokStart);
260 case ']':
261 return formToken(FIRToken::r_square, tokStart);
262 case '<':
263 if (*curPtr == '=')
264 return ++curPtr, formToken(FIRToken::less_equal, tokStart);
265 return formToken(FIRToken::less, tokStart);
266 case '>':
267 return formToken(FIRToken::greater, tokStart);
268 case '=':
269 if (*curPtr == '>')
270 return ++curPtr, formToken(FIRToken::equal_greater, tokStart);
271 return formToken(FIRToken::equal, tokStart);
272 case '?':
273 return formToken(FIRToken::question, tokStart);
274 case '@':
275 if (*curPtr == '[')
276 return lexFileInfo(tokStart);
277 // Unknown character, emit an error.
278 return emitError(tokStart, "unexpected character");
279 case '%':
280 if (*curPtr == '[')
281 return lexInlineAnnotation(tokStart);
282 return emitError(tokStart, "unexpected character following '%'");
283 case '|':
284 if (*curPtr == '}')
285 return ++curPtr, formToken(FIRToken::r_brace_bar, tokStart);
286 // Unknown character, emit an error.
287 return emitError(tokStart, "unexpected character");
288
289 case ';':
290 skipComment();
291 continue;
292
293 case '"':
294 return lexString(tokStart, /*isVerbatim=*/false);
295 case '\'':
296 return lexString(tokStart, /*isVerbatim=*/true);
297
298 case '-':
299 case '+':
300 case '0':
301 case '1':
302 case '2':
303 case '3':
304 case '4':
305 case '5':
306 case '6':
307 case '7':
308 case '8':
309 case '9':
310 return lexNumber(tokStart);
311 }
312 }
313}
314
315/// Lex a file info specifier.
316///
317/// FileInfo ::= '@[' ('\]'|.)* ']'
318///
319FIRToken FIRLexer::lexFileInfo(const char *tokStart) {
320 while (1) {
321 switch (*curPtr++) {
322 case ']': // This is the end of the fileinfo literal.
323 return formToken(FIRToken::fileinfo, tokStart);
324 case '\\':
325 // Ignore escaped ']'
326 if (*curPtr == ']')
327 ++curPtr;
328 break;
329 case 0:
330 // This could be the end of file in the middle of the fileinfo. If so
331 // emit an error.
332 if (curPtr - 1 != curBuffer.end())
333 break;
334 [[fallthrough]];
335 case '\n': // Vertical whitespace isn't allowed in a fileinfo.
336 case '\v':
337 case '\f':
338 return emitError(tokStart, "unterminated file info specifier");
339 default:
340 // Skip over other characters.
341 break;
342 }
343 }
344}
345
346/// Lex a non-standard inline Annotation file.
347///
348/// InlineAnnotation ::= '%[' (.)* ']'
349///
351 size_t depth = 0;
352 bool stringMode = false;
353 while (1) {
354 switch (*curPtr++) {
355 case '"':
356 stringMode = !stringMode;
357 break;
358 case ']':
359 if (stringMode)
360 break;
361 if (depth == 1)
362 return formToken(FIRToken::inlineannotation, tokStart);
363 --depth;
364 break;
365 case '[':
366 if (stringMode)
367 break;
368 ++depth;
369 break;
370 case '\\':
371 ++curPtr;
372 [[fallthrough]];
373 case 0:
374 if (curPtr - 1 != curBuffer.end())
375 break;
376 return emitError(tokStart, "unterminated inline annotation");
377 default:
378 break;
379 }
380 }
381}
382
383/// Lex an identifier or keyword that starts with a letter.
384///
385/// LegalStartChar ::= [a-zA-Z_]
386/// LegalIdChar ::= LegalStartChar | [0-9] | '$'
387///
388/// Id ::= LegalStartChar (LegalIdChar)*
389/// LiteralId ::= [a-zA-Z0-9$_]+
390///
392 // Remember that this is a literalID
393 bool isLiteralId = *tokStart == '`';
394
395 // Match the rest of the identifier regex: [0-9a-zA-Z_$-]*
396 while (llvm::isAlpha(*curPtr) || llvm::isDigit(*curPtr) || *curPtr == '_' ||
397 *curPtr == '$' || *curPtr == '-')
398 ++curPtr;
399
400 // Consume the trailing '`' in a literal identifier.
401 if (isLiteralId) {
402 if (*curPtr != '`')
403 return emitError(tokStart, "unterminated literal identifier");
404 ++curPtr;
405 }
406
407 StringRef spelling(tokStart, curPtr - tokStart);
408
409 // Check to see if this is a 'primop', which is an identifier juxtaposed with
410 // a '(' character.
411 if (*curPtr == '(') {
412 FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
413#define TOK_LPKEYWORD(SPELLING) .Case(#SPELLING, FIRToken::lp_##SPELLING)
414#include "FIRTokenKinds.def"
415 .Default(FIRToken::identifier);
416 if (kind != FIRToken::identifier) {
417 ++curPtr;
418 return formToken(kind, tokStart);
419 }
420 }
421
422 // See if the identifier is a keyword. By default, it is an identifier.
423 FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
424#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, FIRToken::kw_##SPELLING)
425#include "FIRTokenKinds.def"
426 .Default(FIRToken::identifier);
427
428 // If this has the backticks of a literal identifier and it fell through the
429 // above switch, indicating that it was not found to e a keyword, then change
430 // its kind from identifier to literal identifier.
431 if (isLiteralId && kind == FIRToken::identifier)
432 kind = FIRToken::literal_identifier;
433
434 return FIRToken(kind, spelling);
435}
436
437/// Skip a comment line, starting with a ';' and going to end of line.
439 while (true) {
440 switch (*curPtr++) {
441 case '\n':
442 case '\r':
443 // Newline is end of comment.
444 return;
445 case 0:
446 // If this is the end of the buffer, end the comment.
447 if (curPtr - 1 == curBuffer.end()) {
448 --curPtr;
449 return;
450 }
451 [[fallthrough]];
452 default:
453 // Skip over other characters.
454 break;
455 }
456 }
457}
458
459/// StringLit ::= '"' UnquotedString? '"'
460/// VerbatimStringLit ::= '\'' UnquotedString? '\''
461/// UnquotedString ::= ( '\\\'' | '\\"' | ~[\r\n] )+?
462///
463FIRToken FIRLexer::lexString(const char *tokStart, bool isVerbatim) {
464 while (1) {
465 switch (*curPtr++) {
466 case '"': // This is the end of the string literal.
467 if (isVerbatim)
468 break;
469 return formToken(FIRToken::string, tokStart);
470 case '\'': // This is the end of the raw string.
471 if (!isVerbatim)
472 break;
473 return formToken(FIRToken::verbatim_string, tokStart);
474 case '\\':
475 // Ignore escaped '\'' or '"'
476 if (*curPtr == '\'' || *curPtr == '"' || *curPtr == '\\')
477 ++curPtr;
478 else if (*curPtr == 'u' || *curPtr == 'U')
479 return emitError(tokStart, "unicode escape not supported in string");
480 break;
481 case 0:
482 // This could be the end of file in the middle of the string. If so
483 // emit an error.
484 if (curPtr - 1 != curBuffer.end())
485 break;
486 [[fallthrough]];
487 case '\n': // Vertical whitespace isn't allowed in a string.
488 case '\r':
489 case '\v':
490 case '\f':
491 return emitError(tokStart, "unterminated string");
492 default:
493 if (curPtr[-1] & ~0x7F)
494 return emitError(tokStart, "string characters must be 7-bit ASCII");
495 // Skip over other characters.
496 break;
497 }
498 }
499}
500
501/// Lex a number literal.
502///
503/// UnsignedInt ::= '0' | PosInt
504/// PosInt ::= [1-9] ([0-9])*
505/// DoubleLit ::=
506/// ( '+' | '-' )? Digit+ '.' Digit+ ( 'E' ( '+' | '-' )? Digit+ )?
507/// TripleLit ::=
508/// Digit+ '.' Digit+ '.' Digit+
509/// Radix-specified Integer ::=
510/// ( '-' )? '0' ( 'b' | 'o' | 'd' | 'h' ) LegalDigit*
511///
512FIRToken FIRLexer::lexNumber(const char *tokStart) {
513 assert(llvm::isDigit(curPtr[-1]) || curPtr[-1] == '+' || curPtr[-1] == '-');
514
515 // There needs to be at least one digit.
516 if (!llvm::isDigit(*curPtr) && !llvm::isDigit(curPtr[-1]))
517 return emitError(tokStart, "unexpected character after sign");
518
519 // If we encounter a "b", "o", "d", or "h", this is a radix-specified integer
520 // literal. This is only supported for FIRRTL 2.4.0 or later. This is always
521 // lexed, but rejected during parsing if the version is too old.
522 const char *oldPtr = curPtr;
523 if (curPtr[-1] == '-' && *curPtr == '0')
524 ++curPtr;
525 if (curPtr[-1] == '0') {
526 switch (*curPtr) {
527 case 'b':
528 ++curPtr;
529 while (*curPtr >= '0' && *curPtr <= '1')
530 ++curPtr;
531 return formToken(FIRToken::radix_specified_integer, tokStart);
532 case 'o':
533 ++curPtr;
534 while (*curPtr >= '0' && *curPtr <= '7')
535 ++curPtr;
536 return formToken(FIRToken::radix_specified_integer, tokStart);
537 case 'd':
538 ++curPtr;
539 while (llvm::isDigit(*curPtr))
540 ++curPtr;
541 return formToken(FIRToken::radix_specified_integer, tokStart);
542 case 'h':
543 ++curPtr;
544 while (llvm::isHexDigit(*curPtr))
545 ++curPtr;
546 return formToken(FIRToken::radix_specified_integer, tokStart);
547 default:
548 curPtr = oldPtr;
549 break;
550 }
551 }
552
553 while (llvm::isDigit(*curPtr))
554 ++curPtr;
555
556 // If we encounter a '.' followed by a digit, then this is a floating point
557 // literal, otherwise this is an integer or negative integer.
558 if (*curPtr != '.' || !llvm::isDigit(curPtr[1])) {
559 if (*tokStart == '-' || *tokStart == '+')
560 return formToken(FIRToken::signed_integer, tokStart);
561 return formToken(FIRToken::integer, tokStart);
562 }
563
564 // Lex a floating point literal.
565 curPtr += 2;
566 while (llvm::isDigit(*curPtr))
567 ++curPtr;
568
569 bool hasE = false;
570 if (*curPtr == 'E') {
571 hasE = true;
572 ++curPtr;
573 if (*curPtr == '+' || *curPtr == '-')
574 ++curPtr;
575 while (llvm::isDigit(*curPtr))
576 ++curPtr;
577 }
578
579 // If we encounter a '.' followed by a digit, again, and there was no
580 // exponent, then this is a version literal. Otherwise it is a floating point
581 // literal.
582 if (*curPtr != '.' || !llvm::isDigit(curPtr[1]) || hasE)
583 return formToken(FIRToken::floatingpoint, tokStart);
584
585 // Lex a version literal.
586 curPtr += 2;
587 while (llvm::isDigit(*curPtr))
588 ++curPtr;
589 return formToken(FIRToken::version, tokStart);
590}
assert(baseType &&"element must be base type")
static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
Definition FIRLexer.cpp:150
FIRToken lexFileInfo(const char *tokStart)
Lex a file info specifier.
Definition FIRLexer.cpp:319
FIRToken lexIdentifierOrKeyword(const char *tokStart)
Lex an identifier or keyword that starts with a letter.
Definition FIRLexer.cpp:391
const llvm::SourceMgr & sourceMgr
Definition FIRLexer.h:138
FIRToken formToken(FIRToken::Kind kind, const char *tokStart)
Definition FIRLexer.h:124
FIRToken lexNumber(const char *tokStart)
Lex a number literal.
Definition FIRLexer.cpp:512
FIRToken lexString(const char *tokStart, bool isVerbatim)
StringLit ::= '"' UnquotedString? '"' VerbatimStringLit ::= '\'' UnquotedString? '\'' UnquotedString ...
Definition FIRLexer.cpp:463
const char * curPtr
Definition FIRLexer.h:142
std::optional< unsigned > getIndentation(const FIRToken &tok) const
Return the indentation level of the specified token or None if this token is preceded by another toke...
Definition FIRLexer.cpp:185
void skipComment()
Skip a comment line, starting with a ';' and going to end of line.
Definition FIRLexer.cpp:438
FIRToken emitError(const char *loc, const Twine &message)
Emit an error message and return a FIRToken::error token.
Definition FIRLexer.cpp:179
FIRLexer(const llvm::SourceMgr &sourceMgr, mlir::MLIRContext *context)
mlir::Location translateLocation(llvm::SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition FIRLexer.cpp:170
FIRToken lexInlineAnnotation(const char *tokStart)
Lex a non-standard inline Annotation file.
Definition FIRLexer.cpp:350
const mlir::StringAttr bufferNameIdentifier
Definition FIRLexer.h:139
This represents a specific token for .fir files.
Definition FIRLexer.h:29
std::string getVerbatimStringValue() const
Given a token containing a verbatim string, return its value, including removing the quote characters...
Definition FIRLexer.cpp:117
StringRef getSpelling() const
Definition FIRLexer.h:44
StringRef spelling
A reference to the entire token contents; this is always a pointer into a memory buffer owned by the ...
Definition FIRLexer.h:94
llvm::SMRange getLocRange() const
Definition FIRLexer.cpp:41
std::string getStringValue() const
Given a token containing a string literal, return its value, including removing the quote characters ...
Definition FIRLexer.cpp:58
Kind kind
Discriminator that indicates the sort of token this is.
Definition FIRLexer.h:90
llvm::SMLoc getEndLoc() const
Definition FIRLexer.cpp:37
Kind getKind() const
Definition FIRLexer.h:47
llvm::SMLoc getLoc() const
Definition FIRLexer.cpp:33
bool isKeyword() const
Return true if this is one of the keyword token kinds (e.g. kw_wire).
Definition FIRLexer.cpp:44
The InstanceGraph op interface, see InstanceGraphInterface.td for more details.