CIRCT 23.0.0git
Loading...
Searching...
No Matches
FIRLexer.cpp
Go to the documentation of this file.
1//===- FIRLexer.cpp - .fir file lexer implementation ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This implements a .fir file lexer.
10//
11//===----------------------------------------------------------------------===//
12
13#include "FIRLexer.h"
14#include "mlir/IR/Diagnostics.h"
15#include "llvm/ADT/StringExtras.h"
16#include "llvm/ADT/StringSwitch.h"
17#include "llvm/Support/SourceMgr.h"
18#include "llvm/Support/raw_ostream.h"
19
20using namespace circt;
21using namespace firrtl;
22using llvm::SMLoc;
23using llvm::SMRange;
24using llvm::SourceMgr;
25
26#define isdigit(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
27#define isalpha(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
28
29//===----------------------------------------------------------------------===//
30// FIRToken
31//===----------------------------------------------------------------------===//
32
33SMLoc FIRToken::getLoc() const {
34 return SMLoc::getFromPointer(spelling.data());
35}
36
37SMLoc FIRToken::getEndLoc() const {
38 return SMLoc::getFromPointer(spelling.data() + spelling.size());
39}
40
41SMRange FIRToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
42
43/// Return true if this is one of the keyword token kinds (e.g. kw_wire).
44bool FIRToken::isKeyword() const {
45 switch (kind) {
46 default:
47 return false;
48#define TOK_KEYWORD(SPELLING) \
49 case kw_##SPELLING: \
50 return true;
51#include "FIRTokenKinds.def"
52 }
53}
54
55/// Given a token containing a string literal, return its value, including
56/// removing the quote characters and unescaping the contents of the string. The
57/// lexer has already verified that this token is valid.
58std::string FIRToken::getStringValue() const {
59 assert(getKind() == string);
61}
62
63std::string FIRToken::getStringValue(StringRef spelling) {
64 // Start by dropping the quotes.
65 StringRef bytes = spelling.drop_front().drop_back();
66
67 std::string result;
68 result.reserve(bytes.size());
69 for (size_t i = 0, e = bytes.size(); i != e;) {
70 auto c = bytes[i++];
71 if (c != '\\') {
72 result.push_back(c);
73 continue;
74 }
75
76 assert(i + 1 <= e && "invalid string should be caught by lexer");
77 auto c1 = bytes[i++];
78 switch (c1) {
79 case '\\':
80 case '"':
81 case '\'':
82 result.push_back(c1);
83 continue;
84 case 'b':
85 result.push_back('\b');
86 continue;
87 case 'n':
88 result.push_back('\n');
89 continue;
90 case 't':
91 result.push_back('\t');
92 continue;
93 case 'f':
94 result.push_back('\f');
95 continue;
96 case 'r':
97 result.push_back('\r');
98 continue;
99 // TODO: Handle the rest of the escapes (octal and unicode).
100 default:
101 break;
102 }
103
104 assert(i + 1 <= e && "invalid string should be caught by lexer");
105 auto c2 = bytes[i++];
106
107 assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");
108 result.push_back((llvm::hexDigitValue(c1) << 4) | llvm::hexDigitValue(c2));
109 }
110
111 return result;
112}
113
114/// Given a token containing a verbatim string, return its value, including
115/// removing the quote characters and unescaping the quotes of the string. The
116/// lexer has already verified that this token is valid.
118 assert(getKind() == verbatim_string);
120}
121
122std::string FIRToken::getVerbatimStringValue(StringRef spelling) {
123 // Start by dropping the quotes.
124 StringRef bytes = spelling.drop_front().drop_back();
125
126 std::string result;
127 result.reserve(bytes.size());
128 for (size_t i = 0, e = bytes.size(); i != e;) {
129 auto c = bytes[i++];
130 if (c != '\\') {
131 result.push_back(c);
132 continue;
133 }
134
135 assert(i + 1 <= e && "invalid string should be caught by lexer");
136 auto c1 = bytes[i++];
137 if (c1 != '\'') {
138 result.push_back(c);
139 }
140 result.push_back(c1);
141 }
142
143 return result;
144}
145
146//===----------------------------------------------------------------------===//
147// FIRLexer
148//===----------------------------------------------------------------------===//
149
150static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr,
151 MLIRContext *context) {
152 auto mainBuffer = sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID());
153 StringRef bufferName = mainBuffer->getBufferIdentifier();
154 if (bufferName.empty())
155 bufferName = "<unknown>";
156 return StringAttr::get(context, bufferName);
157}
158
159FIRLexer::FIRLexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
160 : sourceMgr(sourceMgr),
161 bufferNameIdentifier(getMainBufferNameIdentifier(sourceMgr, context)),
162 curBuffer(
163 sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID())->getBuffer()),
164 curPtr(curBuffer.begin()),
165 // Prime the first token.
166 curToken(lexTokenImpl()) {}
167
168/// Encode the specified source location information into a Location object
169/// for attachment to the IR or error reporting.
170Location FIRLexer::translateLocation(llvm::SMLoc loc) {
171 assert(loc.isValid());
172 unsigned mainFileID = sourceMgr.getMainFileID();
173 auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
174 return FileLineColLoc::get(bufferNameIdentifier, lineAndColumn.first,
175 lineAndColumn.second);
176}
177
178/// Emit an error message and return a FIRToken::error token.
179FIRToken FIRLexer::emitError(const char *loc, const Twine &message) {
180 mlir::emitError(translateLocation(SMLoc::getFromPointer(loc)), message);
181 return formToken(FIRToken::error, loc);
182}
183
184/// Return the indentation level of the specified token.
185std::optional<unsigned> FIRLexer::getIndentation(const FIRToken &tok) const {
186 // Count the number of horizontal whitespace characters before the token.
187 auto *bufStart = curBuffer.begin();
188
189 auto isHorizontalWS = [](char c) -> bool { return c == ' ' || c == '\t'; };
190 auto isVerticalWS = [](char c) -> bool {
191 return c == '\n' || c == '\r' || c == '\f' || c == '\v';
192 };
193
194 unsigned indent = 0;
195 const auto *ptr = (const char *)tok.getSpelling().data();
196 while (ptr != bufStart && isHorizontalWS(ptr[-1]))
197 --ptr, ++indent;
198
199 // If the character we stopped at isn't the start of line, then return none.
200 if (ptr != bufStart && !isVerticalWS(ptr[-1]))
201 return std::nullopt;
202
203 return indent;
204}
205
206//===----------------------------------------------------------------------===//
207// Lexer Implementation Methods
208//===----------------------------------------------------------------------===//
209
211 while (true) {
212 const char *tokStart = curPtr;
213 switch (*curPtr++) {
214 default:
215 // Handle identifiers.
216 if (llvm::isAlpha(curPtr[-1]))
217 return lexIdentifierOrKeyword(tokStart);
218
219 // Unknown character, emit an error.
220 return emitError(tokStart, "unexpected character");
221
222 case 0:
223 // This may either be a nul character in the source file or may be the EOF
224 // marker that llvm::MemoryBuffer guarantees will be there.
225 if (curPtr - 1 == curBuffer.end())
226 return formToken(FIRToken::eof, tokStart);
227
228 [[fallthrough]]; // Treat as whitespace.
229
230 case ' ':
231 case '\t':
232 case '\n':
233 case '\r':
234 // Handle whitespace.
235 continue;
236
237 case '`':
238 case '_':
239 // Handle identifiers.
240 return lexIdentifierOrKeyword(tokStart);
241
242 case '.':
243 return formToken(FIRToken::period, tokStart);
244 case ',':
245 return formToken(FIRToken::comma, tokStart);
246 case ':':
247 return formToken(FIRToken::colon, tokStart);
248 case '(':
249 return formToken(FIRToken::l_paren, tokStart);
250 case ')':
251 return formToken(FIRToken::r_paren, tokStart);
252 case '{':
253 if (*curPtr == '|')
254 return ++curPtr, formToken(FIRToken::l_brace_bar, tokStart);
255 return formToken(FIRToken::l_brace, tokStart);
256 case '}':
257 return formToken(FIRToken::r_brace, tokStart);
258 case '[':
259 return formToken(FIRToken::l_square, tokStart);
260 case ']':
261 return formToken(FIRToken::r_square, tokStart);
262 case '<':
263 if (*curPtr == '=')
264 return ++curPtr, formToken(FIRToken::less_equal, tokStart);
265 return formToken(FIRToken::less, tokStart);
266 case '>':
267 return formToken(FIRToken::greater, tokStart);
268 case '=':
269 if (*curPtr == '>')
270 return ++curPtr, formToken(FIRToken::equal_greater, tokStart);
271 return formToken(FIRToken::equal, tokStart);
272 case '?':
273 return formToken(FIRToken::question, tokStart);
274 case '@':
275 if (*curPtr == '[')
276 return lexFileInfo(tokStart);
277 // Unknown character, emit an error.
278 return emitError(tokStart, "unexpected character");
279 case '%':
280 if (*curPtr == '[')
281 return lexInlineAnnotation(tokStart);
282 return emitError(tokStart, "unexpected character following '%'");
283 case '|':
284 if (*curPtr == '}')
285 return ++curPtr, formToken(FIRToken::r_brace_bar, tokStart);
286 // Unknown character, emit an error.
287 return emitError(tokStart, "unexpected character");
288
289 case ';':
290 skipComment();
291 continue;
292
293 case '"':
294 return lexString(tokStart, /*isVerbatim=*/false);
295 case '\'':
296 return lexString(tokStart, /*isVerbatim=*/true);
297
298 case '-':
299 case '+':
300 case '0':
301 case '1':
302 case '2':
303 case '3':
304 case '4':
305 case '5':
306 case '6':
307 case '7':
308 case '8':
309 case '9':
310 return lexNumber(tokStart);
311 }
312 }
313}
314
315/// Lex a file info specifier.
316///
317/// FileInfo ::= '@[' ('\]'|.)* ']'
318///
319FIRToken FIRLexer::lexFileInfo(const char *tokStart) {
320 while (1) {
321 switch (*curPtr++) {
322 case ']': // This is the end of the fileinfo literal.
323 return formToken(FIRToken::fileinfo, tokStart);
324 case '\\':
325 // Ignore escaped ']'
326 if (*curPtr == ']')
327 ++curPtr;
328 break;
329 case 0:
330 // This could be the end of file in the middle of the fileinfo. If so
331 // emit an error.
332 if (curPtr - 1 != curBuffer.end())
333 break;
334 [[fallthrough]];
335 case '\n': // Vertical whitespace isn't allowed in a fileinfo.
336 case '\v':
337 case '\f':
338 return emitError(tokStart, "unterminated file info specifier");
339 default:
340 // Skip over other characters.
341 break;
342 }
343 }
344}
345
346/// Lex a non-standard inline Annotation file.
347///
348/// InlineAnnotation ::= '%[' (.)* ']'
349///
351 size_t depth = 0;
352 bool stringMode = false;
353 while (1) {
354 switch (*curPtr++) {
355 case '"':
356 stringMode = !stringMode;
357 break;
358 case ']':
359 if (stringMode)
360 break;
361 if (depth == 1)
362 return formToken(FIRToken::inlineannotation, tokStart);
363 --depth;
364 break;
365 case '[':
366 if (stringMode)
367 break;
368 ++depth;
369 break;
370 case '\\':
371 ++curPtr;
372 [[fallthrough]];
373 case 0:
374 if (curPtr - 1 != curBuffer.end())
375 break;
376 return emitError(tokStart, "unterminated inline annotation");
377 default:
378 break;
379 }
380 }
381}
382
383/// Lex an identifier or keyword that starts with a letter.
384///
385/// LegalStartChar ::= [a-zA-Z_]
386/// LegalIdChar ::= LegalStartChar | [0-9] | '$'
387///
388/// Id ::= LegalStartChar (LegalIdChar)*
389/// LiteralId ::= [a-zA-Z0-9$_]+
390///
392 // Remember that this is a literalID
393 bool isLiteralId = *tokStart == '`';
394
395 // Match the rest of the identifier regex: [0-9a-zA-Z_$-]*
396 while (llvm::isAlpha(*curPtr) || llvm::isDigit(*curPtr) || *curPtr == '_' ||
397 *curPtr == '$' || *curPtr == '-')
398 ++curPtr;
399
400 // Consume the trailing '`' in a literal identifier.
401 if (isLiteralId) {
402 if (*curPtr != '`')
403 return emitError(tokStart, "unterminated literal identifier");
404 ++curPtr;
405 }
406
407 StringRef spelling(tokStart, curPtr - tokStart);
408
409 // Check to see if this is a 'primop', which is an identifier juxtaposed with
410 // a '(' character.
411 if (*curPtr == '(') {
412 FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
413#define TOK_LPKEYWORD(SPELLING) .Case(#SPELLING, FIRToken::lp_##SPELLING)
414#include "FIRTokenKinds.def"
415 .Default(FIRToken::identifier);
416 if (kind != FIRToken::identifier) {
417 ++curPtr;
418 return formToken(kind, tokStart);
419 }
420 }
421
422 // Check to see if this is a keyword followed by '<' character.
423 if (*curPtr == '<') {
424 FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
425#define TOK_LESSKEYWORD(SPELLING) .Case(#SPELLING, FIRToken::langle_##SPELLING)
426#include "FIRTokenKinds.def"
427 .Default(FIRToken::identifier);
428#undef TOK_LESSKEYWORD
429 if (kind != FIRToken::identifier) {
430 ++curPtr;
431 return formToken(kind, tokStart);
432 }
433 }
434
435 // See if the identifier is a keyword. By default, it is an identifier.
436 FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
437#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, FIRToken::kw_##SPELLING)
438#include "FIRTokenKinds.def"
439 .Default(FIRToken::identifier);
440
441 // If this has the backticks of a literal identifier and it fell through the
442 // above switch, indicating that it was not found to e a keyword, then change
443 // its kind from identifier to literal identifier.
444 if (isLiteralId && kind == FIRToken::identifier)
445 kind = FIRToken::literal_identifier;
446
447 return FIRToken(kind, spelling);
448}
449
450/// Skip a comment line, starting with a ';' and going to end of line.
452 while (true) {
453 switch (*curPtr++) {
454 case '\n':
455 case '\r':
456 // Newline is end of comment.
457 return;
458 case 0:
459 // If this is the end of the buffer, end the comment.
460 if (curPtr - 1 == curBuffer.end()) {
461 --curPtr;
462 return;
463 }
464 [[fallthrough]];
465 default:
466 // Skip over other characters.
467 break;
468 }
469 }
470}
471
472/// StringLit ::= '"' UnquotedString? '"'
473/// VerbatimStringLit ::= '\'' UnquotedString? '\''
474/// UnquotedString ::= ( '\\\'' | '\\"' | ~[\r\n] )+?
475///
476FIRToken FIRLexer::lexString(const char *tokStart, bool isVerbatim) {
477 while (1) {
478 switch (*curPtr++) {
479 case '"': // This is the end of the string literal.
480 if (isVerbatim)
481 break;
482 return formToken(FIRToken::string, tokStart);
483 case '\'': // This is the end of the raw string.
484 if (!isVerbatim)
485 break;
486 return formToken(FIRToken::verbatim_string, tokStart);
487 case '\\':
488 // Ignore escaped '\'' or '"'
489 if (*curPtr == '\'' || *curPtr == '"' || *curPtr == '\\')
490 ++curPtr;
491 else if (*curPtr == 'u' || *curPtr == 'U')
492 return emitError(tokStart, "unicode escape not supported in string");
493 break;
494 case 0:
495 // This could be the end of file in the middle of the string. If so
496 // emit an error.
497 if (curPtr - 1 != curBuffer.end())
498 break;
499 [[fallthrough]];
500 case '\n': // Vertical whitespace isn't allowed in a string.
501 case '\r':
502 case '\v':
503 case '\f':
504 return emitError(tokStart, "unterminated string");
505 default:
506 if (curPtr[-1] & ~0x7F)
507 return emitError(tokStart, "string characters must be 7-bit ASCII");
508 // Skip over other characters.
509 break;
510 }
511 }
512}
513
514/// Lex a number literal.
515///
516/// UnsignedInt ::= '0' | PosInt
517/// PosInt ::= [1-9] ([0-9])*
518/// DoubleLit ::=
519/// ( '+' | '-' )? Digit+ '.' Digit+ ( 'E' ( '+' | '-' )? Digit+ )?
520/// TripleLit ::=
521/// Digit+ '.' Digit+ '.' Digit+
522/// Radix-specified Integer ::=
523/// ( '-' )? '0' ( 'b' | 'o' | 'd' | 'h' ) LegalDigit*
524///
525FIRToken FIRLexer::lexNumber(const char *tokStart) {
526 assert(llvm::isDigit(curPtr[-1]) || curPtr[-1] == '+' || curPtr[-1] == '-');
527
528 // There needs to be at least one digit.
529 if (!llvm::isDigit(*curPtr) && !llvm::isDigit(curPtr[-1]))
530 return emitError(tokStart, "unexpected character after sign");
531
532 // If we encounter a "b", "o", "d", or "h", this is a radix-specified integer
533 // literal. This is only supported for FIRRTL 2.4.0 or later. This is always
534 // lexed, but rejected during parsing if the version is too old.
535 const char *oldPtr = curPtr;
536 if (curPtr[-1] == '-' && *curPtr == '0')
537 ++curPtr;
538 if (curPtr[-1] == '0') {
539 switch (*curPtr) {
540 case 'b':
541 ++curPtr;
542 while (*curPtr >= '0' && *curPtr <= '1')
543 ++curPtr;
544 return formToken(FIRToken::radix_specified_integer, tokStart);
545 case 'o':
546 ++curPtr;
547 while (*curPtr >= '0' && *curPtr <= '7')
548 ++curPtr;
549 return formToken(FIRToken::radix_specified_integer, tokStart);
550 case 'd':
551 ++curPtr;
552 while (llvm::isDigit(*curPtr))
553 ++curPtr;
554 return formToken(FIRToken::radix_specified_integer, tokStart);
555 case 'h':
556 ++curPtr;
557 while (llvm::isHexDigit(*curPtr))
558 ++curPtr;
559 return formToken(FIRToken::radix_specified_integer, tokStart);
560 default:
561 curPtr = oldPtr;
562 break;
563 }
564 }
565
566 while (llvm::isDigit(*curPtr))
567 ++curPtr;
568
569 // If we encounter a '.' followed by a digit, then this is a floating point
570 // literal, otherwise this is an integer or negative integer.
571 if (*curPtr != '.' || !llvm::isDigit(curPtr[1])) {
572 if (*tokStart == '-' || *tokStart == '+')
573 return formToken(FIRToken::signed_integer, tokStart);
574 return formToken(FIRToken::integer, tokStart);
575 }
576
577 // Lex a floating point literal.
578 curPtr += 2;
579 while (llvm::isDigit(*curPtr))
580 ++curPtr;
581
582 bool hasE = false;
583 if (*curPtr == 'E') {
584 hasE = true;
585 ++curPtr;
586 if (*curPtr == '+' || *curPtr == '-')
587 ++curPtr;
588 while (llvm::isDigit(*curPtr))
589 ++curPtr;
590 }
591
592 // If we encounter a '.' followed by a digit, again, and there was no
593 // exponent, then this is a version literal. Otherwise it is a floating point
594 // literal.
595 if (*curPtr != '.' || !llvm::isDigit(curPtr[1]) || hasE)
596 return formToken(FIRToken::floatingpoint, tokStart);
597
598 // Lex a version literal.
599 curPtr += 2;
600 while (llvm::isDigit(*curPtr))
601 ++curPtr;
602 return formToken(FIRToken::version, tokStart);
603}
assert(baseType &&"element must be base type")
static std::unique_ptr< Context > context
static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
Definition FIRLexer.cpp:150
FIRToken lexFileInfo(const char *tokStart)
Lex a file info specifier.
Definition FIRLexer.cpp:319
FIRToken lexIdentifierOrKeyword(const char *tokStart)
Lex an identifier or keyword that starts with a letter.
Definition FIRLexer.cpp:391
const llvm::SourceMgr & sourceMgr
Definition FIRLexer.h:139
FIRToken formToken(FIRToken::Kind kind, const char *tokStart)
Definition FIRLexer.h:125
FIRToken lexNumber(const char *tokStart)
Lex a number literal.
Definition FIRLexer.cpp:525
FIRToken lexString(const char *tokStart, bool isVerbatim)
StringLit ::= '"' UnquotedString? '"' VerbatimStringLit ::= '\'' UnquotedString? '\'' UnquotedString ...
Definition FIRLexer.cpp:476
const char * curPtr
Definition FIRLexer.h:143
std::optional< unsigned > getIndentation(const FIRToken &tok) const
Return the indentation level of the specified token or None if this token is preceded by another toke...
Definition FIRLexer.cpp:185
void skipComment()
Skip a comment line, starting with a ';' and going to end of line.
Definition FIRLexer.cpp:451
FIRToken emitError(const char *loc, const Twine &message)
Emit an error message and return a FIRToken::error token.
Definition FIRLexer.cpp:179
FIRLexer(const llvm::SourceMgr &sourceMgr, mlir::MLIRContext *context)
mlir::Location translateLocation(llvm::SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition FIRLexer.cpp:170
FIRToken lexInlineAnnotation(const char *tokStart)
Lex a non-standard inline Annotation file.
Definition FIRLexer.cpp:350
const mlir::StringAttr bufferNameIdentifier
Definition FIRLexer.h:140
This represents a specific token for .fir files.
Definition FIRLexer.h:29
std::string getVerbatimStringValue() const
Given a token containing a verbatim string, return its value, including removing the quote characters...
Definition FIRLexer.cpp:117
StringRef getSpelling() const
Definition FIRLexer.h:45
StringRef spelling
A reference to the entire token contents; this is always a pointer into a memory buffer owned by the ...
Definition FIRLexer.h:95
llvm::SMRange getLocRange() const
Definition FIRLexer.cpp:41
std::string getStringValue() const
Given a token containing a string literal, return its value, including removing the quote characters ...
Definition FIRLexer.cpp:58
Kind kind
Discriminator that indicates the sort of token this is.
Definition FIRLexer.h:91
llvm::SMLoc getEndLoc() const
Definition FIRLexer.cpp:37
Kind getKind() const
Definition FIRLexer.h:48
llvm::SMLoc getLoc() const
Definition FIRLexer.cpp:33
bool isKeyword() const
Return true if this is one of the keyword token kinds (e.g. kw_wire).
Definition FIRLexer.cpp:44
The InstanceGraph op interface, see InstanceGraphInterface.td for more details.