CIRCT  18.0.0git
FIRLexer.cpp
Go to the documentation of this file.
1 //===- FIRLexer.cpp - .fir file lexer implementation ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This implements a .fir file lexer.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "FIRLexer.h"
14 #include "mlir/IR/Diagnostics.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/ADT/StringSwitch.h"
17 #include "llvm/Support/SourceMgr.h"
18 #include "llvm/Support/raw_ostream.h"
19 
20 using namespace circt;
21 using namespace firrtl;
22 using llvm::SMLoc;
23 using llvm::SMRange;
24 using llvm::SourceMgr;
25 
26 #define isdigit(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
27 #define isalpha(x) DO_NOT_USE_SLOW_CTYPE_FUNCTIONS
28 
29 //===----------------------------------------------------------------------===//
30 // FIRToken
31 //===----------------------------------------------------------------------===//
32 
33 SMLoc FIRToken::getLoc() const {
34  return SMLoc::getFromPointer(spelling.data());
35 }
36 
37 SMLoc FIRToken::getEndLoc() const {
38  return SMLoc::getFromPointer(spelling.data() + spelling.size());
39 }
40 
41 SMRange FIRToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); }
42 
43 /// Return true if this is one of the keyword token kinds (e.g. kw_wire).
44 bool FIRToken::isKeyword() const {
45  switch (kind) {
46  default:
47  return false;
48 #define TOK_KEYWORD(SPELLING) \
49  case kw_##SPELLING: \
50  return true;
51 #include "FIRTokenKinds.def"
52  }
53 }
54 
55 /// Given a token containing a string literal, return its value, including
56 /// removing the quote characters and unescaping the contents of the string. The
57 /// lexer has already verified that this token is valid.
58 std::string FIRToken::getStringValue() const {
59  assert(getKind() == string);
60  return getStringValue(getSpelling());
61 }
62 
63 std::string FIRToken::getStringValue(StringRef spelling) {
64  // Start by dropping the quotes.
65  StringRef bytes = spelling.drop_front().drop_back();
66 
67  std::string result;
68  result.reserve(bytes.size());
69  for (size_t i = 0, e = bytes.size(); i != e;) {
70  auto c = bytes[i++];
71  if (c != '\\') {
72  result.push_back(c);
73  continue;
74  }
75 
76  assert(i + 1 <= e && "invalid string should be caught by lexer");
77  auto c1 = bytes[i++];
78  switch (c1) {
79  case '\\':
80  case '"':
81  case '\'':
82  result.push_back(c1);
83  continue;
84  case 'b':
85  result.push_back('\b');
86  continue;
87  case 'n':
88  result.push_back('\n');
89  continue;
90  case 't':
91  result.push_back('\t');
92  continue;
93  case 'f':
94  result.push_back('\f');
95  continue;
96  case 'r':
97  result.push_back('\r');
98  continue;
99  // TODO: Handle the rest of the escapes (octal and unicode).
100  default:
101  break;
102  }
103 
104  assert(i + 1 <= e && "invalid string should be caught by lexer");
105  auto c2 = bytes[i++];
106 
107  assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");
108  result.push_back((llvm::hexDigitValue(c1) << 4) | llvm::hexDigitValue(c2));
109  }
110 
111  return result;
112 }
113 
114 /// Given a token containing a verbatim string, return its value, including
115 /// removing the quote characters and unescaping the quotes of the string. The
116 /// lexer has already verified that this token is valid.
117 std::string FIRToken::getVerbatimStringValue() const {
118  assert(getKind() == verbatim_string);
120 }
121 
122 std::string FIRToken::getVerbatimStringValue(StringRef spelling) {
123  // Start by dropping the quotes.
124  StringRef bytes = spelling.drop_front().drop_back();
125 
126  std::string result;
127  result.reserve(bytes.size());
128  for (size_t i = 0, e = bytes.size(); i != e;) {
129  auto c = bytes[i++];
130  if (c != '\\') {
131  result.push_back(c);
132  continue;
133  }
134 
135  assert(i + 1 <= e && "invalid string should be caught by lexer");
136  auto c1 = bytes[i++];
137  if (c1 != '\'') {
138  result.push_back(c);
139  }
140  result.push_back(c1);
141  }
142 
143  return result;
144 }
145 
146 //===----------------------------------------------------------------------===//
147 // FIRLexer
148 //===----------------------------------------------------------------------===//
149 
150 static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr,
151  MLIRContext *context) {
152  auto mainBuffer = sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID());
153  StringRef bufferName = mainBuffer->getBufferIdentifier();
154  if (bufferName.empty())
155  bufferName = "<unknown>";
156  return StringAttr::get(context, bufferName);
157 }
158 
159 FIRLexer::FIRLexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
160  : sourceMgr(sourceMgr),
161  bufferNameIdentifier(getMainBufferNameIdentifier(sourceMgr, context)),
162  curBuffer(
163  sourceMgr.getMemoryBuffer(sourceMgr.getMainFileID())->getBuffer()),
164  curPtr(curBuffer.begin()),
165  // Prime the first token.
166  curToken(lexTokenImpl()) {}
167 
168 /// Encode the specified source location information into a Location object
169 /// for attachment to the IR or error reporting.
170 Location FIRLexer::translateLocation(llvm::SMLoc loc) {
171  assert(loc.isValid());
172  unsigned mainFileID = sourceMgr.getMainFileID();
173  auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
174  return FileLineColLoc::get(bufferNameIdentifier, lineAndColumn.first,
175  lineAndColumn.second);
176 }
177 
178 /// Emit an error message and return a FIRToken::error token.
179 FIRToken FIRLexer::emitError(const char *loc, const Twine &message) {
180  mlir::emitError(translateLocation(SMLoc::getFromPointer(loc)), message);
181  return formToken(FIRToken::error, loc);
182 }
183 
184 /// Return the indentation level of the specified token.
185 std::optional<unsigned> FIRLexer::getIndentation(const FIRToken &tok) const {
186  // Count the number of horizontal whitespace characters before the token.
187  auto *bufStart = curBuffer.begin();
188 
189  auto isHorizontalWS = [](char c) -> bool {
190  return c == ' ' || c == '\t' || c == ',';
191  };
192  auto isVerticalWS = [](char c) -> bool {
193  return c == '\n' || c == '\r' || c == '\f' || c == '\v';
194  };
195 
196  unsigned indent = 0;
197  const auto *ptr = (const char *)tok.getSpelling().data();
198  while (ptr != bufStart && isHorizontalWS(ptr[-1]))
199  --ptr, ++indent;
200 
201  // If the character we stopped at isn't the start of line, then return none.
202  if (ptr != bufStart && !isVerticalWS(ptr[-1]))
203  return std::nullopt;
204 
205  return indent;
206 }
207 
208 //===----------------------------------------------------------------------===//
209 // Lexer Implementation Methods
210 //===----------------------------------------------------------------------===//
211 
213  while (true) {
214  const char *tokStart = curPtr;
215  switch (*curPtr++) {
216  default:
217  // Handle identifiers.
218  if (llvm::isAlpha(curPtr[-1]))
219  return lexIdentifierOrKeyword(tokStart);
220 
221  // Unknown character, emit an error.
222  return emitError(tokStart, "unexpected character");
223 
224  case 0:
225  // This may either be a nul character in the source file or may be the EOF
226  // marker that llvm::MemoryBuffer guarantees will be there.
227  if (curPtr - 1 == curBuffer.end())
228  return formToken(FIRToken::eof, tokStart);
229 
230  [[fallthrough]]; // Treat as whitespace.
231 
232  case ' ':
233  case '\t':
234  case '\n':
235  case '\r':
236  case ',':
237  // Handle whitespace.
238  continue;
239 
240  case '`':
241  case '_':
242  // Handle identifiers.
243  return lexIdentifierOrKeyword(tokStart);
244 
245  case '.':
246  return formToken(FIRToken::period, tokStart);
247  case ':':
248  return formToken(FIRToken::colon, tokStart);
249  case '(':
250  return formToken(FIRToken::l_paren, tokStart);
251  case ')':
252  return formToken(FIRToken::r_paren, tokStart);
253  case '{':
254  if (*curPtr == '|')
255  return ++curPtr, formToken(FIRToken::l_brace_bar, tokStart);
256  return formToken(FIRToken::l_brace, tokStart);
257  case '}':
258  return formToken(FIRToken::r_brace, tokStart);
259  case '[':
260  return formToken(FIRToken::l_square, tokStart);
261  case ']':
262  return formToken(FIRToken::r_square, tokStart);
263  case '<':
264  if (*curPtr == '-')
265  return ++curPtr, formToken(FIRToken::less_minus, tokStart);
266  if (*curPtr == '=')
267  return ++curPtr, formToken(FIRToken::less_equal, tokStart);
268  return formToken(FIRToken::less, tokStart);
269  case '>':
270  return formToken(FIRToken::greater, tokStart);
271  case '=':
272  if (*curPtr == '>')
273  return ++curPtr, formToken(FIRToken::equal_greater, tokStart);
274  return formToken(FIRToken::equal, tokStart);
275  case '?':
276  return formToken(FIRToken::question, tokStart);
277  case '@':
278  if (*curPtr == '[')
279  return lexFileInfo(tokStart);
280  // Unknown character, emit an error.
281  return emitError(tokStart, "unexpected character");
282  case '%':
283  if (*curPtr == '[')
284  return lexInlineAnnotation(tokStart);
285  return emitError(tokStart, "unexpected character following '%'");
286  case '|':
287  if (*curPtr == '}')
288  return ++curPtr, formToken(FIRToken::r_brace_bar, tokStart);
289  // Unknown character, emit an error.
290  return emitError(tokStart, "unexpected character");
291 
292  case ';':
293  skipComment();
294  continue;
295 
296  case '"':
297  return lexString(tokStart, /*isVerbatim=*/false);
298  case '\'':
299  return lexString(tokStart, /*isVerbatim=*/true);
300 
301  case '-':
302  if (*curPtr == '>')
303  return ++curPtr, formToken(FIRToken::minus_greater, tokStart);
304  return lexNumber(tokStart);
305  case '+':
306  case '0':
307  case '1':
308  case '2':
309  case '3':
310  case '4':
311  case '5':
312  case '6':
313  case '7':
314  case '8':
315  case '9':
316  return lexNumber(tokStart);
317  }
318  }
319 }
320 
321 /// Lex a file info specifier.
322 ///
323 /// FileInfo ::= '@[' ('\]'|.)* ']'
324 ///
325 FIRToken FIRLexer::lexFileInfo(const char *tokStart) {
326  while (1) {
327  switch (*curPtr++) {
328  case ']': // This is the end of the fileinfo literal.
329  return formToken(FIRToken::fileinfo, tokStart);
330  case '\\':
331  // Ignore escaped ']'
332  if (*curPtr == ']')
333  ++curPtr;
334  break;
335  case 0:
336  // This could be the end of file in the middle of the fileinfo. If so
337  // emit an error.
338  if (curPtr - 1 != curBuffer.end())
339  break;
340  [[fallthrough]];
341  case '\n': // Vertical whitespace isn't allowed in a fileinfo.
342  case '\v':
343  case '\f':
344  return emitError(tokStart, "unterminated file info specifier");
345  default:
346  // Skip over other characters.
347  break;
348  }
349  }
350 }
351 
352 /// Lex a non-standard inline Annotation file.
353 ///
354 /// InlineAnnotation ::= '%[' (.)* ']'
355 ///
357  size_t depth = 0;
358  bool stringMode = false;
359  while (1) {
360  switch (*curPtr++) {
361  case '\\':
362  ++curPtr;
363  break;
364  case '"':
365  stringMode = !stringMode;
366  break;
367  case ']':
368  if (stringMode)
369  break;
370  if (depth == 1)
371  return formToken(FIRToken::inlineannotation, tokStart);
372  --depth;
373  break;
374  case '[':
375  if (stringMode)
376  break;
377  ++depth;
378  break;
379  case 0:
380  if (curPtr - 1 != curBuffer.end())
381  break;
382  return emitError(tokStart, "unterminated inline annotation");
383  default:
384  break;
385  }
386  }
387 }
388 
389 /// Lex an identifier or keyword that starts with a letter.
390 ///
391 /// LegalStartChar ::= [a-zA-Z_]
392 /// LegalIdChar ::= LegalStartChar | [0-9] | '$'
393 ///
394 /// Id ::= LegalStartChar (LegalIdChar)*
395 /// LiteralId ::= [a-zA-Z0-9$_]+
396 ///
398  // Remember that this is a literalID
399  bool isLiteralId = *tokStart == '`';
400 
401  // Match the rest of the identifier regex: [0-9a-zA-Z_$-]*
402  while (llvm::isAlpha(*curPtr) || llvm::isDigit(*curPtr) || *curPtr == '_' ||
403  *curPtr == '$' || *curPtr == '-')
404  ++curPtr;
405 
406  // Consume the trailing '`' in a literal identifier.
407  if (isLiteralId) {
408  if (*curPtr != '`')
409  return emitError(tokStart, "unterminated literal identifier");
410  ++curPtr;
411  }
412 
413  StringRef spelling(tokStart, curPtr - tokStart);
414 
415  // Check to see if this is a 'primop', which is an identifier juxtaposed with
416  // a '(' character.
417  if (*curPtr == '(') {
418  FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
419 #define TOK_LPKEYWORD(SPELLING) .Case(#SPELLING, FIRToken::lp_##SPELLING)
420 #include "FIRTokenKinds.def"
421  .Default(FIRToken::identifier);
422  if (kind != FIRToken::identifier) {
423  ++curPtr;
424  return formToken(kind, tokStart);
425  }
426  }
427 
428  // See if the identifier is a keyword. By default, it is an identifier.
429  FIRToken::Kind kind = llvm::StringSwitch<FIRToken::Kind>(spelling)
430 #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, FIRToken::kw_##SPELLING)
431 #include "FIRTokenKinds.def"
432  .Default(FIRToken::identifier);
433 
434  // If this has the backticks of a literal identifier and it fell through the
435  // above switch, indicating that it was not found to e a keyword, then change
436  // its kind from identifier to literal identifier.
437  if (isLiteralId && kind == FIRToken::identifier)
438  kind = FIRToken::literal_identifier;
439 
440  return FIRToken(kind, spelling);
441 }
442 
443 /// Skip a comment line, starting with a ';' and going to end of line.
445  while (true) {
446  switch (*curPtr++) {
447  case '\n':
448  case '\r':
449  // Newline is end of comment.
450  return;
451  case 0:
452  // If this is the end of the buffer, end the comment.
453  if (curPtr - 1 == curBuffer.end()) {
454  --curPtr;
455  return;
456  }
457  [[fallthrough]];
458  default:
459  // Skip over other characters.
460  break;
461  }
462  }
463 }
464 
465 /// StringLit ::= '"' UnquotedString? '"'
466 /// VerbatimStringLit ::= '\'' UnquotedString? '\''
467 /// UnquotedString ::= ( '\\\'' | '\\"' | ~[\r\n] )+?
468 ///
469 FIRToken FIRLexer::lexString(const char *tokStart, bool isVerbatim) {
470  while (1) {
471  switch (*curPtr++) {
472  case '"': // This is the end of the string literal.
473  if (isVerbatim)
474  break;
475  return formToken(FIRToken::string, tokStart);
476  case '\'': // This is the end of the raw string.
477  if (!isVerbatim)
478  break;
479  return formToken(FIRToken::verbatim_string, tokStart);
480  case '\\':
481  // Ignore escaped '\'' or '"'
482  if (*curPtr == '\'' || *curPtr == '"')
483  ++curPtr;
484  else if (*curPtr == 'u' || *curPtr == 'U')
485  return emitError(tokStart, "unicode escape not supported in string");
486  break;
487  case 0:
488  // This could be the end of file in the middle of the string. If so
489  // emit an error.
490  if (curPtr - 1 != curBuffer.end())
491  break;
492  [[fallthrough]];
493  case '\n': // Vertical whitespace isn't allowed in a string.
494  case '\r':
495  case '\v':
496  case '\f':
497  return emitError(tokStart, "unterminated string");
498  default:
499  if (curPtr[-1] & ~0x7F)
500  return emitError(tokStart, "string characters must be 7-bit ASCII");
501  // Skip over other characters.
502  break;
503  }
504  }
505 }
506 
507 /// Lex a number literal.
508 ///
509 /// UnsignedInt ::= '0' | PosInt
510 /// PosInt ::= [1-9] ([0-9])*
511 /// DoubleLit ::=
512 /// ( '+' | '-' )? Digit+ '.' Digit+ ( 'E' ( '+' | '-' )? Digit+ )?
513 /// TripleLit ::=
514 /// Digit+ '.' Digit+ '.' Digit+
515 /// Radix-specified Integer ::=
516 /// ( '-' )? '0' ( 'b' | 'o' | 'd' | 'h' ) LegalDigit*
517 ///
518 FIRToken FIRLexer::lexNumber(const char *tokStart) {
519  assert(llvm::isDigit(curPtr[-1]) || curPtr[-1] == '+' || curPtr[-1] == '-');
520 
521  // There needs to be at least one digit.
522  if (!llvm::isDigit(*curPtr) && !llvm::isDigit(curPtr[-1]))
523  return emitError(tokStart, "unexpected character after sign");
524 
525  // If we encounter a "b", "o", "d", or "h", this is a radix-specified integer
526  // literal. This is only supported for FIRRTL 2.4.0 or later. This is always
527  // lexed, but rejected during parsing if the version is too old.
528  const char *oldPtr = curPtr;
529  if (curPtr[-1] == '-' && *curPtr == '0')
530  ++curPtr;
531  if (curPtr[-1] == '0') {
532  switch (*curPtr) {
533  case 'b':
534  ++curPtr;
535  while (*curPtr >= '0' && *curPtr <= '1')
536  ++curPtr;
537  return formToken(FIRToken::radix_specified_integer, tokStart);
538  case 'o':
539  ++curPtr;
540  while (*curPtr >= '0' && *curPtr <= '7')
541  ++curPtr;
542  return formToken(FIRToken::radix_specified_integer, tokStart);
543  case 'd':
544  ++curPtr;
545  while (llvm::isDigit(*curPtr))
546  ++curPtr;
547  return formToken(FIRToken::radix_specified_integer, tokStart);
548  case 'h':
549  ++curPtr;
550  while (llvm::isHexDigit(*curPtr))
551  ++curPtr;
552  return formToken(FIRToken::radix_specified_integer, tokStart);
553  default:
554  curPtr = oldPtr;
555  break;
556  }
557  }
558 
559  while (llvm::isDigit(*curPtr))
560  ++curPtr;
561 
562  // If we encounter a '.' followed by a digit, then this is a floating point
563  // literal, otherwise this is an integer or negative integer.
564  if (*curPtr != '.' || !llvm::isDigit(curPtr[1])) {
565  if (*tokStart == '-' || *tokStart == '+')
566  return formToken(FIRToken::signed_integer, tokStart);
567  return formToken(FIRToken::integer, tokStart);
568  }
569 
570  // Lex a floating point literal.
571  curPtr += 2;
572  while (llvm::isDigit(*curPtr))
573  ++curPtr;
574 
575  bool hasE = false;
576  if (*curPtr == 'E') {
577  hasE = true;
578  ++curPtr;
579  if (*curPtr == '+' || *curPtr == '-')
580  ++curPtr;
581  while (llvm::isDigit(*curPtr))
582  ++curPtr;
583  }
584 
585  // If we encounter a '.' followed by a digit, again, and there was no
586  // exponent, then this is a version literal. Otherwise it is a floating point
587  // literal.
588  if (*curPtr != '.' || !llvm::isDigit(curPtr[1]) || hasE)
589  return formToken(FIRToken::floatingpoint, tokStart);
590 
591  // Lex a version literal.
592  curPtr += 2;
593  while (llvm::isDigit(*curPtr))
594  ++curPtr;
595  return formToken(FIRToken::version, tokStart);
596 }
assert(baseType &&"element must be base type")
static StringAttr getMainBufferNameIdentifier(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
Definition: FIRLexer.cpp:150
FIRToken lexFileInfo(const char *tokStart)
Lex a file info specifier.
Definition: FIRLexer.cpp:325
FIRToken lexIdentifierOrKeyword(const char *tokStart)
Lex an identifier or keyword that starts with a letter.
Definition: FIRLexer.cpp:397
const llvm::SourceMgr & sourceMgr
Definition: FIRLexer.h:138
FIRToken formToken(FIRToken::Kind kind, const char *tokStart)
Definition: FIRLexer.h:124
FIRToken lexNumber(const char *tokStart)
Lex a number literal.
Definition: FIRLexer.cpp:518
FIRToken lexString(const char *tokStart, bool isVerbatim)
StringLit ::= '"' UnquotedString? '"' VerbatimStringLit ::= '\'' UnquotedString? '\'' UnquotedString ...
Definition: FIRLexer.cpp:469
const char * curPtr
Definition: FIRLexer.h:142
std::optional< unsigned > getIndentation(const FIRToken &tok) const
Return the indentation level of the specified token or None if this token is preceded by another toke...
Definition: FIRLexer.cpp:185
void skipComment()
Skip a comment line, starting with a ';' and going to end of line.
Definition: FIRLexer.cpp:444
FIRToken emitError(const char *loc, const Twine &message)
Emit an error message and return a FIRToken::error token.
Definition: FIRLexer.cpp:179
FIRLexer(const llvm::SourceMgr &sourceMgr, mlir::MLIRContext *context)
mlir::Location translateLocation(llvm::SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition: FIRLexer.cpp:170
FIRToken lexInlineAnnotation(const char *tokStart)
Lex a non-standard inline Annotation file.
Definition: FIRLexer.cpp:356
const mlir::StringAttr bufferNameIdentifier
Definition: FIRLexer.h:139
This represents a specific token for .fir files.
Definition: FIRLexer.h:29
std::string getVerbatimStringValue() const
Given a token containing a verbatim string, return its value, including removing the quote characters...
Definition: FIRLexer.cpp:117
StringRef getSpelling() const
Definition: FIRLexer.h:44
StringRef spelling
A reference to the entire token contents; this is always a pointer into a memory buffer owned by the ...
Definition: FIRLexer.h:94
llvm::SMRange getLocRange() const
Definition: FIRLexer.cpp:41
std::string getStringValue() const
Given a token containing a string literal, return its value, including removing the quote characters ...
Definition: FIRLexer.cpp:58
Kind kind
Discriminator that indicates the sort of token this is.
Definition: FIRLexer.h:90
llvm::SMLoc getEndLoc() const
Definition: FIRLexer.cpp:37
Kind getKind() const
Definition: FIRLexer.h:47
llvm::SMLoc getLoc() const
Definition: FIRLexer.cpp:33
bool isKeyword() const
Return true if this is one of the keyword token kinds (e.g. kw_wire).
Definition: FIRLexer.cpp:44
Direction get(bool isOutput)
Returns an output direction if isOutput is true, otherwise returns an input direction.
Definition: CalyxOps.cpp:53
This file defines an intermediate representation for circuits acting as an abstraction for constraint...