1 //===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the lexer for the MLIR textual form.
11 //===----------------------------------------------------------------------===//
15 #include "mlir/AsmParser/CodeComplete.h"
16 #include "mlir/IR/Diagnostics.h"
17 #include "mlir/IR/Location.h"
18 #include "mlir/IR/MLIRContext.h"
19 #include "mlir/Support/LLVM.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/ADT/StringSwitch.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/SourceMgr.h"
30 // Returns true if 'c' is an allowable punctuation character: [$._-]
31 // Returns false otherwise.
32 static bool isPunct(char c
) {
33 return c
== '$' || c
== '.' || c
== '_' || c
== '-';
36 Lexer::Lexer(const llvm::SourceMgr
&sourceMgr
, MLIRContext
*context
,
37 AsmParserCodeCompleteContext
*codeCompleteContext
)
38 : sourceMgr(sourceMgr
), context(context
), codeCompleteLoc(nullptr) {
39 auto bufferID
= sourceMgr
.getMainFileID();
40 curBuffer
= sourceMgr
.getMemoryBuffer(bufferID
)->getBuffer();
41 curPtr
= curBuffer
.begin();
43 // Set the code completion location if it was provided.
44 if (codeCompleteContext
)
45 codeCompleteLoc
= codeCompleteContext
->getCodeCompleteLoc().getPointer();
48 /// Encode the specified source location information into an attribute for
49 /// attachment to the IR.
50 Location
Lexer::getEncodedSourceLocation(SMLoc loc
) {
51 auto &sourceMgr
= getSourceMgr();
52 unsigned mainFileID
= sourceMgr
.getMainFileID();
54 // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
56 auto &bufferInfo
= sourceMgr
.getBufferInfo(mainFileID
);
57 unsigned lineNo
= bufferInfo
.getLineNumber(loc
.getPointer());
59 (loc
.getPointer() - bufferInfo
.getPointerForLineNumber(lineNo
)) + 1;
60 auto *buffer
= sourceMgr
.getMemoryBuffer(mainFileID
);
62 return FileLineColLoc::get(context
, buffer
->getBufferIdentifier(), lineNo
,
66 /// emitError - Emit an error message and return an Token::error token.
67 Token
Lexer::emitError(const char *loc
, const Twine
&message
) {
68 mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc
)),
70 return formToken(Token::error
, loc
);
73 Token
Lexer::lexToken() {
75 const char *tokStart
= curPtr
;
77 // Check to see if the current token is at the code completion location.
78 if (tokStart
== codeCompleteLoc
)
79 return formToken(Token::code_complete
, tokStart
);
81 // Lex the next token.
84 // Handle bare identifiers.
85 if (isalpha(curPtr
[-1]))
86 return lexBareIdentifierOrKeyword(tokStart
);
88 // Unknown character, emit an error.
89 return emitError(tokStart
, "unexpected character");
99 // Handle bare identifiers.
100 return lexBareIdentifierOrKeyword(tokStart
);
103 // This may either be a nul character in the source file or may be the EOF
104 // marker that llvm::MemoryBuffer guarantees will be there.
105 if (curPtr
- 1 == curBuffer
.end())
106 return formToken(Token::eof
, tokStart
);
110 return formToken(Token::colon
, tokStart
);
112 return formToken(Token::comma
, tokStart
);
114 return lexEllipsis(tokStart
);
116 return formToken(Token::l_paren
, tokStart
);
118 return formToken(Token::r_paren
, tokStart
);
120 if (*curPtr
== '-' && *(curPtr
+ 1) == '#') {
122 return formToken(Token::file_metadata_begin
, tokStart
);
124 return formToken(Token::l_brace
, tokStart
);
126 return formToken(Token::r_brace
, tokStart
);
128 return formToken(Token::l_square
, tokStart
);
130 return formToken(Token::r_square
, tokStart
);
132 return formToken(Token::less
, tokStart
);
134 return formToken(Token::greater
, tokStart
);
136 return formToken(Token::equal
, tokStart
);
139 return formToken(Token::plus
, tokStart
);
141 return formToken(Token::star
, tokStart
);
143 if (*curPtr
== '>') {
145 return formToken(Token::arrow
, tokStart
);
147 return formToken(Token::minus
, tokStart
);
150 return formToken(Token::question
, tokStart
);
153 return formToken(Token::vertical_bar
, tokStart
);
156 if (*curPtr
== '/') {
160 return emitError(tokStart
, "unexpected character");
163 return lexAtIdentifier(tokStart
);
166 if (*curPtr
== '-' && *(curPtr
+ 1) == '}') {
168 return formToken(Token::file_metadata_end
, tokStart
);
174 return lexPrefixedIdentifier(tokStart
);
176 return lexString(tokStart
);
188 return lexNumber(tokStart
);
193 /// Lex an '@foo' identifier.
195 /// symbol-ref-id ::= `@` (bare-id | string-literal)
197 Token
Lexer::lexAtIdentifier(const char *tokStart
) {
198 char cur
= *curPtr
++;
200 // Try to parse a string literal, if present.
202 Token stringIdentifier
= lexString(curPtr
);
203 if (stringIdentifier
.is(Token::error
))
204 return stringIdentifier
;
205 return formToken(Token::at_identifier
, tokStart
);
208 // Otherwise, these always start with a letter or underscore.
209 if (!isalpha(cur
) && cur
!= '_')
210 return emitError(curPtr
- 1,
211 "@ identifier expected to start with letter or '_'");
213 while (isalpha(*curPtr
) || isdigit(*curPtr
) || *curPtr
== '_' ||
214 *curPtr
== '$' || *curPtr
== '.')
216 return formToken(Token::at_identifier
, tokStart
);
219 /// Lex a bare identifier or keyword that starts with a letter.
221 /// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
222 /// integer-type ::= `[su]?i[1-9][0-9]*`
224 Token
Lexer::lexBareIdentifierOrKeyword(const char *tokStart
) {
225 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
226 while (isalpha(*curPtr
) || isdigit(*curPtr
) || *curPtr
== '_' ||
227 *curPtr
== '$' || *curPtr
== '.')
230 // Check to see if this identifier is a keyword.
231 StringRef
spelling(tokStart
, curPtr
- tokStart
);
233 auto isAllDigit
= [](StringRef str
) {
234 return llvm::all_of(str
, llvm::isDigit
);
237 // Check for i123, si456, ui789.
238 if ((spelling
.size() > 1 && tokStart
[0] == 'i' &&
239 isAllDigit(spelling
.drop_front())) ||
240 ((spelling
.size() > 2 && tokStart
[1] == 'i' &&
241 (tokStart
[0] == 's' || tokStart
[0] == 'u')) &&
242 isAllDigit(spelling
.drop_front(2))))
243 return Token(Token::inttype
, spelling
);
245 Token::Kind kind
= StringSwitch
<Token::Kind
>(spelling
)
246 #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
247 #include "TokenKinds.def"
248 .Default(Token::bare_identifier
);
250 return Token(kind
, spelling
);
253 /// Skip a comment line, starting with a '//'.
255 /// TODO: add a regex for comments here and to the spec.
257 void Lexer::skipComment() {
258 // Advance over the second '/' in a '//' comment.
259 assert(*curPtr
== '/');
266 // Newline is end of comment.
269 // If this is the end of the buffer, end the comment.
270 if (curPtr
- 1 == curBuffer
.end()) {
276 // Skip over other characters.
284 /// ellipsis ::= '...'
286 Token
Lexer::lexEllipsis(const char *tokStart
) {
287 assert(curPtr
[-1] == '.');
289 if (curPtr
== curBuffer
.end() || *curPtr
!= '.' || *(curPtr
+ 1) != '.')
290 return emitError(curPtr
, "expected three consecutive dots for an ellipsis");
293 return formToken(Token::ellipsis
, tokStart
);
296 /// Lex a number literal.
298 /// integer-literal ::= digit+ | `0x` hex_digit+
299 /// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
301 Token
Lexer::lexNumber(const char *tokStart
) {
302 assert(isdigit(curPtr
[-1]));
304 // Handle the hexadecimal case.
305 if (curPtr
[-1] == '0' && *curPtr
== 'x') {
306 // If we see stuff like 0xi32, this is a literal `0` followed by an
307 // identifier `xi32`, stop after `0`.
308 if (!isxdigit(curPtr
[1]))
309 return formToken(Token::integer
, tokStart
);
312 while (isxdigit(*curPtr
))
315 return formToken(Token::integer
, tokStart
);
318 // Handle the normal decimal case.
319 while (isdigit(*curPtr
))
323 return formToken(Token::integer
, tokStart
);
326 // Skip over [0-9]*([eE][-+]?[0-9]+)?
327 while (isdigit(*curPtr
))
330 if (*curPtr
== 'e' || *curPtr
== 'E') {
331 if (isdigit(static_cast<unsigned char>(curPtr
[1])) ||
332 ((curPtr
[1] == '-' || curPtr
[1] == '+') &&
333 isdigit(static_cast<unsigned char>(curPtr
[2])))) {
335 while (isdigit(*curPtr
))
339 return formToken(Token::floatliteral
, tokStart
);
342 /// Lex an identifier that starts with a prefix followed by suffix-id.
344 /// attribute-id ::= `#` suffix-id
345 /// ssa-id ::= '%' suffix-id
346 /// block-id ::= '^' suffix-id
347 /// type-id ::= '!' suffix-id
348 /// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
349 /// id-punct ::= `$` | `.` | `_` | `-`
351 Token
Lexer::lexPrefixedIdentifier(const char *tokStart
) {
356 kind
= Token::hash_identifier
;
357 errorKind
= "invalid attribute name";
360 kind
= Token::percent_identifier
;
361 errorKind
= "invalid SSA name";
364 kind
= Token::caret_identifier
;
365 errorKind
= "invalid block name";
368 kind
= Token::exclamation_identifier
;
369 errorKind
= "invalid type identifier";
372 llvm_unreachable("invalid caller");
376 if (isdigit(*curPtr
)) {
377 // If suffix-id starts with a digit, the rest must be digits.
378 while (isdigit(*curPtr
))
380 } else if (isalpha(*curPtr
) || isPunct(*curPtr
)) {
383 } while (isalpha(*curPtr
) || isdigit(*curPtr
) || isPunct(*curPtr
));
384 } else if (curPtr
== codeCompleteLoc
) {
385 return formToken(Token::code_complete
, tokStart
);
387 return emitError(curPtr
- 1, errorKind
);
390 // Check for a code completion within the identifier.
391 if (codeCompleteLoc
&& codeCompleteLoc
>= tokStart
&&
392 codeCompleteLoc
<= curPtr
) {
393 return Token(Token::code_complete
,
394 StringRef(tokStart
, codeCompleteLoc
- tokStart
));
397 return formToken(kind
, tokStart
);
400 /// Lex a string literal.
402 /// string-literal ::= '"' [^"\n\f\v\r]* '"'
404 /// TODO: define escaping rules.
405 Token
Lexer::lexString(const char *tokStart
) {
406 assert(curPtr
[-1] == '"');
409 // Check to see if there is a code completion location within the string. In
410 // these cases we generate a completion location and place the currently
411 // lexed string within the token. This allows for the parser to use the
412 // partially lexed string when computing the completion results.
413 if (curPtr
== codeCompleteLoc
)
414 return formToken(Token::code_complete
, tokStart
);
418 return formToken(Token::string
, tokStart
);
420 // If this is a random nul character in the middle of a string, just
421 // include it. If it is the end of file, then it is an error.
422 if (curPtr
- 1 != curBuffer
.end())
428 return emitError(curPtr
- 1, "expected '\"' in string literal");
430 // Handle explicitly a few escapes.
431 if (*curPtr
== '"' || *curPtr
== '\\' || *curPtr
== 'n' || *curPtr
== 't')
433 else if (llvm::isHexDigit(*curPtr
) && llvm::isHexDigit(curPtr
[1]))
434 // Support \xx for two hex digits.
437 return emitError(curPtr
- 1, "unknown escape in string literal");