//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implement the Lexer for TableGen.
//
//===----------------------------------------------------------------------===//
// NOTE(review): the header declaring TGLexer (presumably "TGLexer.h") and the
// `using namespace llvm;` implied by the unqualified MemoryBuffer/cerr uses
// are not visible in this listing -- confirm both are present.
#include "llvm/Config/config.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Streams.h"
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ostream>
#include <string>
22 TGLexer::TGLexer(MemoryBuffer
*StartBuf
) : CurLineNo(1), CurBuf(StartBuf
) {
23 CurPtr
= CurBuf
->getBufferStart();
28 while (!IncludeStack
.empty()) {
29 delete IncludeStack
.back().Buffer
;
30 IncludeStack
.pop_back();
35 /// ReturnError - Set the error to the specified string at the specified
36 /// location. This is defined to always return tgtok::Error.
37 tgtok::TokKind
TGLexer::ReturnError(const char *Loc
, const std::string
&Msg
) {
42 void TGLexer::PrintIncludeStack(std::ostream
&OS
) const {
43 for (unsigned i
= 0, e
= IncludeStack
.size(); i
!= e
; ++i
)
44 OS
<< "Included from " << IncludeStack
[i
].Buffer
->getBufferIdentifier()
45 << ":" << IncludeStack
[i
].LineNo
<< ":\n";
46 OS
<< "Parsing " << CurBuf
->getBufferIdentifier() << ":"
50 /// PrintError - Print the error at the specified location.
51 void TGLexer::PrintError(const char *ErrorLoc
, const std::string
&Msg
) const {
52 PrintIncludeStack(*cerr
.stream());
54 assert(ErrorLoc
&& "Location not specified!");
56 // Scan backward to find the start of the line.
57 const char *LineStart
= ErrorLoc
;
58 while (LineStart
!= CurBuf
->getBufferStart() &&
59 LineStart
[-1] != '\n' && LineStart
[-1] != '\r')
61 // Get the end of the line.
62 const char *LineEnd
= ErrorLoc
;
63 while (LineEnd
!= CurBuf
->getBufferEnd() &&
64 LineEnd
[0] != '\n' && LineEnd
[0] != '\r')
66 // Print out the line.
67 cerr
<< std::string(LineStart
, LineEnd
) << "\n";
68 // Print out spaces before the carat.
69 for (const char *Pos
= LineStart
; Pos
!= ErrorLoc
; ++Pos
)
70 cerr
<< (*Pos
== '\t' ? '\t' : ' ');
74 int TGLexer::getNextChar() {
75 char CurChar
= *CurPtr
++;
78 return (unsigned char)CurChar
;
80 // A nul character in the stream is either the end of the current buffer or
81 // a random nul in the file. Disambiguate that here.
82 if (CurPtr
-1 != CurBuf
->getBufferEnd())
83 return 0; // Just whitespace.
85 // If this is the end of an included file, pop the parent file off the
87 if (!IncludeStack
.empty()) {
89 CurBuf
= IncludeStack
.back().Buffer
;
90 CurLineNo
= IncludeStack
.back().LineNo
;
91 CurPtr
= IncludeStack
.back().CurPtr
;
92 IncludeStack
.pop_back();
96 // Otherwise, return end of file.
97 --CurPtr
; // Another call to lex will return EOF again.
101 // Handle the newline character by ignoring it and incrementing the line
102 // count. However, be careful about 'dos style' files with \n\r in them.
103 // Only treat a \n\r or \r\n as a single line.
104 if ((*CurPtr
== '\n' || (*CurPtr
== '\r')) &&
106 ++CurPtr
; // Eat the two char newline sequence.
113 tgtok::TokKind
TGLexer::LexToken() {
115 // This always consumes at least one character.
116 int CurChar
= getNextChar();
120 // Handle letters: [a-zA-Z_]
121 if (isalpha(CurChar
) || CurChar
== '_')
122 return LexIdentifier();
124 // Unknown character, emit an error.
125 return ReturnError(TokStart
, "Unexpected character");
126 case EOF
: return tgtok::Eof
;
127 case ':': return tgtok::colon
;
128 case ';': return tgtok::semi
;
129 case '.': return tgtok::period
;
130 case ',': return tgtok::comma
;
131 case '<': return tgtok::less
;
132 case '>': return tgtok::greater
;
133 case ']': return tgtok::r_square
;
134 case '{': return tgtok::l_brace
;
135 case '}': return tgtok::r_brace
;
136 case '(': return tgtok::l_paren
;
137 case ')': return tgtok::r_paren
;
138 case '=': return tgtok::equal
;
139 case '?': return tgtok::question
;
146 // Ignore whitespace.
149 // If this is the start of a // comment, skip until the end of the line or
150 // the end of the buffer.
153 else if (*CurPtr
== '*') {
156 } else // Otherwise, this is an error.
157 return ReturnError(TokStart
, "Unexpected character");
160 case '0': case '1': case '2': case '3': case '4': case '5': case '6':
161 case '7': case '8': case '9':
163 case '"': return LexString();
164 case '$': return LexVarName();
165 case '[': return LexBracket();
166 case '!': return LexExclaim();
170 /// LexString - Lex "[^"]*"
171 tgtok::TokKind
TGLexer::LexString() {
172 const char *StrStart
= CurPtr
;
174 while (*CurPtr
!= '"') {
175 // If we hit the end of the buffer, report an error.
176 if (*CurPtr
== 0 && CurPtr
== CurBuf
->getBufferEnd())
177 return ReturnError(StrStart
, "End of file in string literal");
179 if (*CurPtr
== '\n' || *CurPtr
== '\r')
180 return ReturnError(StrStart
, "End of line in string literal");
185 CurStrVal
.assign(StrStart
, CurPtr
);
187 return tgtok::StrVal
;
190 tgtok::TokKind
TGLexer::LexVarName() {
191 if (!isalpha(CurPtr
[0]) && CurPtr
[0] != '_')
192 return ReturnError(TokStart
, "Invalid variable name");
194 // Otherwise, we're ok, consume the rest of the characters.
195 const char *VarNameStart
= CurPtr
++;
197 while (isalpha(*CurPtr
) || isdigit(*CurPtr
) || *CurPtr
== '_')
200 CurStrVal
.assign(VarNameStart
, CurPtr
);
201 return tgtok::VarName
;
205 tgtok::TokKind
TGLexer::LexIdentifier() {
206 // The first letter is [a-zA-Z_].
207 const char *IdentStart
= TokStart
;
209 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
210 while (isalpha(*CurPtr
) || isdigit(*CurPtr
) || *CurPtr
== '_')
213 // Check to see if this identifier is a keyword.
214 unsigned Len
= CurPtr
-IdentStart
;
216 if (Len
== 3 && !memcmp(IdentStart
, "int", 3)) return tgtok::Int
;
217 if (Len
== 3 && !memcmp(IdentStart
, "bit", 3)) return tgtok::Bit
;
218 if (Len
== 4 && !memcmp(IdentStart
, "bits", 4)) return tgtok::Bits
;
219 if (Len
== 6 && !memcmp(IdentStart
, "string", 6)) return tgtok::String
;
220 if (Len
== 4 && !memcmp(IdentStart
, "list", 4)) return tgtok::List
;
221 if (Len
== 4 && !memcmp(IdentStart
, "code", 4)) return tgtok::Code
;
222 if (Len
== 3 && !memcmp(IdentStart
, "dag", 3)) return tgtok::Dag
;
224 if (Len
== 5 && !memcmp(IdentStart
, "class", 5)) return tgtok::Class
;
225 if (Len
== 3 && !memcmp(IdentStart
, "def", 3)) return tgtok::Def
;
226 if (Len
== 4 && !memcmp(IdentStart
, "defm", 4)) return tgtok::Defm
;
227 if (Len
== 10 && !memcmp(IdentStart
, "multiclass", 10))
228 return tgtok::MultiClass
;
229 if (Len
== 5 && !memcmp(IdentStart
, "field", 5)) return tgtok::Field
;
230 if (Len
== 3 && !memcmp(IdentStart
, "let", 3)) return tgtok::Let
;
231 if (Len
== 2 && !memcmp(IdentStart
, "in", 2)) return tgtok::In
;
233 if (Len
== 7 && !memcmp(IdentStart
, "include", 7)) {
234 if (LexInclude()) return tgtok::Error
;
238 CurStrVal
.assign(IdentStart
, CurPtr
);
242 /// LexInclude - We just read the "include" token. Get the string token that
243 /// comes next and enter the include.
244 bool TGLexer::LexInclude() {
245 // The token after the include must be a string.
246 tgtok::TokKind Tok
= LexToken();
247 if (Tok
== tgtok::Error
) return true;
248 if (Tok
!= tgtok::StrVal
) {
249 PrintError(getLoc(), "Expected filename after include");
254 std::string Filename
= CurStrVal
;
256 // Try to find the file.
257 MemoryBuffer
*NewBuf
= MemoryBuffer::getFile(&Filename
[0], Filename
.size());
259 // If the file didn't exist directly, see if it's in an include path.
260 for (unsigned i
= 0, e
= IncludeDirectories
.size(); i
!= e
&& !NewBuf
; ++i
) {
261 std::string IncFile
= IncludeDirectories
[i
] + "/" + Filename
;
262 NewBuf
= MemoryBuffer::getFile(&IncFile
[0], IncFile
.size());
266 PrintError(getLoc(), "Could not find include file '" + Filename
+ "'");
270 // Save the line number and lex buffer of the includer.
271 IncludeStack
.push_back(IncludeRec(CurBuf
, CurPtr
, CurLineNo
));
273 CurLineNo
= 1; // Reset line numbering.
275 CurPtr
= CurBuf
->getBufferStart();
279 void TGLexer::SkipBCPLComment() {
280 ++CurPtr
; // skip the second slash.
285 return; // Newline is end of comment.
287 // If this is the end of the buffer, end the comment.
288 if (CurPtr
== CurBuf
->getBufferEnd())
292 // Otherwise, skip the character.
297 /// SkipCComment - This skips C-style /**/ comments. The only difference from C
298 /// is that we allow nesting.
299 bool TGLexer::SkipCComment() {
300 ++CurPtr
; // skip the star.
301 unsigned CommentDepth
= 1;
304 int CurChar
= getNextChar();
307 PrintError(TokStart
, "Unterminated comment!");
310 // End of the comment?
311 if (CurPtr
[0] != '/') break;
313 ++CurPtr
; // End the */.
314 if (--CommentDepth
== 0)
318 // Start of a nested comment?
319 if (CurPtr
[0] != '*') break;
331 tgtok::TokKind
TGLexer::LexNumber() {
332 if (CurPtr
[-1] == '0') {
333 if (CurPtr
[0] == 'x') {
335 const char *NumStart
= CurPtr
;
336 while (isxdigit(CurPtr
[0]))
339 // Requires at least one hex digit.
340 if (CurPtr
== NumStart
)
341 return ReturnError(CurPtr
-2, "Invalid hexadecimal number");
343 CurIntVal
= strtoll(NumStart
, 0, 16);
344 return tgtok::IntVal
;
345 } else if (CurPtr
[0] == 'b') {
347 const char *NumStart
= CurPtr
;
348 while (CurPtr
[0] == '0' || CurPtr
[0] == '1')
351 // Requires at least one binary digit.
352 if (CurPtr
== NumStart
)
353 return ReturnError(CurPtr
-2, "Invalid binary number");
354 CurIntVal
= strtoll(NumStart
, 0, 2);
355 return tgtok::IntVal
;
359 // Check for a sign without a digit.
360 if (!isdigit(CurPtr
[0])) {
361 if (CurPtr
[-1] == '-')
363 else if (CurPtr
[-1] == '+')
367 while (isdigit(CurPtr
[0]))
369 CurIntVal
= strtoll(TokStart
, 0, 10);
370 return tgtok::IntVal
;
373 /// LexBracket - We just read '['. If this is a code block, return it,
374 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
375 tgtok::TokKind
TGLexer::LexBracket() {
376 if (CurPtr
[0] != '{')
377 return tgtok::l_square
;
379 const char *CodeStart
= CurPtr
;
381 int Char
= getNextChar();
382 if (Char
== EOF
) break;
384 if (Char
!= '}') continue;
386 Char
= getNextChar();
387 if (Char
== EOF
) break;
389 CurStrVal
.assign(CodeStart
, CurPtr
-2);
390 return tgtok::CodeFragment
;
394 return ReturnError(CodeStart
-2, "Unterminated Code Block");
397 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
398 tgtok::TokKind
TGLexer::LexExclaim() {
399 if (!isalpha(*CurPtr
))
400 return ReturnError(CurPtr
-1, "Invalid \"!operator\"");
402 const char *Start
= CurPtr
++;
403 while (isalpha(*CurPtr
))
406 // Check to see which operator this is.
407 unsigned Len
= CurPtr
-Start
;
409 if (Len
== 3 && !memcmp(Start
, "con", 3)) return tgtok::XConcat
;
410 if (Len
== 3 && !memcmp(Start
, "sra", 3)) return tgtok::XSRA
;
411 if (Len
== 3 && !memcmp(Start
, "srl", 3)) return tgtok::XSRL
;
412 if (Len
== 3 && !memcmp(Start
, "shl", 3)) return tgtok::XSHL
;
413 if (Len
== 9 && !memcmp(Start
, "strconcat", 9)) return tgtok::XStrConcat
;
415 return ReturnError(Start
-1, "Unknown operator");