//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Our grammar of the linker script is LL(2), meaning that it needs at
// most two-token lookahead to parse. The only place we need two-token
// lookahead is labels in version scripts, where we need to parse "local :"
// as if "local:".
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
//
//===----------------------------------------------------------------------===//
#include "ScriptLexer.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;
using namespace lld;
using namespace lld::elf;
44 // Returns a whole line containing the current token.
45 StringRef
ScriptLexer::getLine() {
46 StringRef s
= getCurrentMB().getBuffer();
47 StringRef tok
= tokens
[pos
- 1];
49 size_t pos
= s
.rfind('\n', tok
.data() - s
.data());
50 if (pos
!= StringRef::npos
)
51 s
= s
.substr(pos
+ 1);
52 return s
.substr(0, s
.find_first_of("\r\n"));
55 // Returns 1-based line number of the current token.
56 size_t ScriptLexer::getLineNumber() {
59 StringRef s
= getCurrentMB().getBuffer();
60 StringRef tok
= tokens
[pos
- 1];
61 const size_t tokOffset
= tok
.data() - s
.data();
63 // For the first token, or when going backwards, start from the beginning of
64 // the buffer. If this token is after the previous token, start from the
68 if (lastLineNumberOffset
> 0 && tokOffset
>= lastLineNumberOffset
) {
69 start
= lastLineNumberOffset
;
70 line
= lastLineNumber
;
73 line
+= s
.substr(start
, tokOffset
- start
).count('\n');
75 // Store the line number of this token for reuse.
76 lastLineNumberOffset
= tokOffset
;
77 lastLineNumber
= line
;
82 // Returns 0-based column number of the current token.
83 size_t ScriptLexer::getColumnNumber() {
84 StringRef tok
= tokens
[pos
- 1];
85 return tok
.data() - getLine().data();
88 std::string
ScriptLexer::getCurrentLocation() {
89 std::string filename
= std::string(getCurrentMB().getBufferIdentifier());
90 return (filename
+ ":" + Twine(getLineNumber())).str();
93 ScriptLexer::ScriptLexer(MemoryBufferRef mb
) { tokenize(mb
); }
95 // We don't want to record cascading errors. Keep only the first one.
96 void ScriptLexer::setError(const Twine
&msg
) {
100 std::string s
= (getCurrentLocation() + ": " + msg
).str();
102 s
+= "\n>>> " + getLine().str() + "\n>>> " +
103 std::string(getColumnNumber(), ' ') + "^";
107 // Split S into linker script tokens.
108 void ScriptLexer::tokenize(MemoryBufferRef mb
) {
109 std::vector
<StringRef
> vec
;
111 StringRef s
= mb
.getBuffer();
119 // Quoted token. Note that double-quote characters are parts of a token
120 // because, in a glob match context, only unquoted tokens are interpreted
121 // as glob patterns. Double-quoted tokens are literal patterns in that
123 if (s
.starts_with("\"")) {
124 size_t e
= s
.find("\"", 1);
125 if (e
== StringRef::npos
) {
126 StringRef filename
= mb
.getBufferIdentifier();
127 size_t lineno
= begin
.substr(0, s
.data() - begin
.data()).count('\n');
128 error(filename
+ ":" + Twine(lineno
+ 1) + ": unclosed quote");
132 vec
.push_back(s
.take_front(e
+ 1));
137 // Some operators form separate tokens.
138 if (s
.starts_with("<<=") || s
.starts_with(">>=")) {
139 vec
.push_back(s
.substr(0, 3));
143 if (s
.size() > 1 && ((s
[1] == '=' && strchr("*/+-<>&^|", s
[0])) ||
144 (s
[0] == s
[1] && strchr("<>&|", s
[0])))) {
145 vec
.push_back(s
.substr(0, 2));
150 // Unquoted token. This is more relaxed than tokens in C-like language,
151 // so that you can write "file-name.cpp" as one bare token, for example.
152 size_t pos
= s
.find_first_not_of(
153 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
154 "0123456789_.$/\\~=+[]*?-!^:");
156 // A character that cannot start a word (which is usually a
157 // punctuation) forms a single character token.
160 vec
.push_back(s
.substr(0, pos
));
164 tokens
.insert(tokens
.begin() + pos
, vec
.begin(), vec
.end());
167 // Skip leading whitespace characters or comments.
168 StringRef
ScriptLexer::skipSpace(StringRef s
) {
170 if (s
.starts_with("/*")) {
171 size_t e
= s
.find("*/", 2);
172 if (e
== StringRef::npos
) {
173 setError("unclosed comment in a linker script");
179 if (s
.starts_with("#")) {
180 size_t e
= s
.find('\n', 1);
181 if (e
== StringRef::npos
)
186 size_t size
= s
.size();
188 if (s
.size() == size
)
193 // An erroneous token is handled as if it were the last token before EOF.
194 bool ScriptLexer::atEOF() { return errorCount() || tokens
.size() == pos
; }
196 // Split a given string as an expression.
197 // This function returns "3", "*" and "5" for "3*5" for example.
198 static std::vector
<StringRef
> tokenizeExpr(StringRef s
) {
199 StringRef ops
= "!~*/+-<>?^:="; // List of operators
201 // Quoted strings are literal strings, so we don't want to split it.
202 if (s
.starts_with("\""))
205 // Split S with operators as separators.
206 std::vector
<StringRef
> ret
;
208 size_t e
= s
.find_first_of(ops
);
210 // No need to split if there is no operator.
211 if (e
== StringRef::npos
) {
216 // Get a token before the operator.
218 ret
.push_back(s
.substr(0, e
));
220 // Get the operator as a token.
221 // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
222 if (s
.substr(e
).starts_with("!=") || s
.substr(e
).starts_with("==") ||
223 s
.substr(e
).starts_with(">=") || s
.substr(e
).starts_with("<=") ||
224 s
.substr(e
).starts_with("<<") || s
.substr(e
).starts_with(">>")) {
225 ret
.push_back(s
.substr(e
, 2));
228 ret
.push_back(s
.substr(e
, 1));
235 // In contexts where expressions are expected, the lexer should apply
236 // different tokenization rules than the default one. By default,
237 // arithmetic operator characters are regular characters, but in the
238 // expression context, they should be independent tokens.
240 // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
241 // in the expression context.
243 // This function may split the current token into multiple tokens.
244 void ScriptLexer::maybeSplitExpr() {
245 if (!inExpr
|| errorCount() || atEOF())
248 std::vector
<StringRef
> v
= tokenizeExpr(tokens
[pos
]);
251 tokens
.erase(tokens
.begin() + pos
);
252 tokens
.insert(tokens
.begin() + pos
, v
.begin(), v
.end());
255 StringRef
ScriptLexer::next() {
261 setError("unexpected EOF");
264 return tokens
[pos
++];
267 StringRef
ScriptLexer::peek() {
268 StringRef tok
= next();
275 StringRef
ScriptLexer::peek2() {
277 StringRef tok
= next();
284 bool ScriptLexer::consume(StringRef tok
) {
292 // Consumes Tok followed by ":". Space is allowed between Tok and ":".
293 bool ScriptLexer::consumeLabel(StringRef tok
) {
294 if (consume((tok
+ ":").str()))
296 if (tokens
.size() >= pos
+ 2 && tokens
[pos
] == tok
&&
297 tokens
[pos
+ 1] == ":") {
304 void ScriptLexer::skip() { (void)next(); }
306 void ScriptLexer::expect(StringRef expect
) {
309 StringRef tok
= next();
311 setError(expect
+ " expected, but got " + tok
);
314 // Returns true if S encloses T.
315 static bool encloses(StringRef s
, StringRef t
) {
316 return s
.bytes_begin() <= t
.bytes_begin() && t
.bytes_end() <= s
.bytes_end();
319 MemoryBufferRef
ScriptLexer::getCurrentMB() {
320 // Find input buffer containing the current token.
321 assert(!mbs
.empty());
324 for (MemoryBufferRef mb
: mbs
)
325 if (encloses(mb
.getBuffer(), tokens
[pos
- 1]))
327 llvm_unreachable("getCurrentMB: failed to find a token");