1 //===- ScriptLexer.cpp ----------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines a lexer for the linker script.
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
16 // Among simplicity, compatibility and efficiency, we put the most
17 // emphasis on simplicity when we wrote this lexer. Compatibility with the
18 // GNU linkers is important, but we did not try to clone every tiny corner
19 // case of their lexers, as even ld.bfd and ld.gold are subtly different
20 // in various corner cases. We do not care much about efficiency because
21 // the time spent in parsing linker scripts is usually negligible.
23 // Overall, this lexer works fine for most linker scripts. There might
24 // be room for improving compatibility, but that's probably not at the
25 // top of our todo list.
27 //===----------------------------------------------------------------------===//
29 #include "ScriptLexer.h"
30 #include "lld/Common/ErrorHandler.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Support/ErrorHandling.h"
37 using namespace lld::elf
;
39 // Returns a whole line containing the current token.
40 StringRef
ScriptLexer::getLine() {
41 StringRef s
= getCurrentMB().getBuffer();
42 StringRef tok
= tokens
[pos
- 1];
44 size_t pos
= s
.rfind('\n', tok
.data() - s
.data());
45 if (pos
!= StringRef::npos
)
46 s
= s
.substr(pos
+ 1);
47 return s
.substr(0, s
.find_first_of("\r\n"));
50 // Returns 1-based line number of the current token.
51 size_t ScriptLexer::getLineNumber() {
54 StringRef s
= getCurrentMB().getBuffer();
55 StringRef tok
= tokens
[pos
- 1];
56 const size_t tokOffset
= tok
.data() - s
.data();
58 // For the first token, or when going backwards, start from the beginning of
59 // the buffer. If this token is after the previous token, start from the
63 if (lastLineNumberOffset
> 0 && tokOffset
>= lastLineNumberOffset
) {
64 start
= lastLineNumberOffset
;
65 line
= lastLineNumber
;
68 line
+= s
.substr(start
, tokOffset
- start
).count('\n');
70 // Store the line number of this token for reuse.
71 lastLineNumberOffset
= tokOffset
;
72 lastLineNumber
= line
;
77 // Returns 0-based column number of the current token.
78 size_t ScriptLexer::getColumnNumber() {
79 StringRef tok
= tokens
[pos
- 1];
80 return tok
.data() - getLine().data();
83 std::string
ScriptLexer::getCurrentLocation() {
84 std::string filename
= std::string(getCurrentMB().getBufferIdentifier());
85 return (filename
+ ":" + Twine(getLineNumber())).str();
88 ScriptLexer::ScriptLexer(MemoryBufferRef mb
) { tokenize(mb
); }
90 // We don't want to record cascading errors. Keep only the first one.
91 void ScriptLexer::setError(const Twine
&msg
) {
95 std::string s
= (getCurrentLocation() + ": " + msg
).str();
97 s
+= "\n>>> " + getLine().str() + "\n>>> " +
98 std::string(getColumnNumber(), ' ') + "^";
102 // Split S into linker script tokens.
103 void ScriptLexer::tokenize(MemoryBufferRef mb
) {
104 std::vector
<StringRef
> vec
;
106 StringRef s
= mb
.getBuffer();
114 // Quoted token. Note that double-quote characters are parts of a token
115 // because, in a glob match context, only unquoted tokens are interpreted
116 // as glob patterns. Double-quoted tokens are literal patterns in that
118 if (s
.starts_with("\"")) {
119 size_t e
= s
.find("\"", 1);
120 if (e
== StringRef::npos
) {
121 StringRef filename
= mb
.getBufferIdentifier();
122 size_t lineno
= begin
.substr(0, s
.data() - begin
.data()).count('\n');
123 error(filename
+ ":" + Twine(lineno
+ 1) + ": unclosed quote");
127 vec
.push_back(s
.take_front(e
+ 1));
132 // Some operators form separate tokens.
133 if (s
.starts_with("<<=") || s
.starts_with(">>=")) {
134 vec
.push_back(s
.substr(0, 3));
138 if (s
.size() > 1 && ((s
[1] == '=' && strchr("*/+-<>&^|", s
[0])) ||
139 (s
[0] == s
[1] && strchr("<>&|", s
[0])))) {
140 vec
.push_back(s
.substr(0, 2));
145 // Unquoted token. This is more relaxed than tokens in C-like language,
146 // so that you can write "file-name.cpp" as one bare token, for example.
147 size_t pos
= s
.find_first_not_of(
148 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
149 "0123456789_.$/\\~=+[]*?-!^:");
151 // A character that cannot start a word (which is usually a
152 // punctuation) forms a single character token.
155 vec
.push_back(s
.substr(0, pos
));
159 tokens
.insert(tokens
.begin() + pos
, vec
.begin(), vec
.end());
162 // Skip leading whitespace characters or comments.
163 StringRef
ScriptLexer::skipSpace(StringRef s
) {
165 if (s
.starts_with("/*")) {
166 size_t e
= s
.find("*/", 2);
167 if (e
== StringRef::npos
) {
168 setError("unclosed comment in a linker script");
174 if (s
.starts_with("#")) {
175 size_t e
= s
.find('\n', 1);
176 if (e
== StringRef::npos
)
181 size_t size
= s
.size();
183 if (s
.size() == size
)
188 // An erroneous token is handled as if it were the last token before EOF.
189 bool ScriptLexer::atEOF() { return errorCount() || tokens
.size() == pos
; }
191 // Split a given string as an expression.
192 // This function returns "3", "*" and "5" for "3*5" for example.
193 static std::vector
<StringRef
> tokenizeExpr(StringRef s
) {
194 StringRef ops
= "!~*/+-<>?^:="; // List of operators
196 // Quoted strings are literal strings, so we don't want to split it.
197 if (s
.starts_with("\""))
200 // Split S with operators as separators.
201 std::vector
<StringRef
> ret
;
203 size_t e
= s
.find_first_of(ops
);
205 // No need to split if there is no operator.
206 if (e
== StringRef::npos
) {
211 // Get a token before the operator.
213 ret
.push_back(s
.substr(0, e
));
215 // Get the operator as a token.
216 // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
217 if (s
.substr(e
).starts_with("!=") || s
.substr(e
).starts_with("==") ||
218 s
.substr(e
).starts_with(">=") || s
.substr(e
).starts_with("<=") ||
219 s
.substr(e
).starts_with("<<") || s
.substr(e
).starts_with(">>")) {
220 ret
.push_back(s
.substr(e
, 2));
223 ret
.push_back(s
.substr(e
, 1));
230 // In contexts where expressions are expected, the lexer should apply
231 // different tokenization rules than the default one. By default,
232 // arithmetic operator characters are regular characters, but in the
233 // expression context, they should be independent tokens.
235 // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
236 // in the expression context.
238 // This function may split the current token into multiple tokens.
239 void ScriptLexer::maybeSplitExpr() {
240 if (!inExpr
|| errorCount() || atEOF())
243 std::vector
<StringRef
> v
= tokenizeExpr(tokens
[pos
]);
246 tokens
.erase(tokens
.begin() + pos
);
247 tokens
.insert(tokens
.begin() + pos
, v
.begin(), v
.end());
250 StringRef
ScriptLexer::next() {
256 setError("unexpected EOF");
259 return tokens
[pos
++];
262 StringRef
ScriptLexer::peek() {
263 StringRef tok
= next();
270 bool ScriptLexer::consume(StringRef tok
) {
277 // Consumes Tok followed by ":". Space is allowed between Tok and ":".
278 bool ScriptLexer::consumeLabel(StringRef tok
) {
279 if (consume((tok
+ ":").str()))
281 if (tokens
.size() >= pos
+ 2 && tokens
[pos
] == tok
&&
282 tokens
[pos
+ 1] == ":") {
289 void ScriptLexer::skip() { (void)next(); }
291 void ScriptLexer::expect(StringRef expect
) {
294 StringRef tok
= next();
296 setError(expect
+ " expected, but got " + tok
);
299 // Returns true if S encloses T.
300 static bool encloses(StringRef s
, StringRef t
) {
301 return s
.bytes_begin() <= t
.bytes_begin() && t
.bytes_end() <= s
.bytes_end();
304 MemoryBufferRef
ScriptLexer::getCurrentMB() {
305 // Find input buffer containing the current token.
306 assert(!mbs
.empty());
309 for (MemoryBufferRef mb
: mbs
)
310 if (encloses(mb
.getBuffer(), tokens
[pos
- 1]))
312 llvm_unreachable("getCurrentMB: failed to find a token");