1 //===- ScriptLexer.cpp ----------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines a lexer for the linker script.
11 // The linker script's grammar is not complex but ambiguous due to the
12 // lack of the formal specification of the language. What we are trying to
13 // do in this and other files in LLD is to make a "reasonable" linker script processor.
16 // Among simplicity, compatibility and efficiency, we put the most
17 // emphasis on simplicity when we wrote this lexer. Compatibility with the
18 // GNU linkers is important, but we did not try to clone every tiny corner
19 // case of their lexers, as even ld.bfd and ld.gold are subtly different
20 // in various corner cases. We do not care much about efficiency because
21 // the time spent in parsing linker scripts is usually negligible.
23 // Overall, this lexer works fine for most linker scripts. There might
24 // be room for improving compatibility, but that's probably not at the
25 // top of our todo list.
27 //===----------------------------------------------------------------------===//
29 #include "ScriptLexer.h"
31 #include "lld/Common/ErrorHandler.h"
32 #include "llvm/ADT/Twine.h"
33 #include "llvm/Support/ErrorHandling.h"
34 #include "llvm/Support/FileSystem.h"
35 #include "llvm/Support/Path.h"
40 using namespace lld::elf
;
// Builds the per-buffer lexer state for `mb`: `s` is initialized with the
// buffer text, `filename` with the buffer identifier and `begin` with the
// start of the buffer. The body then compares the script's path and its
// parent paths against `config->sysroot`.
42 ScriptLexer::Buffer::Buffer(MemoryBufferRef mb
)
43 : s(mb
.getBuffer()), filename(mb
.getBufferIdentifier()),
44 begin(mb
.getBufferStart()) {
// Check for an empty sysroot setting.
// NOTE(review): the statement guarded by this check is not visible in this
// view -- presumably an early return when no sysroot is configured; confirm.
45 if (config
->sysroot
== "")
// Start from the script's own path and step to the parent path on every
// iteration until the path becomes empty.
47 StringRef path
= filename
;
48 for (; !path
.empty(); path
= sys::path::parent_path(path
)) {
// Ask the file system whether `path` and the sysroot are equivalent.
// NOTE(review): the statement guarded by this negated check is not visible
// here -- presumably it moves on to the next parent; confirm.
49 if (!sys::fs::equivalent(config
->sysroot
, path
))
// Record that the script lives under the sysroot.
51 isUnderSysroot
= true;
56 ScriptLexer::ScriptLexer(MemoryBufferRef mb
) : curBuf(mb
), mbs(1, mb
) {
57 activeFilenames
.insert(mb
.getBufferIdentifier());
60 // Returns a whole line containing the current token.
61 StringRef
ScriptLexer::getLine() {
62 StringRef s
= getCurrentMB().getBuffer();
64 size_t pos
= s
.rfind('\n', prevTok
.data() - s
.data());
65 if (pos
!= StringRef::npos
)
66 s
= s
.substr(pos
+ 1);
67 return s
.substr(0, s
.find_first_of("\r\n"));
70 // Returns 0-based column number of the current token.
71 size_t ScriptLexer::getColumnNumber() {
72 return prevTok
.data() - getLine().data();
75 std::string
ScriptLexer::getCurrentLocation() {
76 std::string filename
= std::string(getCurrentMB().getBufferIdentifier());
77 return (filename
+ ":" + Twine(prevTokLine
)).str();
80 // We don't want to record cascading errors. Keep only the first one.
// The diagnostic text starts with "<location>: <msg>", where the location
// comes from getCurrentLocation(). The visible code then appends the source
// line from getLine() and a '^' marker indented by getColumnNumber() spaces,
// each on its own ">>> "-prefixed line.
// NOTE(review): the conditions guarding these steps and the call that reports
// `s` are not visible in this view; confirm against the full file.
81 void ScriptLexer::setError(const Twine
&msg
) {
// Location prefix followed by the caller's message.
85 std::string s
= (getCurrentLocation() + ": " + msg
).str();
// Source line plus a caret placed under the token's column.
87 s
+= "\n>>> " + getLine().str() + "\n>>> " +
88 std::string(getColumnNumber(), ' ') + "^";
// Cuts the next token out of the current buffer text (`curBuf.s`) and stores
// it in `curTok`; `curTokState` records the `inExpr` mode in effect when the
// token was cut. The visible cases are quoted tokens, the three-character
// shift-assignment operators, two-character "<op>=" operators and unquoted
// tokens delimited by a character set.
// NOTE(review): several statements of this function (the surrounding loop,
// the returns, the updates of `s` after a token is cut and some declarations)
// are not visible in this view; the comments below describe only what is.
92 void ScriptLexer::lex() {
// `s` aliases the unread text of the current buffer.
94 StringRef
&s
= curBuf
.s
;
97 // If this buffer is from an INCLUDE command, switch to the "return
98 // value"; otherwise, mark EOF.
99 if (buffers
.empty()) {
// Leave the current buffer: drop its filename from `activeFilenames` and
// continue with the buffer popped from the back of `buffers`.
103 activeFilenames
.erase(curBuf
.filename
);
104 curBuf
= buffers
.pop_back_val();
// Remember the mode this token is lexed in; peek() compares it with `inExpr`.
107 curTokState
= inExpr
;
109 // Quoted token. Note that double-quote characters are parts of a token
110 // because, in a glob match context, only unquoted tokens are interpreted
111 // as glob patterns. Double-quoted tokens are literal patterns in that context.
113 if (s
.starts_with("\"")) {
// Find the closing quote, starting after the opening one.
114 size_t e
= s
.find("\"", 1);
115 if (e
== StringRef::npos
) {
// No closing quote: count the newlines between the start of the buffer and
// the opening quote (presumably stored in `lineno`; its declaration is not
// visible here) and report the 1-based line in the diagnostic.
117 StringRef(curBuf
.begin
, s
.data() - curBuf
.begin
).count('\n');
118 error(curBuf
.filename
+ ":" + Twine(lineno
+ 1) + ": unclosed quote");
// The token spans both quote characters.
122 curTok
= s
.take_front(e
+ 1);
127 // Some operators form separate tokens.
128 if (s
.starts_with("<<=") || s
.starts_with(">>=")) {
129 curTok
= s
.substr(0, 3);
// Two-character operators: one of "+-*/!&^|" immediately followed by '='.
133 if (s
.size() > 1 && (s
[1] == '=' && strchr("+-*/!&^|", s
[0]))) {
134 curTok
= s
.substr(0, 2);
139 // Unquoted token. The non-expression token is more relaxed than tokens in
140 // C-like languages, so that you can write "file-name.cpp" as one bare token.
// `pos` is the index of the first character outside the accepted set.
// NOTE(review): the declaration of `pos`, the condition choosing between the
// two character sets and the rest of this first set are not visible here.
144 pos
= s
.find_first_not_of(
145 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
// No accepted character at the front, but the first two characters are a
// doubled one of "<>&|" or one of ==, !=, <=, >=, <<, >>.
// NOTE(review): the guarded statement is not visible -- presumably it makes
// the token two characters long; confirm.
147 if (pos
== 0 && s
.size() >= 2 &&
148 ((s
[0] == s
[1] && strchr("<>&|", s
[0])) ||
149 is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s
.substr(0, 2))))
// Broader character set that also accepts path, glob and operator characters.
152 pos
= s
.find_first_not_of(
153 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
154 "0123456789_.$/\\~=+[]*?-!^:");
// The token is the first `pos` characters of the unread text.
159 curTok
= s
.substr(0, pos
);
165 // Skip leading whitespace characters or comments.
// Takes the unread text `s` and returns a StringRef. The visible code handles
// "/* ... */" block comments and "#" line comments, and adds the number of
// skipped newlines to `curBuf.lineNumber`.
// NOTE(review): the surrounding loop, the return statements, the statements
// that advance `s` and the declaration of `saved` are not visible in this
// view; confirm details against the full file.
166 StringRef
ScriptLexer::skipSpace(StringRef s
) {
168 if (s
.starts_with("/*")) {
// Look for the terminator after the two-character opener.
169 size_t e
= s
.find("*/", 2);
170 if (e
== StringRef::npos
) {
171 setError("unclosed comment in a linker script");
// Account for the newlines inside the comment body.
174 curBuf
.lineNumber
+= s
.substr(0, e
).count('\n');
178 if (s
.starts_with("#")) {
// A '#' comment runs to the next newline; `npos` means there is none.
179 size_t e
= s
.find('\n', 1);
180 if (e
== StringRef::npos
)
// `len` is how much shorter `s` is than `saved`.
// NOTE(review): `saved` is presumably a copy of `s` taken before whitespace
// was trimmed; its definition is not visible here.
189 auto len
= saved
.size() - s
.size();
// Count the newlines within the first `len` characters of `saved`.
192 curBuf
.lineNumber
+= saved
.substr(0, len
).count('\n');
196 // Used to determine whether to stop parsing. Treat errors like EOF.
197 bool ScriptLexer::atEOF() { return eof
|| errorCount(); }
// Returns the token currently held in `curTok` and replaces it with an empty
// StringRef positioned at the start of the unread text (`curBuf.s`). The
// visible code also copies `curBuf.lineNumber` into `prevTokLine`.
// NOTE(review): the statements between the visible lines (including whatever
// guards the `prevTokLine` update described by the comment below) are not
// visible in this view; confirm against the full file.
199 StringRef
ScriptLexer::next() {
201 // `prevTokLine` is not updated for EOF so that the line number in `setError`
202 // will be more useful.
204 prevTokLine
= curBuf
.lineNumber
;
// std::exchange returns the old `curTok` and stores the empty placeholder.
205 return std::exchange(curTok
, StringRef(curBuf
.s
.data(), 0));
// Returns a StringRef for the upcoming token. The visible part deals with a
// cached token that was lexed under a different `inExpr` mode.
// NOTE(review): the remainder of the body (including the return statement) is
// not visible in this view; presumably the token is re-lexed and `curTok`
// returned -- confirm.
208 StringRef
ScriptLexer::peek() {
209 // curTok is invalid if curTokState and inExpr mismatch.
210 if (curTok
.size() && curTokState
!= inExpr
) {
// Rewind the unread text so that it starts at the stale token again: the new
// `curBuf.s` runs from the first byte of `curTok` to the previous end of
// `curBuf.s`.
211 curBuf
.s
= StringRef(curTok
.data(), curBuf
.s
.end() - curTok
.data());
// Takes a token text `tok` and returns a bool.
// NOTE(review): only the signature is visible in this view. Presumably this
// consumes the next token when it equals `tok` and reports whether it did --
// confirm against the full file.
219 bool ScriptLexer::consume(StringRef tok
) {
226 void ScriptLexer::skip() { (void)next(); }
// Reads the next token with next() and reports an error through setError():
// either "unexpected EOF" or "<expect> expected, but got <tok>".
// NOTE(review): the conditions selecting between these outcomes (and any
// early return on success) are not visible in this view; presumably an error
// is raised only when the token differs from `expect` -- confirm.
228 void ScriptLexer::expect(StringRef expect
) {
231 StringRef tok
= next();
234 setError("unexpected EOF");
236 setError(expect
+ " expected, but got " + tok
);
// Reads the next token with next() and returns a ScriptLexer::Token; the
// visible code can report "unexpected EOF" through setError().
// NOTE(review): the comparisons against `tok`, the EOF check and the return
// statements are not visible in this view; presumably this yields tokens
// until `tok` is reached -- confirm against the full file.
240 ScriptLexer::Token
ScriptLexer::till(StringRef tok
) {
241 StringRef str
= next();
247 setError("unexpected EOF");
251 // Returns true if S encloses T.
252 static bool encloses(StringRef s
, StringRef t
) {
253 return s
.bytes_begin() <= t
.bytes_begin() && t
.bytes_end() <= s
.bytes_end();
// Looks through `mbs` for the memory buffer whose text encloses the unread
// text of the current buffer (`curBuf.s`). Reaching the end of the loop
// without a match is treated as unreachable.
// NOTE(review): the statement executed on a match is not visible in this
// view -- presumably it returns `mb`; confirm.
256 MemoryBufferRef
ScriptLexer::getCurrentMB() {
257 // Find input buffer containing the current token.
// At least one buffer must have been registered.
258 assert(!mbs
.empty());
259 for (MemoryBufferRef mb
: mbs
)
260 if (encloses(mb
.getBuffer(), curBuf
.s
))
262 llvm_unreachable("getCurrentMB: failed to find a token");