1 //===- ExportTrie.cpp -----------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This is a partial implementation of the Mach-O export trie format. It's
10 // essentially a symbol table encoded as a compressed prefix trie, meaning that
11 // the common prefixes of each symbol name are shared for a more compact
12 // representation. The prefixes are stored on the edges of the trie, and one
13 // edge can represent multiple characters. For example, given two exported
14 // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15 // marked with an asterisk):
32 // More documentation of the format can be found in
33 // llvm/tools/obj2yaml/macho2yaml.cpp.
35 //===----------------------------------------------------------------------===//
37 #include "ExportTrie.h"
40 #include "lld/Common/ErrorHandler.h"
41 #include "lld/Common/Memory.h"
42 #include "llvm/BinaryFormat/MachO.h"
43 #include "llvm/Support/LEB128.h"
48 using namespace lld::macho
;
// Edge constructor: records the edge label `s` in `substring` and the target
// node in `child`.
// NOTE(review): the enclosing struct header and the declaration of
// `substring` are not visible in this listing.
53 Edge(StringRef s
, TrieNode
*node
) : substring(s
), child(node
) {}
// Node reached by following this edge.
56 struct TrieNode
*child
;
// Builds the export record for `sym`. The stored address is the symbol's VA
// made relative to `imageBase`.
63 ExportInfo(const Symbol
&sym
, uint64_t imageBase
)
64 : address(sym
.getVA() - imageBase
) {
65 using namespace llvm::MachO
;
// NOTE(review): the conditions guarding the next two flag updates are not
// visible in this listing; presumably they test for weak definitions and
// thread-local symbols respectively -- confirm against the full file.
67 flags
|= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION
;
69 flags
|= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL
;
70 // TODO: Add proper support for stub-and-resolver flags.
// Symbols that are `Defined`: only absolute ones get an additional kind flag.
72 if (auto *defined
= dyn_cast
<Defined
>(&sym
)) {
73 if (defined
->isAbsolute())
74 flags
|= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE
;
// Symbols that are `DylibSymbol`s are always flagged as re-exports.
75 } else if (auto *dysym
= dyn_cast
<DylibSymbol
>(&sym
)) {
76 flags
|= EXPORT_SYMBOL_FLAGS_REEXPORT
;
// Only symbols that are not dynamic-lookup record the ordinal of their dylib
// file.
77 if (!dysym
->isDynamicLookup())
78 ordinal
= dysym
->getFile()->ordinal
;
// A node of the export trie.
// NOTE(review): the declaration of the `offset` member described by the
// comment below, and the struct's closing brace, are not visible in this
// listing.
85 struct macho::TrieNode
{
// Outgoing edges to this node's children.
86 std::vector
<Edge
> edges
;
// Export data for the symbol that terminates at this node; left empty for
// nodes that carry no symbol info.
87 std::optional
<ExportInfo
> info
;
88 // Estimated offset from the start of the serialized trie to the current node.
89 // This will converge to the true offset when updateOffset() is run to a
// fixpoint.
// Size in bytes of the terminal payload computed from `info`.
93 uint32_t getTerminalSize() const;
94 // Returns whether the new estimated offset differs from the old one.
95 bool updateOffset(size_t &nextOffset
);
// Serializes this node into `buf`.
96 void writeTo(uint8_t *buf
) const;
99 // For regular symbols, the node layout (excluding the children) is
101 // uleb128 terminalSize;
// uleb128 flags;
// uleb128 address;
105 // For re-exported symbols, the layout is
107 // uleb128 terminalSize;
// uleb128 flags;
// uleb128 ordinal;
110 // char[] originalName;
112 // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias
113 // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo
114 // re-exports _bar directly (i.e. not via an alias), then originalName will be
// the empty string.
117 // TODO: Support aliased re-exports. (Since we don't yet support these,
118 // originalName will always be the empty string.)
120 // For stub-and-resolver nodes, the layout is
122 // uleb128 terminalSize;
124 // uleb128 stubAddress;
125 // uleb128 resolverAddress;
127 // TODO: Support stub-and-resolver nodes.
// Computes the byte size of this node's terminal payload by summing the
// ULEB128-encoded sizes of its fields.
// `info` is dereferenced unconditionally, so this must only be called on nodes
// that carry export info.
// NOTE(review): the final lines of this function (including the return) are
// not visible in this listing.
128 uint32_t TrieNode::getTerminalSize() const {
129 uint32_t size
= getULEB128Size(info
->flags
);
// Re-exports store the dylib ordinal followed by a null-terminated name.
130 if (info
->flags
& MachO::EXPORT_SYMBOL_FLAGS_REEXPORT
)
131 size
+= getULEB128Size(info
->ordinal
) + 1; // + 1 for the null-terminator
// NOTE(review): one line is missing between the statement above and the one
// below; presumably the address is only counted for symbols that are not
// re-exports -- confirm against the full file.
133 size
+= getULEB128Size(info
->address
);
// Recomputes this node's serialized size and advances the running stream
// position past it.
//
// `nextOffset` is an in/out parameter: on entry it is the candidate offset for
// this node; on exit it has been increased by this node's size.
// The visible code computes `result` as "candidate offset differs from the
// stored `offset`".
// NOTE(review): several lines of this function are not visible in this
// listing: the declaration of `nodeSize`, the branch that separates the
// terminal and non-terminal cases, the store into `offset` and the return.
137 bool TrieNode::updateOffset(size_t &nextOffset
) {
138 // Size of the whole node (including the terminalSize and the outgoing edges.)
139 // In contrast, terminalSize only records the size of the other data in the
// node.
143 uint32_t terminalSize
= getTerminalSize();
144 // Overall node size so far is the uleb128 size of the length of the symbol
145 // info + the symbol info itself.
146 nodeSize
= terminalSize
+ getULEB128Size(terminalSize
);
148 nodeSize
= 1; // Size of terminalSize (which has a value of 0)
150 // Compute size of all child edges.
151 ++nodeSize
; // Byte for number of children.
// Each edge contributes its label plus one extra byte, and the ULEB128 size of
// the child's current estimated offset. Because that last term depends on the
// children's estimates, the node size can change from one pass to the next.
152 for (const Edge
&edge
: edges
) {
153 nodeSize
+= edge
.substring
.size() + 1 // String length.
154 + getULEB128Size(edge
.child
->offset
); // Offset len.
156 // On input, 'nextOffset' is the new preferred location for this node.
157 bool result
= (offset
!= nextOffset
);
158 // Store new location in node object for use by parents.
160 nextOffset
+= nodeSize
;
// Serializes this node into `buf`: the terminal payload (or a zero
// terminalSize byte), then the child count, then one record per outgoing edge
// consisting of the edge label and the child's offset.
// NOTE(review): several lines of this function are not visible in this
// listing, including the branches that separate the terminal case, the
// re-export case and the no-symbol-info case.
164 void TrieNode::writeTo(uint8_t *buf
) const {
// Payload size first, then the flags, both ULEB128-encoded.
167 uint32_t terminalSize
= getTerminalSize();
168 buf
+= encodeULEB128(terminalSize
, buf
);
169 buf
+= encodeULEB128(info
->flags
, buf
);
// Re-exports carry the dylib ordinal and a (currently always empty)
// null-terminated original name.
170 if (info
->flags
& MachO::EXPORT_SYMBOL_FLAGS_REEXPORT
) {
171 buf
+= encodeULEB128(info
->ordinal
, buf
);
172 *buf
++ = 0; // empty originalName string
// The symbol address is ULEB128-encoded.
174 buf
+= encodeULEB128(info
->address
, buf
);
177 // TrieNode with no Symbol info.
178 *buf
++ = 0; // terminalSize
180 // Add number of children. TODO: Handle case where we have more than 256.
// The child count is stored in a single byte, hence the assertion.
181 assert(edges
.size() < 256);
182 *buf
++ = edges
.size();
183 // Append each child edge substring and node offset.
184 for (const Edge
&edge
: edges
) {
185 memcpy(buf
, edge
.substring
.data(), edge
.substring
.size());
186 buf
+= edge
.substring
.size();
// NOTE(review): one line is missing here in this listing; presumably it writes
// the label's null terminator, since updateOffset() budgets size() + 1 bytes
// per label -- confirm against the full file.
188 buf
+= encodeULEB128(edge
.child
->offset
, buf
);
// Destructor: walks every node recorded in `nodes`.
// NOTE(review): the loop body is not visible in this listing; presumably it
// releases the nodes that makeNode() allocated with `new` -- confirm.
192 TrieBuilder::~TrieBuilder() {
193 for (TrieNode
*node
: nodes
)
// Allocates a fresh TrieNode with `new` and records it in `nodes`, the list
// that the destructor, build() and writeTo() iterate over.
// NOTE(review): the return statement is not visible in this listing;
// presumably the new node is returned -- confirm.
197 TrieNode
*TrieBuilder::makeNode() {
198 auto *node
= new TrieNode();
199 nodes
.emplace_back(node
);
// Looks up position `pos` in the name of `sym`, with special handling for
// positions at or past the end of the name.
// NOTE(review): both return statements are not visible in this listing.
// sortAndBuild() treats a result of -1 as "end of string", so presumably -1 is
// returned when `pos` is out of range and the character otherwise -- confirm.
203 static int charAt(const Symbol
*sym
, size_t pos
) {
204 StringRef str
= sym
->getName();
205 if (pos
>= str
.size())
210 // Build the trie by performing a three-way radix quicksort: We start by sorting
211 // the strings by their first characters, then sort the strings with the same
212 // first characters by their second characters, and so on recursively. Each
213 // time the prefixes diverge, we add a node to the trie.
215 // node: The most recently created node along this path in the trie (i.e.
216 // the furthest from the root.)
217 // lastPos: The prefix length of the most recently created node, i.e. the number
218 // of characters along its path from the root.
219 // pos: The string index we are currently sorting on. Note that each symbol
220 // S contained in vec has the same prefix S[0...pos).
// Recursively partitions `vec` on the character at index `pos` and adds trie
// nodes under `node` wherever a symbol ends or the prefixes diverge.
// NOTE(review): many lines of this function are not visible in this listing,
// including the declaration of `i`, the comparisons that select between the
// two swaps, and the enclosing loop implied by the tail-call comment near the
// end.
221 void TrieBuilder::sortAndBuild(MutableArrayRef
<const Symbol
*> vec
,
222 TrieNode
*node
, size_t lastPos
, size_t pos
) {
227 // Partition items so that items in [0, i) are less than the pivot,
228 // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
// the pivot.
// The pivot character is taken from the middle element of the range.
230 const Symbol
*pivotSymbol
= vec
[vec
.size() / 2];
231 int pivot
= charAt(pivotSymbol
, pos
);
// `k` scans the unclassified region [k, j); it has no increment in the loop
// header because only some of the branches advance it.
233 size_t j
= vec
.size();
234 for (size_t k
= 0; k
< j
;) {
235 int c
= charAt(vec
[k
], pos
);
// Grows the front region [0, i) and advances the scan.
// NOTE(review): the condition selecting this branch is not visible;
// presumably `c` is less than the pivot.
237 std::swap(vec
[i
++], vec
[k
++]);
// Grows the back region [j, end). `k` is not advanced because the element
// swapped into position k has not been examined yet.
// NOTE(review): the condition selecting this branch is not visible;
// presumably `c` is greater than the pivot.
239 std::swap(vec
[--j
], vec
[k
]);
// A pivot of -1 is treated as the pivot symbol ending at `pos`.
244 bool isTerminal
= pivot
== -1;
// True when at least one symbol landed outside the "equal" region [i, j).
245 bool prefixesDiverge
= i
!= 0 || j
!= vec
.size();
// A new node is only needed if characters were consumed since the last node
// (lastPos != pos) and this position is a terminal or a branching point. The
// edge label is the slice [lastPos, pos) of the pivot symbol's name.
246 if (lastPos
!= pos
&& (isTerminal
|| prefixesDiverge
)) {
247 TrieNode
*newNode
= makeNode();
248 node
->edges
.emplace_back(pivotSymbol
->getName().slice(lastPos
, pos
),
// Recurse into the "less than" and "greater than" partitions at the same
// character position.
254 sortAndBuild(vec
.slice(0, i
), node
, lastPos
, pos
);
255 sortAndBuild(vec
.slice(j
), node
, lastPos
, pos
);
// Exactly one symbol may occupy the "equal" region here; its export info is
// attached to `node`.
// NOTE(review): the condition guarding these two statements is not visible;
// presumably they apply to the terminal case -- confirm.
258 assert(j
- i
== 1); // no duplicate symbols
259 node
->info
= ExportInfo(*pivotSymbol
, imageBase
);
261 // This is the tail-call-optimized version of the following:
262 // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
263 vec
= vec
.slice(i
, j
- i
);
// Builds the trie from `exported` and assigns every node its offset in the
// serialized stream.
// NOTE(review): the early-return statement for the empty case, the
// declarations of `more` and `offset`, the enclosing fixpoint loop and the
// final return are not visible in this listing.
269 size_t TrieBuilder::build() {
270 if (exported
.empty())
// The root node starts with an empty prefix: lastPos == pos == 0.
273 TrieNode
*root
= makeNode();
274 sortAndBuild(exported
, root
, 0, 0);
276 // Assign each node in the vector an offset in the trie stream, iterating
277 // until all uleb128 sizes have stabilized.
// One pass over all nodes; `more` accumulates whether any node's offset
// changed. updateOffset() advances `offset` past each node in turn.
283 for (TrieNode
*node
: nodes
)
284 more
|= node
->updateOffset(offset
);
// Serializes the trie into `buf` by visiting every node in `nodes`.
// NOTE(review): the loop body is not visible in this listing; presumably it
// calls TrieNode::writeTo() on each node -- confirm.
290 void TrieBuilder::writeTo(uint8_t *buf
) const {
291 for (TrieNode
*node
: nodes
)
297 // Parse a serialized trie and invoke a callback for each entry.
// NOTE(review): the class header, access specifiers and the declaration of
// the `end` member are not visible in this listing.
//
// The constructor takes the trie bytes as [buf, buf + size) and the callback
// to invoke. `end` is initialized from the member `start` rather than from the
// `buf` parameter, so this relies on `start` being initialized first.
300 TrieParser(const uint8_t *buf
, size_t size
, const TrieEntryCallback
&callback
)
301 : start(buf
), end(start
+ size
), callback(callback
) {}
// Parses the node located at `buf`; `cumulativeString` is the text
// accumulated from the edge labels on the way to that node.
303 void parse(const uint8_t *buf
, const Twine
&cumulativeString
);
// Entry point: parses from the first byte of the trie with an empty prefix.
305 void parse() { parse(start
, ""); }
// First byte of the serialized trie.
307 const uint8_t *start
;
// Held by reference, so the caller's callback object must outlive this parser.
309 const TrieEntryCallback
&callback
;
// Parses the node at `buf`, reports it through `callback` if it has a
// terminal payload, and then recurses into each of its children.
// `cumulativeString` is the concatenation of the edge labels leading here.
// NOTE(review): several lines of this function are not visible in this
// listing, including the condition guarding the fatal() call, the
// declarations of `ulebSize`, `flags` and `offset`, and the statements that
// advance `buf` after each ULEB128 decode.
314 void TrieParser::parse(const uint8_t *buf
, const Twine
&cumulativeString
) {
316 fatal("Node offset points outside export section");
319 uint64_t terminalSize
= decodeULEB128(buf
, &ulebSize
);
// A non-zero terminalSize means this node has a payload: decode its flags and
// report the accumulated name together with them.
323 if (terminalSize
!= 0) {
324 flags
= decodeULEB128(buf
, &ulebSize
);
325 callback(cumulativeString
, flags
);
// The child count is read as a single byte.
328 uint8_t numEdges
= *buf
++;
329 for (uint8_t i
= 0; i
< numEdges
; ++i
) {
// Edge labels are null-terminated; strnlen() is bounded by the bytes left
// before `end`, so the scan cannot run past the buffer.
330 const char *cbuf
= reinterpret_cast<const char *>(buf
);
331 StringRef substring
= StringRef(cbuf
, strnlen(cbuf
, end
- buf
));
// Skip the label and its terminator.
332 buf
+= substring
.size() + 1;
333 offset
= decodeULEB128(buf
, &ulebSize
);
// Child offsets are relative to the start of the trie.
335 parse(start
+ offset
, cumulativeString
+ substring
);
// Public entry point: parses the serialized trie stored in the `size` bytes at
// `buf`, passing `callback` to a temporary TrieParser.
// NOTE(review): the lines between the signature and the parser call are not
// visible in this listing.
339 void macho::parseTrie(const uint8_t *buf
, size_t size
,
340 const TrieEntryCallback
&callback
) {
344 TrieParser(buf
, size
, callback
).parse();