//===- BPSectionOrderer.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "BPSectionOrderer.h"
#include "InputSection.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/xxhash.h"

#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using namespace lld::macho;

using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;

/// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and
/// "yyyy" are numbers that could change between builds. We need to use the root
/// symbol name before this suffix so these symbols can be matched with profiles
/// which may have different suffixes.
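/// For example, a symbol named "_foo.__uniq.123.llvm.456" (hypothetical name)
/// is reduced to "_foo" so it can match a profile entry recorded from a build
/// where the numeric suffixes differed.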
static StringRef getRootSymbol(StringRef Name) {
  auto [P0, S0] = Name.rsplit(".llvm.");
  auto [P1, S1] = P0.rsplit(".__uniq.");
  return P1;
}

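// Hash a relocation's properties into a stable value. The key string below is
// an implementation detail: e.g. a relocation against section index 0x2 at
// offset 0x10 with addend 0 hashes the string "<kind>: 2 + 10 + 0".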
static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
                             uint64_t offset, uint64_t addend) {
  return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
                   Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
                      .str());
}

static uint64_t
getRelocHash(const Reloc &reloc,
             const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
  auto *isec = reloc.getReferentInputSection();
  std::optional<uint64_t> sectionIdx;
  auto sectionIdxIt = sectionToIdx.find(isec);
  if (sectionIdxIt != sectionToIdx.end())
    sectionIdx = sectionIdxIt->getSecond();
  std::string kind;
  if (isec)
    kind = ("Section " + Twine(static_cast<uint8_t>(isec->kind()))).str();
  if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
    kind += (" Symbol " + Twine(static_cast<uint8_t>(sym->kind()))).str();
    if (auto *d = dyn_cast<Defined>(sym))
      return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
  }
  return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
}

/// Given a list of section indexes (\p sectionIdxs), return a list of utility
/// nodes for each section index. If \p duplicateSectionIdxs is provided,
/// populate it with nearly identical sections. \p maxUN is advanced past the
/// largest utility node used so far.
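/// For instance, if two sections share many frequent content hashes
/// (illustrative scenario), they receive overlapping utility node lists, and
/// BalancedPartitioning will tend to place them near each other.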
static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
    ArrayRef<const InputSection *> sections,
    const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
    ArrayRef<unsigned> sectionIdxs,
    DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
    BPFunctionNode::UtilityNodeT &maxUN) {
  TimeTraceScope timeScope("Build nodes for compression");

  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
  sectionHashes.reserve(sectionIdxs.size());
  SmallVector<uint64_t> hashes;
  for (unsigned sectionIdx : sectionIdxs) {
    const auto *isec = sections[sectionIdx];
    constexpr unsigned windowSize = 4;
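    // Hash every windowSize-byte sliding window of the section contents. For
    // data "ABCDEF" (illustrative), the windows are "ABCD", "BCDE", "CDEF",
    // then the shorter tails "DEF", "EF", "F", since take_front clamps at the
    // end of the section.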
    for (size_t i = 0; i < isec->data.size(); i++) {
      auto window = isec->data.drop_front(i).take_front(windowSize);
      hashes.push_back(xxHash64(window));
    }
    for (const auto &r : isec->relocs) {
      if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
        continue;
      uint64_t relocHash = getRelocHash(r, sectionToIdx);
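      // Mix relocHash into every window that overlaps the relocation's bytes;
      // clamping start to 0 keeps windows from starting before the section.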
      uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
      for (uint32_t i = start; i < r.offset + r.length; i++) {
        auto window = isec->data.drop_front(i).take_front(windowSize);
        hashes.push_back(xxHash64(window) + relocHash);
      }
    }

    llvm::sort(hashes);
    hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());

    sectionHashes.emplace_back(sectionIdx, hashes);
    hashes.clear();
  }

  DenseMap<uint64_t, unsigned> hashFrequency;
  for (auto &[sectionIdx, hashes] : sectionHashes)
    for (auto hash : hashes)
      ++hashFrequency[hash];

  if (duplicateSectionIdxs) {
    // Merge sections that are nearly identical
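    // Two sections count as nearly identical when the XOR of their frequent
    // content hashes (those appearing in more than 5 sections) is the same;
    // only the first such section is kept for ordering.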
    SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
    DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
    for (auto &[sectionIdx, hashes] : sectionHashes) {
      uint64_t wholeHash = 0;
      for (auto hash : hashes)
        if (hashFrequency[hash] > 5)
          wholeHash ^= hash;
      auto [it, wasInserted] =
          wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
      if (wasInserted) {
        newSectionHashes.emplace_back(sectionIdx, hashes);
      } else {
        (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
      }
    }
    sectionHashes = newSectionHashes;

    // Recompute hash frequencies
    hashFrequency.clear();
    for (auto &[sectionIdx, hashes] : sectionHashes)
      for (auto hash : hashes)
        ++hashFrequency[hash];
  }

  // Filter out rare hashes (frequency <= 1) and common hashes (shared by more
  // than half of the sections), and assign each remaining hash a unique
  // utility node that doesn't conflict with the trace utility nodes.
  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
  for (auto &[hash, frequency] : hashFrequency) {
    if (frequency <= 1 || frequency * 2 > sectionHashes.size())
      continue;
    hashToUN[hash] = ++maxUN;
  }

  SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
  for (auto &[sectionIdx, hashes] : sectionHashes) {
    UtilityNodes uns;
    for (auto &hash : hashes) {
      auto it = hashToUN.find(hash);
      if (it != hashToUN.end())
        uns.push_back(it->second);
    }
    sectionUns.emplace_back(sectionIdx, uns);
  }
  return sectionUns;
}

DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
    size_t &highestAvailablePriority, StringRef profilePath,
    bool forFunctionCompression, bool forDataCompression,
    bool compressionSortStartupFunctions, bool verbose) {

  SmallVector<const InputSection *> sections;
  DenseMap<const InputSection *, uint64_t> sectionToIdx;
  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
  for (const auto *file : inputFiles) {
    for (auto *sec : file->sections) {
      for (auto &subsec : sec->subsections) {
        auto *isec = subsec.isec;
        if (!isec || isec->data.empty() || !isec->data.data())
          continue;
        unsigned sectionIdx = sections.size();
        sectionToIdx.try_emplace(isec, sectionIdx);
        sections.push_back(isec);
        for (Symbol *sym : isec->symbols)
          if (auto *d = dyn_cast_or_null<Defined>(sym))
            symbolToSectionIdxs[d->getName()].insert(sectionIdx);
      }
    }
  }

  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
  for (auto &entry : symbolToSectionIdxs) {
    StringRef name = entry.getKey();
    auto &sectionIdxs = entry.getValue();
    name = getRootSymbol(name);
    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                         sectionIdxs.end());
    // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
    // Mangler::getNameWithPrefix() for details.
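    // Register the unprefixed name too, so that e.g. "_foo" and "foo"
    // (hypothetical symbol) both map to the same set of section indexes.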
    if (name.consume_front("_") || name.consume_front("l_"))
      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                           sectionIdxs.end());
  }

  BPFunctionNode::UtilityNodeT maxUN = 0;
  DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
  // Used to define the initial order for startup functions.
  DenseMap<unsigned, size_t> sectionIdxToTimestamp;
  std::unique_ptr<InstrProfReader> reader;
  if (!profilePath.empty()) {
    auto fs = vfs::getRealFileSystem();
    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
    lld::checkError(readerOrErr.takeError());

    reader = std::move(readerOrErr.get());
    for (auto &entry : *reader) {
      // Read all entries; iterating the reader populates the temporal profile
      // traces consumed below.
      (void)entry;
    }
    auto &traces = reader->getTemporalProfTraces();

    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
      uint64_t currentSize = 0, cutoffSize = 1;
      size_t cutoffTimestamp = 1;
      auto &trace = traces[traceIdx].FunctionNameRefs;
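      // Walk the trace, starting a new utility node whenever the accumulated
      // size or the timestamp crosses its cutoff; both cutoffs double each
      // time, so each utility node covers an exponentially larger window of
      // the trace.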
      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
        ParsedFuncName = getRootSymbol(ParsedFuncName);

        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
          continue;
        auto &sectionIdxs = sectionIdxsIt->getValue();
        // If the same symbol is found in multiple sections, they might be
        // identical, so we arbitrarily use the size from the first section.
        currentSize += sections[*sectionIdxs.begin()]->getSize();

        // Since BalancedPartitioning is sensitive to the initial order, we need
        // to explicitly define it to be ordered by earliest timestamp.
        for (unsigned sectionIdx : sectionIdxs) {
          auto [it, wasInserted] =
              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
          if (!wasInserted)
            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
        }

        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
          ++maxUN;
          cutoffSize = 2 * currentSize;
          cutoffTimestamp = 2 * cutoffTimestamp;
        }
        for (unsigned sectionIdx : sectionIdxs)
          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
      }
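      // Every section seen in this trace gets all utility nodes from its first
      // appearance through the end of the trace, so sections that become live
      // at similar times share many utility nodes and are drawn together by
      // the partitioning.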
      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
        for (auto un = firstUN; un <= maxUN; ++un)
          startupSectionIdxUNs[sectionIdx].push_back(un);
      ++maxUN;
      sectionIdxToFirstUN.clear();
    }
  }

  SmallVector<unsigned> sectionIdxsForFunctionCompression,
      sectionIdxsForDataCompression;
  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
    if (startupSectionIdxUNs.count(sectionIdx))
      continue;
    const auto *isec = sections[sectionIdx];
    if (isCodeSection(isec)) {
      if (forFunctionCompression)
        sectionIdxsForFunctionCompression.push_back(sectionIdx);
    } else {
      if (forDataCompression)
        sectionIdxsForDataCompression.push_back(sectionIdx);
    }
  }

  if (compressionSortStartupFunctions) {
    SmallVector<unsigned> startupIdxs;
    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
      startupIdxs.push_back(sectionIdx);
    auto unsForStartupFunctionCompression =
        getUnsForCompression(sections, sectionToIdx, startupIdxs,
                             /*duplicateSectionIdxs=*/nullptr, maxUN);
    for (auto &[sectionIdx, compressionUns] :
         unsForStartupFunctionCompression) {
      auto &uns = startupSectionIdxUNs[sectionIdx];
      uns.append(compressionUns);
      llvm::sort(uns);
      uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
    }
  }

  // Map a section index (ordered directly) to a list of duplicate section
  // indices (not ordered directly).
  DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
  auto unsForFunctionCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForFunctionCompression,
      &duplicateSectionIdxs, maxUN);
  auto unsForDataCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForDataCompression,
      &duplicateSectionIdxs, maxUN);

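  // Note: both calls share duplicateSectionIdxs; this is safe because code and
  // data sections come from disjoint index sets, so their keys never collide.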
  std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
      nodesForDataCompression;
  for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
    nodesForStartup.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForFunctionCompression)
    nodesForFunctionCompression.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForDataCompression)
    nodesForDataCompression.emplace_back(sectionIdx, uns);

  // Use the first timestamp to define the initial order for startup nodes.
  llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
    return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
           std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
  });
  // Sort compression nodes by their Id (which is the section index) because
  // the input linker order tends to be a reasonable baseline.
  llvm::sort(nodesForFunctionCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });
  llvm::sort(nodesForDataCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });

  {
    TimeTraceScope timeScope("Balanced Partitioning");
    BalancedPartitioningConfig config;
    BalancedPartitioning bp(config);
    bp.run(nodesForStartup);
    bp.run(nodesForFunctionCompression);
    bp.run(nodesForDataCompression);
  }

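  // The three node sets are partitioned independently; their utility node ids
  // cannot collide because maxUN was threaded through every assignment above.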
  unsigned numStartupSections = 0;
  unsigned numCodeCompressionSections = 0;
  unsigned numDuplicateCodeSections = 0;
  unsigned numDataCompressionSections = 0;
  unsigned numDuplicateDataSections = 0;
  SetVector<const InputSection *> orderedSections;
  // Order startup functions,
  for (auto &node : nodesForStartup) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numStartupSections;
  }
  // then functions for compression,
  for (auto &node : nodesForFunctionCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numCodeCompressionSections;

    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateCodeSections;
    }
  }
  // then data for compression.
  for (auto &node : nodesForDataCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numDataCompressionSections;
    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateDataSections;
    }
  }

  if (verbose) {
    unsigned numTotalOrderedSections =
        numStartupSections + numCodeCompressionSections +
        numDuplicateCodeSections + numDataCompressionSections +
        numDuplicateDataSections;
    dbgs()
        << "Ordered " << numTotalOrderedSections
        << " sections using balanced partitioning:\n Functions for startup: "
        << numStartupSections
        << "\n Functions for compression: " << numCodeCompressionSections
        << "\n Duplicate functions: " << numDuplicateCodeSections
        << "\n Data for compression: " << numDataCompressionSections
        << "\n Duplicate data: " << numDuplicateDataSections << "\n";

    if (!profilePath.empty()) {
      // Evaluate this function order for startup
      StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
      const uint64_t pageSize = (1 << 14);
      uint64_t currentAddress = 0;
      for (const auto *isec : orderedSections) {
        for (Symbol *sym : isec->symbols) {
          if (auto *d = dyn_cast_or_null<Defined>(sym)) {
            uint64_t startAddress = currentAddress + d->value;
            uint64_t endAddress = startAddress + d->size;
            uint64_t firstPage = startAddress / pageSize;
            // I think the kernel might pull in a few pages when one is
            // touched, so it might be more accurate to force lastPage to be
            // aligned by 4?
            uint64_t lastPage = endAddress / pageSize;
            StringRef rootSymbol = d->getName();
            rootSymbol = getRootSymbol(rootSymbol);
            symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
            if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
              symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
          }
        }
        currentAddress += isec->getSize();
      }

      // The area under the curve F where F(t) is the total number of page
      // faults at step t.
      unsigned area = 0;
      for (auto &trace : reader->getTemporalProfTraces()) {
        SmallSet<uint64_t, 0> touchedPages;
        for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
          auto traceId = trace.FunctionNameRefs[step];
          auto [Filename, ParsedFuncName] =
              getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
          ParsedFuncName = getRootSymbol(ParsedFuncName);
          auto it = symbolToPageNumbers.find(ParsedFuncName);
          if (it != symbolToPageNumbers.end()) {
            auto &[firstPage, lastPage] = it->getValue();
            for (uint64_t i = firstPage; i <= lastPage; i++)
              touchedPages.insert(i);
          }
          area += touchedPages.size();
        }
      }
      dbgs() << "Total area under the page fault curve: " << (float)area
             << "\n";
    }
  }

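  // Convert the final order into priorities: sections placed earlier receive
  // larger values, counting down from highestAvailablePriority.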
  DenseMap<const InputSection *, size_t> sectionPriorities;
  for (const auto *isec : orderedSections)
    sectionPriorities[isec] = --highestAvailablePriority;
  return sectionPriorities;
}