Revert "[libc] Use best-fit binary trie to make malloc logarithmic" (#117065)
[llvm-project.git] / lld / MachO / BPSectionOrderer.cpp
blob5db2242a35ef286a16d27ab54c8927df02c5476c
//===- BPSectionOrderer.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "BPSectionOrderer.h"
#include "InputSection.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/xxhash.h"

#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using namespace lld::macho;

using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
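
// Note: a "utility node" is a feature shared between sections; balanced
// partitioning recursively bisects the section list so that sections sharing
// many utility nodes land close together. Here, utility nodes come from
// content window hashes (for compression locality) and from startup-trace
// intervals (for startup locality). See llvm/Support/BalancedPartitioning.h.
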
/// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and
/// "yyyy" are numbers that could change between builds. We need to use the
/// root symbol name before this suffix so these symbols can be matched with
/// profiles which may have different suffixes.
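/// For example (illustrative): "_foo.__uniq.123.llvm.456" becomes "_foo",
/// which can then match a profile entry recorded with different numbers.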
static StringRef getRootSymbol(StringRef Name) {
  auto [P0, S0] = Name.rsplit(".llvm.");
  auto [P1, S1] = P0.rsplit(".__uniq.");
  return P1;
}

static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
                             uint64_t offset, uint64_t addend) {
  return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
                   Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
                      .str());
}
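
/// Hash a relocation by what it points at (the referent section/symbol kind,
/// section index, offset, and addend) rather than by the bytes it occupies,
/// presumably because those bytes are only placeholders until the linker
/// writes the output. Content windows that overlap a relocation mix this
/// hash in below.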
static uint64_t
getRelocHash(const Reloc &reloc,
             const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
  auto *isec = reloc.getReferentInputSection();
  std::optional<uint64_t> sectionIdx;
  auto sectionIdxIt = sectionToIdx.find(isec);
  if (sectionIdxIt != sectionToIdx.end())
    sectionIdx = sectionIdxIt->getSecond();
  std::string kind;
  if (isec)
    kind = ("Section " + Twine(static_cast<uint8_t>(isec->kind()))).str();
  if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
    kind += (" Symbol " + Twine(static_cast<uint8_t>(sym->kind()))).str();
    if (auto *d = dyn_cast<Defined>(sym))
      return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
  }
  return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
}

/// Given \p sectionIdxs, a list of section indexes, return a list of utility
/// nodes for each section index. If \p duplicateSectionIdxs is provided,
/// populate it with nearly identical sections. Update \p maxUN to the largest
/// utility node used so far.
static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
    ArrayRef<const InputSection *> sections,
    const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
    ArrayRef<unsigned> sectionIdxs,
    DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
    BPFunctionNode::UtilityNodeT &maxUN) {
  TimeTraceScope timeScope("Build nodes for compression");

  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
  sectionHashes.reserve(sectionIdxs.size());
  SmallVector<uint64_t> hashes;
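  // Hash every windowSize-byte window ("k-mer") of each section's contents.
  // Two sections that would compress well together share many identical
  // windows, so shared window hashes make good utility nodes. Windows that
  // overlap a relocation additionally mix in the relocation's target hash.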
  for (unsigned sectionIdx : sectionIdxs) {
    const auto *isec = sections[sectionIdx];
    constexpr unsigned windowSize = 4;

    for (size_t i = 0; i < isec->data.size(); i++) {
      auto window = isec->data.drop_front(i).take_front(windowSize);
      hashes.push_back(xxHash64(window));
    }
    for (const auto &r : isec->relocs) {
      if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
        continue;
      uint64_t relocHash = getRelocHash(r, sectionToIdx);
      uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
      for (uint32_t i = start; i < r.offset + r.length; i++) {
        auto window = isec->data.drop_front(i).take_front(windowSize);
        hashes.push_back(xxHash64(window) + relocHash);
      }
    }

    llvm::sort(hashes);
    hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());

    sectionHashes.emplace_back(sectionIdx, hashes);
    hashes.clear();
  }

  DenseMap<uint64_t, unsigned> hashFrequency;
  for (auto &[sectionIdx, hashes] : sectionHashes)
    for (auto hash : hashes)
      ++hashFrequency[hash];
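
  // Near-duplicate sections (e.g. identical functions that were not folded)
  // would presumably show up as many interchangeable nodes and waste BP's
  // effort, so fold them onto one representative here; the duplicates are
  // emitted directly after their representative when the final order is built.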
  if (duplicateSectionIdxs) {
    // Merge sections that are nearly identical.
    SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
    DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
    for (auto &[sectionIdx, hashes] : sectionHashes) {
      uint64_t wholeHash = 0;
      for (auto hash : hashes)
        if (hashFrequency[hash] > 5)
          wholeHash ^= hash;
      auto [it, wasInserted] =
          wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
      if (wasInserted) {
        newSectionHashes.emplace_back(sectionIdx, hashes);
      } else {
        (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
      }
    }
    sectionHashes = newSectionHashes;

    // Recompute hash frequencies.
    hashFrequency.clear();
    for (auto &[sectionIdx, hashes] : sectionHashes)
      for (auto hash : hashes)
        ++hashFrequency[hash];
  }

  // Filter out rare and common hashes and assign each remaining hash a unique
  // utility node that doesn't conflict with the trace utility nodes.
  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
  for (auto &[hash, frequency] : hashFrequency) {
    if (frequency <= 1 || frequency * 2 > sectionHashes.size())
      continue;
    hashToUN[hash] = ++maxUN;
  }

  SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
  for (auto &[sectionIdx, hashes] : sectionHashes) {
    UtilityNodes uns;
    for (auto &hash : hashes) {
      auto it = hashToUN.find(hash);
      if (it != hashToUN.end())
        uns.push_back(it->second);
    }
    sectionUns.emplace_back(sectionIdx, uns);
  }
  return sectionUns;
}

DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
    size_t &highestAvailablePriority, StringRef profilePath,
    bool forFunctionCompression, bool forDataCompression,
    bool compressionSortStartupFunctions, bool verbose) {
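  // High-level flow: (1) collect all input sections and map symbols to them,
  // (2) derive startup utility nodes from the temporal profile traces,
  // (3) derive compression utility nodes from section contents, (4) run
  // balanced partitioning on each node set, and (5) concatenate the results
  // into one priority order: startup functions, then functions, then data.
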
  SmallVector<const InputSection *> sections;
  DenseMap<const InputSection *, uint64_t> sectionToIdx;
  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
  for (const auto *file : inputFiles) {
    for (auto *sec : file->sections) {
      for (auto &subsec : sec->subsections) {
        auto *isec = subsec.isec;
        if (!isec || isec->data.empty() || !isec->data.data())
          continue;
        unsigned sectionIdx = sections.size();
        sectionToIdx.try_emplace(isec, sectionIdx);
        sections.push_back(isec);
        for (Symbol *sym : isec->symbols)
          if (auto *d = dyn_cast_or_null<Defined>(sym))
            symbolToSectionIdxs[d->getName()].insert(sectionIdx);
      }
    }
  }

  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
  for (auto &entry : symbolToSectionIdxs) {
    StringRef name = entry.getKey();
    auto &sectionIdxs = entry.getValue();
    name = getRootSymbol(name);
    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                         sectionIdxs.end());
    // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
    // Mangler::getNameWithPrefix() for details.
    if (name.consume_front("_") || name.consume_front("l_"))
      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                           sectionIdxs.end());
  }

  BPFunctionNode::UtilityNodeT maxUN = 0;
  DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
  // Used to define the initial order for startup functions.
  DenseMap<unsigned, size_t> sectionIdxToTimestamp;
  std::unique_ptr<InstrProfReader> reader;
  if (!profilePath.empty()) {
    auto fs = vfs::getRealFileSystem();
    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
    lld::checkError(readerOrErr.takeError());

    reader = std::move(readerOrErr.get());
    for (auto &entry : *reader) {
      // Read all entries.
      (void)entry;
    }
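
    // Iterating the reader consumes every record; this is what populates the
    // symbol table and the temporal profile traces used below.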
    auto &traces = reader->getTemporalProfTraces();

    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
      uint64_t currentSize = 0, cutoffSize = 1;
      size_t cutoffTimestamp = 1;
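      // Open a new utility node each time the accumulated size or the
      // timestamp doubles; each section then receives every utility node from
      // its first appearance onward, so sections that start executing close
      // together in time end up sharing many utility nodes.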
      auto &trace = traces[traceIdx].FunctionNameRefs;
      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
        ParsedFuncName = getRootSymbol(ParsedFuncName);

        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
          continue;
        auto &sectionIdxs = sectionIdxsIt->getValue();
        // If the same symbol is found in multiple sections, they might be
        // identical, so we arbitrarily use the size from the first section.
        currentSize += sections[*sectionIdxs.begin()]->getSize();

        // Since BalancedPartitioning is sensitive to the initial order, we
        // need to explicitly define it to be ordered by earliest timestamp.
        for (unsigned sectionIdx : sectionIdxs) {
          auto [it, wasInserted] =
              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
          if (!wasInserted)
            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
        }

        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
          ++maxUN;
          cutoffSize = 2 * currentSize;
          cutoffTimestamp = 2 * cutoffTimestamp;
        }
        for (unsigned sectionIdx : sectionIdxs)
          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
      }
      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
        for (auto un = firstUN; un <= maxUN; ++un)
          startupSectionIdxUNs[sectionIdx].push_back(un);
      ++maxUN;
      sectionIdxToFirstUN.clear();
    }
  }

  SmallVector<unsigned> sectionIdxsForFunctionCompression,
      sectionIdxsForDataCompression;
  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
    if (startupSectionIdxUNs.count(sectionIdx))
      continue;
    const auto *isec = sections[sectionIdx];
    if (isCodeSection(isec)) {
      if (forFunctionCompression)
        sectionIdxsForFunctionCompression.push_back(sectionIdx);
    } else {
      if (forDataCompression)
        sectionIdxsForDataCompression.push_back(sectionIdx);
    }
  }
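
  // Optionally give startup sections content-based utility nodes as well, so
  // BP can also group similar startup functions without losing their
  // startup-order signal; the two utility-node sets are merged per section
  // and deduplicated.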
  if (compressionSortStartupFunctions) {
    SmallVector<unsigned> startupIdxs;
    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
      startupIdxs.push_back(sectionIdx);
    auto unsForStartupFunctionCompression =
        getUnsForCompression(sections, sectionToIdx, startupIdxs,
                             /*duplicateSectionIdxs=*/nullptr, maxUN);
    for (auto &[sectionIdx, compressionUns] :
         unsForStartupFunctionCompression) {
      auto &uns = startupSectionIdxUNs[sectionIdx];
      uns.append(compressionUns);
      llvm::sort(uns);
      uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
    }
  }

  // Map a section index (ordered directly by BP) to a list of duplicate
  // section indices, which are not ordered directly but instead placed right
  // after their representative.
  DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
  auto unsForFunctionCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForFunctionCompression,
      &duplicateSectionIdxs, maxUN);
  auto unsForDataCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForDataCompression,
      &duplicateSectionIdxs, maxUN);

  std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
      nodesForDataCompression;
  for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
    nodesForStartup.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForFunctionCompression)
    nodesForFunctionCompression.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForDataCompression)
    nodesForDataCompression.emplace_back(sectionIdx, uns);

  // Use the first timestamp to define the initial order for startup nodes.
  llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
    return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
           std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
  });
  // Sort compression nodes by their Id (which is the section index) because
  // the input linker order tends to be a reasonable starting point.
  llvm::sort(nodesForFunctionCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });
  llvm::sort(nodesForDataCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });
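
  // Each node set is partitioned independently; bp.run() reorders the given
  // node vector in place.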
  {
    TimeTraceScope timeScope("Balanced Partitioning");
    BalancedPartitioningConfig config;
    BalancedPartitioning bp(config);
    bp.run(nodesForStartup);
    bp.run(nodesForFunctionCompression);
    bp.run(nodesForDataCompression);
  }

  unsigned numStartupSections = 0;
  unsigned numCodeCompressionSections = 0;
  unsigned numDuplicateCodeSections = 0;
  unsigned numDataCompressionSections = 0;
  unsigned numDuplicateDataSections = 0;
  SetVector<const InputSection *> orderedSections;
  // Order startup functions,
  for (auto &node : nodesForStartup) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numStartupSections;
  }
  // then functions for compression,
  for (auto &node : nodesForFunctionCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numCodeCompressionSections;

    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateCodeSections;
    }
  }
  // then data for compression.
  for (auto &node : nodesForDataCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numDataCompressionSections;
    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateDataSections;
    }
  }

  if (verbose) {
    unsigned numTotalOrderedSections =
        numStartupSections + numCodeCompressionSections +
        numDuplicateCodeSections + numDataCompressionSections +
        numDuplicateDataSections;
    dbgs()
        << "Ordered " << numTotalOrderedSections
        << " sections using balanced partitioning:\n  Functions for startup: "
        << numStartupSections
        << "\n  Functions for compression: " << numCodeCompressionSections
        << "\n  Duplicate functions: " << numDuplicateCodeSections
        << "\n  Data for compression: " << numDataCompressionSections
        << "\n  Duplicate data: " << numDuplicateDataSections << "\n";

    if (!profilePath.empty()) {
      // Evaluate this function order for startup.
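      // Assign each symbol the page range it would occupy under the new
      // order, then replay the temporal profile traces and count how many
      // distinct pages each trace touches; the running total approximates
      // the startup page-fault cost of this layout.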
      StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
      const uint64_t pageSize = (1 << 14);
      uint64_t currentAddress = 0;
      for (const auto *isec : orderedSections) {
        for (Symbol *sym : isec->symbols) {
          if (auto *d = dyn_cast_or_null<Defined>(sym)) {
            uint64_t startAddress = currentAddress + d->value;
            uint64_t endAddress = startAddress + d->size;
            uint64_t firstPage = startAddress / pageSize;
            // I think the kernel might pull in a few pages when one is
            // touched, so it might be more accurate to force lastPage to be
            // aligned by 4?
            uint64_t lastPage = endAddress / pageSize;
            StringRef rootSymbol = d->getName();
            rootSymbol = getRootSymbol(rootSymbol);
            symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
            if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
              symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
          }
        }
        currentAddress += isec->getSize();
      }

      // The area under the curve F, where F(t) is the total number of page
      // faults at step t.
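      // Summing touchedPages.size() after every step is a discrete integral
      // of F(t) over the trace; a smaller area means fewer resident pages are
      // needed early in startup.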
      unsigned area = 0;
      for (auto &trace : reader->getTemporalProfTraces()) {
        SmallSet<uint64_t, 0> touchedPages;
        for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
          auto traceId = trace.FunctionNameRefs[step];
          auto [Filename, ParsedFuncName] =
              getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
          ParsedFuncName = getRootSymbol(ParsedFuncName);
          auto it = symbolToPageNumbers.find(ParsedFuncName);
          if (it != symbolToPageNumbers.end()) {
            auto &[firstPage, lastPage] = it->getValue();
            for (uint64_t i = firstPage; i <= lastPage; i++)
              touchedPages.insert(i);
          }
          area += touchedPages.size();
        }
      }
      dbgs() << "Total area under the page fault curve: " << (float)area
             << "\n";
    }
  }

  DenseMap<const InputSection *, size_t> sectionPriorities;
  for (const auto *isec : orderedSections)
    sectionPriorities[isec] = --highestAvailablePriority;
  return sectionPriorities;
}