1 //===- AMDGPUSplitModule.cpp ----------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 /// \file Implements a module splitting algorithm designed to support the
10 /// FullLTO --lto-partitions option for parallel codegen.
12 /// The role of this module splitting pass is the same as
13 /// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
14 /// across a set of N partitions to allow for parallel codegen.
16 /// The similarities mostly end here, as this pass achieves load-balancing in a
17 /// more elaborate fashion which is targeted towards AMDGPU modules. It can take
18 /// advantage of the structure of AMDGPU modules (which are mostly
19 /// self-contained) to allow for more efficient splitting without affecting
20 /// codegen negatively, or causing inaccurate resource usage analysis.
22 /// High-level pass overview:
23 /// - SplitGraph & associated classes
24 /// - Graph representation of the module and of the dependencies that
25 /// matter for splitting.
26 /// - RecursiveSearchSplitting
27 /// - Core splitting algorithm.
29 /// - Represents a suggested solution for splitting the input module. These
30 /// solutions can be scored to determine the best one when multiple
31 /// solutions are available.
32 /// - Driver/pass "run" function glues everything together.
34 #include "AMDGPUSplitModule.h"
35 #include "AMDGPUTargetMachine.h"
36 #include "Utils/AMDGPUBaseInfo.h"
37 #include "llvm/ADT/DenseMap.h"
38 #include "llvm/ADT/EquivalenceClasses.h"
39 #include "llvm/ADT/GraphTraits.h"
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/ADT/StringExtras.h"
42 #include "llvm/ADT/StringRef.h"
43 #include "llvm/Analysis/CallGraph.h"
44 #include "llvm/Analysis/TargetTransformInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/InstIterator.h"
47 #include "llvm/IR/Instruction.h"
48 #include "llvm/IR/Module.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Allocator.h"
51 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/DOTGraphTraits.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/GraphWriter.h"
55 #include "llvm/Support/Path.h"
56 #include "llvm/Support/Timer.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include "llvm/Transforms/Utils/Cloning.h"
66 #include "llvm/Support/LockFileManager.h"
69 #define DEBUG_TYPE "amdgpu-split-module"
74 static cl::opt
<unsigned> MaxDepth(
75 "amdgpu-module-splitting-max-depth",
77 "maximum search depth. 0 forces a greedy approach. "
78 "warning: the algorithm is up to O(2^N), where N is the max depth."),
81 static cl::opt
<float> LargeFnFactor(
82 "amdgpu-module-splitting-large-threshold", cl::init(2.0f
), cl::Hidden
,
84 "when max depth is reached and we can no longer branch out, this "
85 "value determines if a function is worth merging into an already "
86 "existing partition to reduce code duplication. This is a factor "
87 "of the ideal partition size, e.g. 2.0 means we consider the "
88 "function for merging if its cost (including its callees) is 2x the "
89 "size of an ideal partition."));
91 static cl::opt
<float> LargeFnOverlapForMerge(
92 "amdgpu-module-splitting-merge-threshold", cl::init(0.7f
), cl::Hidden
,
93 cl::desc("when a function is considered for merging into a partition that "
94 "already contains some of its callees, do the merge if at least "
95 "n% of the code it can reach is already present inside the "
96 "partition; e.g. 0.7 means only merge >70%"));
98 static cl::opt
<bool> NoExternalizeGlobals(
99 "amdgpu-module-splitting-no-externalize-globals", cl::Hidden
,
100 cl::desc("disables externalization of global variable with local linkage; "
101 "may cause globals to be duplicated which increases binary size"));
103 static cl::opt
<bool> NoExternalizeOnAddrTaken(
104 "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden
,
106 "disables externalization of functions whose addresses are taken"));
108 static cl::opt
<std::string
>
109 ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
111 cl::desc("output file to write out the dotgraph "
112 "representation of the input module"));
114 static cl::opt
<std::string
> PartitionSummariesOutput(
115 "amdgpu-module-splitting-print-partition-summaries", cl::Hidden
,
116 cl::desc("output file to write out a summary of "
117 "the partitions created for each module"));
121 UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden
,
122 cl::desc("use a lock file so only one process in the system "
123 "can run this pass at once. useful to avoid mangled "
124 "debug output in multithreaded environments."));
127 DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
129 cl::desc("print all proposals received and whether "
130 "they were rejected or accepted"));
133 struct SplitModuleTimer
: NamedRegionTimer
{
134 SplitModuleTimer(StringRef Name
, StringRef Desc
)
135 : NamedRegionTimer(Name
, Desc
, DEBUG_TYPE
, "AMDGPU Module Splitting",
136 TimePassesIsEnabled
) {}
139 //===----------------------------------------------------------------------===//
141 //===----------------------------------------------------------------------===//
143 using CostType
= InstructionCost::CostType
;
144 using FunctionsCostMap
= DenseMap
<const Function
*, CostType
>;
145 using GetTTIFn
= function_ref
<const TargetTransformInfo
&(Function
&)>;
146 static constexpr unsigned InvalidPID
= -1;
148 /// \param Num numerator
149 /// \param Dem denominator
150 /// \returns a printable object to print (Num/Dem) using "%0.2f".
151 static auto formatRatioOf(CostType Num
, CostType Dem
) {
152 return format("%0.2f", (static_cast<double>(Num
) / Dem
) * 100);
155 /// Checks whether a given function is non-copyable.
157 /// Non-copyable functions cannot be cloned into multiple partitions, and only
158 /// one copy of the function can be present across all partitions.
160 /// External functions fall into this category. If we were to clone them, we
161 /// would end up with multiple symbol definitions and a very unhappy linker.
162 static bool isNonCopyable(const Function
&F
) {
163 assert(AMDGPU::isEntryFunctionCC(F
.getCallingConv())
164 ? F
.hasExternalLinkage()
165 : true && "Kernel w/o external linkage?");
166 return F
.hasExternalLinkage() || !F
.isDefinitionExact();
169 /// If \p GV has local linkage, make it external + hidden.
170 static void externalize(GlobalValue
&GV
) {
171 if (GV
.hasLocalLinkage()) {
172 GV
.setLinkage(GlobalValue::ExternalLinkage
);
173 GV
.setVisibility(GlobalValue::HiddenVisibility
);
176 // Unnamed entities must be named consistently between modules. setName will
177 // give a distinct name to each such entity.
179 GV
.setName("__llvmsplit_unnamed");
182 /// Cost analysis function. Calculates the cost of each function in \p M
184 /// \param GetTTI Abstract getter for TargetTransformInfo.
185 /// \param M Module to analyze.
186 /// \param CostMap[out] Resulting Function -> Cost map.
187 /// \return The module's total cost.
188 static CostType
calculateFunctionCosts(GetTTIFn GetTTI
, Module
&M
,
189 FunctionsCostMap
&CostMap
) {
190 SplitModuleTimer
SMT("calculateFunctionCosts", "cost analysis");
192 LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
193 CostType ModuleCost
= 0;
194 [[maybe_unused
]] CostType KernelCost
= 0;
197 if (Fn
.isDeclaration())
201 const auto &TTI
= GetTTI(Fn
);
202 for (const auto &BB
: Fn
) {
203 for (const auto &I
: BB
) {
205 TTI
.getInstructionCost(&I
, TargetTransformInfo::TCK_CodeSize
);
206 assert(Cost
!= InstructionCost::getMax());
207 // Assume expensive if we can't tell the cost of an instruction.
209 Cost
.getValue().value_or(TargetTransformInfo::TCC_Expensive
);
210 assert((FnCost
+ CostVal
) >= FnCost
&& "Overflow!");
217 CostMap
[&Fn
] = FnCost
;
218 assert((ModuleCost
+ FnCost
) >= ModuleCost
&& "Overflow!");
219 ModuleCost
+= FnCost
;
221 if (AMDGPU::isEntryFunctionCC(Fn
.getCallingConv()))
222 KernelCost
+= FnCost
;
230 const CostType FnCost
= ModuleCost
- KernelCost
;
231 dbgs() << " - total module cost is " << ModuleCost
<< ". kernels cost "
232 << "" << KernelCost
<< " ("
233 << format("%0.2f", (float(KernelCost
) / ModuleCost
) * 100)
234 << "% of the module), functions cost " << FnCost
<< " ("
235 << format("%0.2f", (float(FnCost
) / ModuleCost
) * 100)
236 << "% of the module)\n";
242 /// \return true if \p F can be indirectly called
243 static bool canBeIndirectlyCalled(const Function
&F
) {
244 if (F
.isDeclaration() || AMDGPU::isEntryFunctionCC(F
.getCallingConv()))
246 return !F
.hasLocalLinkage() ||
247 F
.hasAddressTaken(/*PutOffender=*/nullptr,
248 /*IgnoreCallbackUses=*/false,
249 /*IgnoreAssumeLikeCalls=*/true,
250 /*IgnoreLLVMUsed=*/true,
251 /*IgnoreARCAttachedCall=*/false,
252 /*IgnoreCastedDirectCall=*/true);
255 //===----------------------------------------------------------------------===//
256 // Graph-based Module Representation
257 //===----------------------------------------------------------------------===//
259 /// AMDGPUSplitModule's view of the source Module, as a graph of all components
260 /// that can be split into different modules.
262 /// The most trivial instance of this graph is just the CallGraph of the module,
263 /// but it is not guaranteed that the graph is strictly equal to the CG. It
264 /// currently always is but it's designed in a way that would eventually allow
265 /// us to create abstract nodes, or nodes for different entities such as global
266 /// variables or any other meaningful constraint we must consider.
268 /// The graph is only mutable by this class, and is generally not modified
269 /// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
275 enum class EdgeKind
: uint8_t {
276 /// The nodes are related through a direct call. This is a "strong" edge as
277 /// it means the Src will directly reference the Dst.
279 /// The nodes are related through an indirect call.
280 /// This is a "weaker" edge and is only considered when traversing the graph
281 /// starting from a kernel. We need this edge for resource usage analysis.
283 /// The reason why we have this edge in the first place is due to how
284 /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
285 /// the resource usage of the kernel containing the indirect call is the
286 /// max resource usage of all functions that can be indirectly called.
290 /// An edge between two nodes. Edges are directional, and tagged with a
293 Edge(Node
*Src
, Node
*Dst
, EdgeKind Kind
)
294 : Src(Src
), Dst(Dst
), Kind(Kind
) {}
296 Node
*Src
; ///< Source
297 Node
*Dst
; ///< Destination
301 using EdgesVec
= SmallVector
<const Edge
*, 0>;
302 using edges_iterator
= EdgesVec::const_iterator
;
303 using nodes_iterator
= const Node
*const *;
305 SplitGraph(const Module
&M
, const FunctionsCostMap
&CostMap
,
307 : M(M
), CostMap(CostMap
), ModuleCost(ModuleCost
) {}
309 void buildGraph(CallGraph
&CG
);
312 bool verifyGraph() const;
315 bool empty() const { return Nodes
.empty(); }
316 const iterator_range
<nodes_iterator
> nodes() const {
317 return {Nodes
.begin(), Nodes
.end()};
319 const Node
&getNode(unsigned ID
) const { return *Nodes
[ID
]; }
321 unsigned getNumNodes() const { return Nodes
.size(); }
322 BitVector
createNodesBitVector() const { return BitVector(Nodes
.size()); }
324 const Module
&getModule() const { return M
; }
326 CostType
getModuleCost() const { return ModuleCost
; }
327 CostType
getCost(const Function
&F
) const { return CostMap
.at(&F
); }
329 /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
331 CostType
calculateCost(const BitVector
&BV
) const;
334 /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
335 /// updates \p Cache.
336 Node
&getNode(DenseMap
<const GlobalValue
*, Node
*> &Cache
,
337 const GlobalValue
&GV
);
339 // Create a new edge between two nodes and add it to both nodes.
340 const Edge
&createEdge(Node
&Src
, Node
&Dst
, EdgeKind EK
);
343 const FunctionsCostMap
&CostMap
;
346 // Final list of nodes with stable ordering.
347 SmallVector
<Node
*> Nodes
;
349 SpecificBumpPtrAllocator
<Node
> NodesPool
;
351 // Edges are trivially destructible objects, so as a small optimization we
352 // use a BumpPtrAllocator which avoids destructor calls but also makes
353 // allocation faster.
355 std::is_trivially_destructible_v
<Edge
>,
356 "Edge must be trivially destructible to use the BumpPtrAllocator");
357 BumpPtrAllocator EdgesPool
;
360 /// Nodes in the SplitGraph contain both incoming, and outgoing edges.
361 /// Incoming edges have this node as their Dst, and Outgoing ones have this node
364 /// Edge objects are shared by both nodes in Src/Dst. They provide immediate
365 /// feedback on how two nodes are related, and in which direction they are
366 /// related, which is valuable information to make splitting decisions.
368 /// Nodes are fundamentally abstract, and any consumers of the graph should
369 /// treat them as such. While a node will be a function most of the time, we
370 /// could also create nodes for any other reason. In the future, we could have
371 /// single nodes for multiple functions, or nodes for GVs, etc.
372 class SplitGraph::Node
{
373 friend class SplitGraph
;
376 Node(unsigned ID
, const GlobalValue
&GV
, CostType IndividualCost
,
378 : ID(ID
), GV(GV
), IndividualCost(IndividualCost
),
379 IsNonCopyable(IsNonCopyable
), IsEntryFnCC(false), IsGraphEntry(false) {
380 if (auto *Fn
= dyn_cast
<Function
>(&GV
))
381 IsEntryFnCC
= AMDGPU::isEntryFunctionCC(Fn
->getCallingConv());
384 /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of
385 /// nodes in the graph. This ID can be used as an index in a BitVector.
386 unsigned getID() const { return ID
; }
388 const Function
&getFunction() const { return cast
<Function
>(GV
); }
390 /// \returns the cost to import this component into a given module, not
391 /// accounting for any dependencies that may need to be imported as well.
392 CostType
getIndividualCost() const { return IndividualCost
; }
394 bool isNonCopyable() const { return IsNonCopyable
; }
395 bool isEntryFunctionCC() const { return IsEntryFnCC
; }
397 /// \returns whether this is an entry point in the graph. Entry points are
398 /// defined as follows: if you take all entry points in the graph, and iterate
399 /// their dependencies, you are guaranteed to visit all nodes in the graph at
401 bool isGraphEntryPoint() const { return IsGraphEntry
; }
403 StringRef
getName() const { return GV
.getName(); }
405 bool hasAnyIncomingEdges() const { return IncomingEdges
.size(); }
406 bool hasAnyIncomingEdgesOfKind(EdgeKind EK
) const {
407 return any_of(IncomingEdges
, [&](const auto *E
) { return E
->Kind
== EK
; });
410 bool hasAnyOutgoingEdges() const { return OutgoingEdges
.size(); }
411 bool hasAnyOutgoingEdgesOfKind(EdgeKind EK
) const {
412 return any_of(OutgoingEdges
, [&](const auto *E
) { return E
->Kind
== EK
; });
415 iterator_range
<edges_iterator
> incoming_edges() const {
416 return IncomingEdges
;
419 iterator_range
<edges_iterator
> outgoing_edges() const {
420 return OutgoingEdges
;
423 bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }
425 /// Visit all children of this node in a recursive fashion. Also visits Self.
426 /// If \ref shouldFollowIndirectCalls returns false, then this only follows
427 /// DirectCall edges.
429 /// \param Visitor Visitor Function.
430 void visitAllDependencies(std::function
<void(const Node
&)> Visitor
) const;
432 /// Adds the dependencies of this node in \p BV by setting the bit
433 /// corresponding to each node.
435 /// Implemented using \ref visitAllDependencies, hence it follows the same
436 /// rules regarding dependencies traversal.
438 /// \param[out] BV The bitvector where the bits should be set.
439 void getDependencies(BitVector
&BV
) const {
440 visitAllDependencies([&](const Node
&N
) { BV
.set(N
.getID()); });
444 void markAsGraphEntry() { IsGraphEntry
= true; }
447 const GlobalValue
&GV
;
448 CostType IndividualCost
;
449 bool IsNonCopyable
: 1;
450 bool IsEntryFnCC
: 1;
451 bool IsGraphEntry
: 1;
453 // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
455 EdgesVec IncomingEdges
;
456 EdgesVec OutgoingEdges
;
459 void SplitGraph::Node::visitAllDependencies(
460 std::function
<void(const Node
&)> Visitor
) const {
461 const bool FollowIndirect
= shouldFollowIndirectCalls();
462 // FIXME: If this can access SplitGraph in the future, use a BitVector
464 DenseSet
<const Node
*> Seen
;
465 SmallVector
<const Node
*, 8> WorkList({this});
466 while (!WorkList
.empty()) {
467 const Node
*CurN
= WorkList
.pop_back_val();
468 if (auto [It
, Inserted
] = Seen
.insert(CurN
); !Inserted
)
473 for (const Edge
*E
: CurN
->outgoing_edges()) {
474 if (!FollowIndirect
&& E
->Kind
== EdgeKind::IndirectCall
)
476 WorkList
.push_back(E
->Dst
);
481 /// Checks if \p I has MD_callees and if it does, parse it and put the function
484 /// \returns true if there was metadata and it was parsed correctly. false if
485 /// there was no MD or if it contained unknown entries and parsing failed.
486 /// If this returns false, \p Callees will contain incomplete information
487 /// and must not be used.
488 static bool handleCalleesMD(const Instruction
&I
,
489 SetVector
<Function
*> &Callees
) {
490 auto *MD
= I
.getMetadata(LLVMContext::MD_callees
);
494 for (const auto &Op
: MD
->operands()) {
495 Function
*Callee
= mdconst::extract_or_null
<Function
>(Op
);
498 Callees
.insert(Callee
);
504 void SplitGraph::buildGraph(CallGraph
&CG
) {
505 SplitModuleTimer
SMT("buildGraph", "graph construction");
508 << "[build graph] constructing graph representation of the input\n");
510 // FIXME(?): Is the callgraph really worth using if we have to iterate the
511 // function again whenever it fails to give us enough information?
513 // We build the graph by just iterating all functions in the module and
514 // working on their direct callees. At the end, all nodes should be linked
515 // together as expected.
516 DenseMap
<const GlobalValue
*, Node
*> Cache
;
517 SmallVector
<const Function
*> FnsWithIndirectCalls
, IndirectlyCallableFns
;
518 for (const Function
&Fn
: M
) {
519 if (Fn
.isDeclaration())
522 // Look at direct callees and create the necessary edges in the graph.
523 SetVector
<const Function
*> DirectCallees
;
524 bool CallsExternal
= false;
525 for (auto &CGEntry
: *CG
[&Fn
]) {
526 auto *CGNode
= CGEntry
.second
;
527 if (auto *Callee
= CGNode
->getFunction()) {
528 if (!Callee
->isDeclaration())
529 DirectCallees
.insert(Callee
);
530 } else if (CGNode
== CG
.getCallsExternalNode())
531 CallsExternal
= true;
534 // Keep track of this function if it contains an indirect call and/or if it
535 // can be indirectly called.
537 LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
538 Fn
.printAsOperand(dbgs());
539 dbgs() << " - analyzing function\n");
541 SetVector
<Function
*> KnownCallees
;
542 bool HasUnknownIndirectCall
= false;
543 for (const auto &Inst
: instructions(Fn
)) {
544 // look at all calls without a direct callee.
545 const auto *CB
= dyn_cast
<CallBase
>(&Inst
);
546 if (!CB
|| CB
->getCalledFunction())
549 // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
551 if (CB
->isInlineAsm()) {
552 LLVM_DEBUG(dbgs() << " found inline assembly\n");
556 if (handleCalleesMD(Inst
, KnownCallees
))
558 // If we failed to parse any !callees MD, or some was missing,
559 // the entire KnownCallees list is now unreliable.
560 KnownCallees
.clear();
562 // Everything else is handled conservatively. If we fall into the
563 // conservative case don't bother analyzing further.
564 HasUnknownIndirectCall
= true;
568 if (HasUnknownIndirectCall
) {
569 LLVM_DEBUG(dbgs() << " indirect call found\n");
570 FnsWithIndirectCalls
.push_back(&Fn
);
571 } else if (!KnownCallees
.empty())
572 DirectCallees
.insert(KnownCallees
.begin(), KnownCallees
.end());
575 Node
&N
= getNode(Cache
, Fn
);
576 for (const auto *Callee
: DirectCallees
)
577 createEdge(N
, getNode(Cache
, *Callee
), EdgeKind::DirectCall
);
579 if (canBeIndirectlyCalled(Fn
))
580 IndirectlyCallableFns
.push_back(&Fn
);
583 // Post-process functions with indirect calls.
584 for (const Function
*Fn
: FnsWithIndirectCalls
) {
585 for (const Function
*Candidate
: IndirectlyCallableFns
) {
586 Node
&Src
= getNode(Cache
, *Fn
);
587 Node
&Dst
= getNode(Cache
, *Candidate
);
588 createEdge(Src
, Dst
, EdgeKind::IndirectCall
);
592 // Now, find all entry points.
593 SmallVector
<Node
*, 16> CandidateEntryPoints
;
594 BitVector NodesReachableByKernels
= createNodesBitVector();
595 for (Node
*N
: Nodes
) {
596 // Functions with an Entry CC are always graph entry points too.
597 if (N
->isEntryFunctionCC()) {
598 N
->markAsGraphEntry();
599 N
->getDependencies(NodesReachableByKernels
);
600 } else if (!N
->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall
))
601 CandidateEntryPoints
.push_back(N
);
604 for (Node
*N
: CandidateEntryPoints
) {
605 // This can be another entry point if it's not reachable by a kernel
606 // TODO: We could sort all of the possible new entries in a stable order
607 // (e.g. by cost), then consume them one by one until
608 // NodesReachableByKernels is all 1s. It'd allow us to avoid
609 // considering some nodes as non-entries in some specific cases.
610 if (!NodesReachableByKernels
.test(N
->getID()))
611 N
->markAsGraphEntry();
615 assert(verifyGraph());
620 bool SplitGraph::verifyGraph() const {
621 unsigned ExpectedID
= 0;
622 // Exceptionally using a set here in case IDs are messed up.
623 DenseSet
<const Node
*> SeenNodes
;
624 DenseSet
<const Function
*> SeenFunctionNodes
;
625 for (const Node
*N
: Nodes
) {
626 if (N
->getID() != (ExpectedID
++)) {
627 errs() << "Node IDs are incorrect!\n";
631 if (!SeenNodes
.insert(N
).second
) {
632 errs() << "Node seen more than once!\n";
636 if (&getNode(N
->getID()) != N
) {
637 errs() << "getNode doesn't return the right node\n";
641 for (const Edge
*E
: N
->IncomingEdges
) {
642 if (!E
->Src
|| !E
->Dst
|| (E
->Dst
!= N
) ||
643 (find(E
->Src
->OutgoingEdges
, E
) == E
->Src
->OutgoingEdges
.end())) {
644 errs() << "ill-formed incoming edges\n";
649 for (const Edge
*E
: N
->OutgoingEdges
) {
650 if (!E
->Src
|| !E
->Dst
|| (E
->Src
!= N
) ||
651 (find(E
->Dst
->IncomingEdges
, E
) == E
->Dst
->IncomingEdges
.end())) {
652 errs() << "ill-formed outgoing edges\n";
657 const Function
&Fn
= N
->getFunction();
658 if (AMDGPU::isEntryFunctionCC(Fn
.getCallingConv())) {
659 if (N
->hasAnyIncomingEdges()) {
660 errs() << "Kernels cannot have incoming edges\n";
665 if (Fn
.isDeclaration()) {
666 errs() << "declarations shouldn't have nodes!\n";
670 auto [It
, Inserted
] = SeenFunctionNodes
.insert(&Fn
);
672 errs() << "one function has multiple nodes!\n";
677 if (ExpectedID
!= Nodes
.size()) {
678 errs() << "Node IDs out of sync!\n";
682 if (createNodesBitVector().size() != getNumNodes()) {
683 errs() << "nodes bit vector doesn't have the right size!\n";
687 // Check we respect the promise of Node::isKernel
688 BitVector BV
= createNodesBitVector();
689 for (const Node
*N
: nodes()) {
690 if (N
->isGraphEntryPoint())
691 N
->getDependencies(BV
);
694 // Ensure each function in the module has an associated node.
695 for (const auto &Fn
: M
) {
696 if (!Fn
.isDeclaration()) {
697 if (!SeenFunctionNodes
.contains(&Fn
)) {
698 errs() << "Fn has no associated node in the graph!\n";
705 errs() << "not all nodes are reachable through the graph's entry points!\n";
713 CostType
SplitGraph::calculateCost(const BitVector
&BV
) const {
715 for (unsigned NodeID
: BV
.set_bits())
716 Cost
+= getNode(NodeID
).getIndividualCost();
721 SplitGraph::getNode(DenseMap
<const GlobalValue
*, Node
*> &Cache
,
722 const GlobalValue
&GV
) {
723 auto &N
= Cache
[&GV
];
728 bool NonCopyable
= false;
729 if (const Function
*Fn
= dyn_cast
<Function
>(&GV
)) {
730 NonCopyable
= isNonCopyable(*Fn
);
731 Cost
= CostMap
.at(Fn
);
733 N
= new (NodesPool
.Allocate()) Node(Nodes
.size(), GV
, Cost
, NonCopyable
);
735 assert(&getNode(N
->getID()) == N
);
739 const SplitGraph::Edge
&SplitGraph::createEdge(Node
&Src
, Node
&Dst
,
741 const Edge
*E
= new (EdgesPool
.Allocate
<Edge
>(1)) Edge(&Src
, &Dst
, EK
);
742 Src
.OutgoingEdges
.push_back(E
);
743 Dst
.IncomingEdges
.push_back(E
);
747 //===----------------------------------------------------------------------===//
749 //===----------------------------------------------------------------------===//
751 /// Represents a module splitting proposal.
753 /// Proposals are made of N BitVectors, one for each partition, where each bit
754 /// set indicates that the node is present and should be copied inside that
757 /// Proposals have several metrics attached so they can be compared/sorted,
758 /// which allows the driver to try multiple strategies, resulting in multiple proposals
759 /// and choose the best one out of them.
760 class SplitProposal
{
762 SplitProposal(const SplitGraph
&SG
, unsigned MaxPartitions
) : SG(&SG
) {
763 Partitions
.resize(MaxPartitions
, {0, SG
.createNodesBitVector()});
766 void setName(StringRef NewName
) { Name
= NewName
; }
767 StringRef
getName() const { return Name
; }
769 const BitVector
&operator[](unsigned PID
) const {
770 return Partitions
[PID
].second
;
773 void add(unsigned PID
, const BitVector
&BV
) {
774 Partitions
[PID
].second
|= BV
;
778 void print(raw_ostream
&OS
) const;
779 LLVM_DUMP_METHOD
void dump() const { print(dbgs()); }
781 // Find the cheapest partition (lowest cost). In case of ties, always returns
782 // the highest partition number.
783 unsigned findCheapestPartition() const;
785 /// Calculate the CodeSize and Bottleneck scores.
786 void calculateScores();
789 void verifyCompleteness() const;
792 /// Only available after \ref calculateScores is called.
794 /// A positive number indicating the % of code duplication that this proposal
795 /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by
796 /// duplicating some functions across partitions.
798 /// Value is always rounded up to 3 decimal places.
800 /// A perfect score would be 0.0, and anything approaching 1.0 is very bad.
801 double getCodeSizeScore() const { return CodeSizeScore
; }
803 /// Only available after \ref calculateScores is called.
805 /// A number between [0, 1] which indicates how big of a bottleneck is
806 /// expected from the largest partition.
808 /// A score of 1.0 means the biggest partition is as big as the source module,
809 /// so build time will be equal to or greater than the build time of the
812 /// Value is always rounded up to 3 decimal places.
814 /// This is one of the metrics used to estimate this proposal's build time.
815 double getBottleneckScore() const { return BottleneckScore
; }
818 void updateScore(unsigned PID
) {
820 for (auto &[PCost
, Nodes
] : Partitions
) {
822 PCost
= SG
->calculateCost(Nodes
);
827 /// \see getCodeSizeScore
828 double CodeSizeScore
= 0.0;
829 /// \see getBottleneckScore
830 double BottleneckScore
= 0.0;
831 /// Aggregated cost of all partitions
832 CostType TotalCost
= 0;
834 const SplitGraph
*SG
= nullptr;
837 std::vector
<std::pair
<CostType
, BitVector
>> Partitions
;
840 void SplitProposal::print(raw_ostream
&OS
) const {
843 OS
<< "[proposal] " << Name
<< ", total cost:" << TotalCost
844 << ", code size score:" << format("%0.3f", CodeSizeScore
)
845 << ", bottleneck score:" << format("%0.3f", BottleneckScore
) << '\n';
846 for (const auto &[PID
, Part
] : enumerate(Partitions
)) {
847 const auto &[Cost
, NodeIDs
] = Part
;
848 OS
<< " - P" << PID
<< " nodes:" << NodeIDs
.count() << " cost: " << Cost
849 << '|' << formatRatioOf(Cost
, SG
->getModuleCost()) << "%\n";
853 unsigned SplitProposal::findCheapestPartition() const {
854 assert(!Partitions
.empty());
855 CostType CurCost
= std::numeric_limits
<CostType
>::max();
856 unsigned CurPID
= InvalidPID
;
857 for (const auto &[Idx
, Part
] : enumerate(Partitions
)) {
858 if (Part
.first
<= CurCost
) {
860 CurCost
= Part
.first
;
863 assert(CurPID
!= InvalidPID
);
867 void SplitProposal::calculateScores() {
868 if (Partitions
.empty())
872 CostType LargestPCost
= 0;
873 for (auto &[PCost
, Nodes
] : Partitions
) {
874 if (PCost
> LargestPCost
)
875 LargestPCost
= PCost
;
878 CostType ModuleCost
= SG
->getModuleCost();
879 CodeSizeScore
= double(TotalCost
) / ModuleCost
;
880 assert(CodeSizeScore
>= 0.0);
882 BottleneckScore
= double(LargestPCost
) / ModuleCost
;
884 CodeSizeScore
= std::ceil(CodeSizeScore
* 100.0) / 100.0;
885 BottleneckScore
= std::ceil(BottleneckScore
* 100.0) / 100.0;
889 void SplitProposal::verifyCompleteness() const {
890 if (Partitions
.empty())
893 BitVector Result
= Partitions
[0].second
;
894 for (const auto &P
: drop_begin(Partitions
))
896 assert(Result
.all() && "some nodes are missing from this proposal!");
900 //===-- RecursiveSearchStrategy -------------------------------------------===//
902 /// Partitioning algorithm.
904 /// This is a recursive search algorithm that can explore multiple possiblities.
906 /// When a cluster of nodes can go into more than one partition, and we haven't
907 /// reached maximum search depth, we recurse and explore both options and their
908 /// consequences. Both branches will yield a proposal, and the driver will grade
909 /// both and choose the best one.
911 /// If max depth is reached, we will use some heuristics to make a choice. Most
912 /// of the time we will just use the least-pressured (cheapest) partition, but
913 /// if a cluster is particularly big and there is a good amount of overlap with
914 /// an existing partition, we will choose that partition instead.
915 class RecursiveSearchSplitting
{
917 using SubmitProposalFn
= function_ref
<void(SplitProposal
)>;
919 RecursiveSearchSplitting(const SplitGraph
&SG
, unsigned NumParts
,
920 SubmitProposalFn SubmitProposal
);
925 struct WorkListEntry
{
926 WorkListEntry(const BitVector
&BV
) : Cluster(BV
) {}
928 unsigned NumNonEntryNodes
= 0;
929 CostType TotalCost
= 0;
930 CostType CostExcludingGraphEntryPoints
= 0;
934 /// Collects all graph entry points's clusters and sort them so the most
935 /// expensive clusters are viewed first. This will merge clusters together if
936 /// they share a non-copyable dependency.
937 void setupWorkList();
939 /// Recursive function that assigns the worklist item at \p Idx into a
940 /// partition of \p SP.
942 /// \p Depth is the current search depth. When this value is equal to
943 /// \ref MaxDepth, we can no longer recurse.
945 /// This function only recurses if there is more than one possible assignment,
946 /// otherwise it is iterative to avoid creating a call stack that is as big as
948 void pickPartition(unsigned Depth
, unsigned Idx
, SplitProposal SP
);
950 /// \return A pair: first element is the PID of the partition that has the
951 /// most similarities with \p Entry, or \ref InvalidPID if no partition was
952 /// found with at least one element in common. The second element is the
953 /// aggregated cost of all dependencies in common between \p Entry and that
955 std::pair
<unsigned, CostType
>
956 findMostSimilarPartition(const WorkListEntry
&Entry
, const SplitProposal
&SP
);
958 const SplitGraph
&SG
;
960 SubmitProposalFn SubmitProposal
;
962 // A Cluster is considered large when its cost, excluding entry points,
963 // exceeds this value.
964 CostType LargeClusterThreshold
= 0;
965 unsigned NumProposalsSubmitted
= 0;
966 SmallVector
<WorkListEntry
> WorkList
;
969 RecursiveSearchSplitting::RecursiveSearchSplitting(
970 const SplitGraph
&SG
, unsigned NumParts
, SubmitProposalFn SubmitProposal
)
971 : SG(SG
), NumParts(NumParts
), SubmitProposal(SubmitProposal
) {
972 // arbitrary max value as a safeguard. Anything above 10 will already be
973 // slow, this is just a max value to prevent extreme resource exhaustion or
974 // unbounded run time.
976 report_fatal_error("[amdgpu-split-module] search depth of " +
977 Twine(MaxDepth
) + " is too high!");
978 LargeClusterThreshold
=
979 (LargeFnFactor
!= 0.0)
980 ? CostType(((SG
.getModuleCost() / NumParts
) * LargeFnFactor
))
981 : std::numeric_limits
<CostType
>::max();
982 LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
983 << LargeClusterThreshold
<< "\n");
986 void RecursiveSearchSplitting::run() {
988 SplitModuleTimer
SMT("recursive_search_prepare", "preparing worklist");
993 SplitModuleTimer
SMT("recursive_search_pick", "partitioning");
994 SplitProposal
SP(SG
, NumParts
);
995 pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP
);
999 void RecursiveSearchSplitting::setupWorkList() {
1000 // e.g. if A and B are two worklist item, and they both call a non copyable
1001 // dependency C, this does:
1004 // => NodeEC will create a single group (A, B, C) and we create a new
1005 // WorkList entry for that group.
1007 EquivalenceClasses
<unsigned> NodeEC
;
1008 for (const SplitGraph::Node
*N
: SG
.nodes()) {
1009 if (!N
->isGraphEntryPoint())
1012 NodeEC
.insert(N
->getID());
1013 N
->visitAllDependencies([&](const SplitGraph::Node
&Dep
) {
1014 if (&Dep
!= N
&& Dep
.isNonCopyable())
1015 NodeEC
.unionSets(N
->getID(), Dep
.getID());
1019 for (auto I
= NodeEC
.begin(), E
= NodeEC
.end(); I
!= E
; ++I
) {
1023 BitVector Cluster
= SG
.createNodesBitVector();
1024 for (auto MI
= NodeEC
.member_begin(I
); MI
!= NodeEC
.member_end(); ++MI
) {
1025 const SplitGraph::Node
&N
= SG
.getNode(*MI
);
1026 if (N
.isGraphEntryPoint())
1027 N
.getDependencies(Cluster
);
1029 WorkList
.emplace_back(std::move(Cluster
));
1032 // Calculate costs and other useful information.
1033 for (WorkListEntry
&Entry
: WorkList
) {
1034 for (unsigned NodeID
: Entry
.Cluster
.set_bits()) {
1035 const SplitGraph::Node
&N
= SG
.getNode(NodeID
);
1036 const CostType Cost
= N
.getIndividualCost();
1038 Entry
.TotalCost
+= Cost
;
1039 if (!N
.isGraphEntryPoint()) {
1040 Entry
.CostExcludingGraphEntryPoints
+= Cost
;
1041 ++Entry
.NumNonEntryNodes
;
1046 stable_sort(WorkList
, [](const WorkListEntry
&A
, const WorkListEntry
&B
) {
1047 if (A
.TotalCost
!= B
.TotalCost
)
1048 return A
.TotalCost
> B
.TotalCost
;
1050 if (A
.CostExcludingGraphEntryPoints
!= B
.CostExcludingGraphEntryPoints
)
1051 return A
.CostExcludingGraphEntryPoints
> B
.CostExcludingGraphEntryPoints
;
1053 if (A
.NumNonEntryNodes
!= B
.NumNonEntryNodes
)
1054 return A
.NumNonEntryNodes
> B
.NumNonEntryNodes
;
1056 return A
.Cluster
.count() > B
.Cluster
.count();
1060 dbgs() << "[recursive search] worklist:\n";
1061 for (const auto &[Idx
, Entry
] : enumerate(WorkList
)) {
1062 dbgs() << " - [" << Idx
<< "]: ";
1063 for (unsigned NodeID
: Entry
.Cluster
.set_bits())
1064 dbgs() << NodeID
<< " ";
1065 dbgs() << "(total_cost:" << Entry
.TotalCost
1066 << ", cost_excl_entries:" << Entry
.CostExcludingGraphEntryPoints
1072 void RecursiveSearchSplitting::pickPartition(unsigned Depth
, unsigned Idx
,
1074 while (Idx
< WorkList
.size()) {
1075 // Step 1: Determine candidate PIDs.
1077 const WorkListEntry
&Entry
= WorkList
[Idx
];
1078 const BitVector
&Cluster
= Entry
.Cluster
;
1080 // Default option is to do load-balancing, AKA assign to least pressured
1082 const unsigned CheapestPID
= SP
.findCheapestPartition();
1083 assert(CheapestPID
!= InvalidPID
);
1085 // Explore assigning to the kernel that contains the most dependencies in
1087 const auto [MostSimilarPID
, SimilarDepsCost
] =
1088 findMostSimilarPartition(Entry
, SP
);
1090 // We can chose to explore only one path if we only have one valid path, or
1091 // if we reached maximum search depth and can no longer branch out.
1092 unsigned SinglePIDToTry
= InvalidPID
;
1093 if (MostSimilarPID
== InvalidPID
) // no similar PID found
1094 SinglePIDToTry
= CheapestPID
;
1095 else if (MostSimilarPID
== CheapestPID
) // both landed on the same PID
1096 SinglePIDToTry
= CheapestPID
;
1097 else if (Depth
>= MaxDepth
) {
1098 // We have to choose one path. Use a heuristic to guess which one will be
1099 // more appropriate.
1100 if (Entry
.CostExcludingGraphEntryPoints
> LargeClusterThreshold
) {
1101 // Check if the amount of code in common makes it worth it.
1102 assert(SimilarDepsCost
&& Entry
.CostExcludingGraphEntryPoints
);
1103 const double Ratio
=
1104 SimilarDepsCost
/ Entry
.CostExcludingGraphEntryPoints
;
1105 assert(Ratio
>= 0.0 && Ratio
<= 1.0);
1106 if (LargeFnOverlapForMerge
> Ratio
) {
1107 // For debug, just print "L", so we'll see "L3=P3" for instance, which
1108 // will mean we reached max depth and chose P3 based on this
1110 LLVM_DEBUG(dbgs() << 'L');
1111 SinglePIDToTry
= MostSimilarPID
;
1114 SinglePIDToTry
= CheapestPID
;
1117 // Step 2: Explore candidates.
1119 // When we only explore one possible path, and thus branch depth doesn't
1120 // increase, do not recurse, iterate instead.
1121 if (SinglePIDToTry
!= InvalidPID
) {
1122 LLVM_DEBUG(dbgs() << Idx
<< "=P" << SinglePIDToTry
<< ' ');
1123 // Only one path to explore, don't clone SP, don't increase depth.
1124 SP
.add(SinglePIDToTry
, Cluster
);
1129 assert(MostSimilarPID
!= InvalidPID
);
1131 // We explore multiple paths: recurse at increased depth, then stop this
1134 LLVM_DEBUG(dbgs() << '\n');
1136 // lb = load balancing = put in cheapest partition
1138 SplitProposal BranchSP
= SP
;
1139 LLVM_DEBUG(dbgs().indent(Depth
)
1140 << " [lb] " << Idx
<< "=P" << CheapestPID
<< "? ");
1141 BranchSP
.add(CheapestPID
, Cluster
);
1142 pickPartition(Depth
+ 1, Idx
+ 1, BranchSP
);
1145 // ms = most similar = put in partition with the most in common
1147 SplitProposal BranchSP
= SP
;
1148 LLVM_DEBUG(dbgs().indent(Depth
)
1149 << " [ms] " << Idx
<< "=P" << MostSimilarPID
<< "? ");
1150 BranchSP
.add(MostSimilarPID
, Cluster
);
1151 pickPartition(Depth
+ 1, Idx
+ 1, BranchSP
);
1157 // Step 3: If we assigned all WorkList items, submit the proposal.
1159 assert(Idx
== WorkList
.size());
1160 assert(NumProposalsSubmitted
<= (2u << MaxDepth
) &&
1161 "Search got out of bounds?");
1162 SP
.setName("recursive_search (depth=" + std::to_string(Depth
) + ") #" +
1163 std::to_string(NumProposalsSubmitted
++));
1164 LLVM_DEBUG(dbgs() << '\n');
1168 std::pair
<unsigned, CostType
>
1169 RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry
&Entry
,
1170 const SplitProposal
&SP
) {
1171 if (!Entry
.NumNonEntryNodes
)
1172 return {InvalidPID
, 0};
1174 // We take the partition that is the most similar using Cost as a metric.
1175 // So we take the set of nodes in common, compute their aggregated cost, and
1176 // pick the partition with the highest cost in common.
1177 unsigned ChosenPID
= InvalidPID
;
1178 CostType ChosenCost
= 0;
1179 for (unsigned PID
= 0; PID
< NumParts
; ++PID
) {
1180 BitVector BV
= SP
[PID
];
1181 BV
&= Entry
.Cluster
; // FIXME: & doesn't work between BVs?!
1186 const CostType Cost
= SG
.calculateCost(BV
);
1188 if (ChosenPID
== InvalidPID
|| ChosenCost
< Cost
||
1189 (ChosenCost
== Cost
&& PID
> ChosenPID
)) {
1195 return {ChosenPID
, ChosenCost
};
1198 //===----------------------------------------------------------------------===//
1199 // DOTGraph Printing Support
1200 //===----------------------------------------------------------------------===//
1202 const SplitGraph::Node
*mapEdgeToDst(const SplitGraph::Edge
*E
) {
1206 using SplitGraphEdgeDstIterator
=
1207 mapped_iterator
<SplitGraph::edges_iterator
, decltype(&mapEdgeToDst
)>;
1211 template <> struct GraphTraits
<SplitGraph
> {
1212 using NodeRef
= const SplitGraph::Node
*;
1213 using nodes_iterator
= SplitGraph::nodes_iterator
;
1214 using ChildIteratorType
= SplitGraphEdgeDstIterator
;
1216 using EdgeRef
= const SplitGraph::Edge
*;
1217 using ChildEdgeIteratorType
= SplitGraph::edges_iterator
;
1219 static NodeRef
getEntryNode(NodeRef N
) { return N
; }
1221 static ChildIteratorType
child_begin(NodeRef Ref
) {
1222 return {Ref
->outgoing_edges().begin(), mapEdgeToDst
};
1224 static ChildIteratorType
child_end(NodeRef Ref
) {
1225 return {Ref
->outgoing_edges().end(), mapEdgeToDst
};
1228 static nodes_iterator
nodes_begin(const SplitGraph
&G
) {
1229 return G
.nodes().begin();
1231 static nodes_iterator
nodes_end(const SplitGraph
&G
) {
1232 return G
.nodes().end();
1236 template <> struct DOTGraphTraits
<SplitGraph
> : public DefaultDOTGraphTraits
{
1237 DOTGraphTraits(bool IsSimple
= false) : DefaultDOTGraphTraits(IsSimple
) {}
1239 static std::string
getGraphName(const SplitGraph
&SG
) {
1240 return SG
.getModule().getName().str();
1243 std::string
getNodeLabel(const SplitGraph::Node
*N
, const SplitGraph
&SG
) {
1244 return N
->getName().str();
1247 static std::string
getNodeDescription(const SplitGraph::Node
*N
,
1248 const SplitGraph
&SG
) {
1250 if (N
->isEntryFunctionCC())
1251 Result
+= "entry-fn-cc ";
1252 if (N
->isNonCopyable())
1253 Result
+= "non-copyable ";
1254 Result
+= "cost:" + std::to_string(N
->getIndividualCost());
1258 static std::string
getNodeAttributes(const SplitGraph::Node
*N
,
1259 const SplitGraph
&SG
) {
1260 return N
->hasAnyIncomingEdges() ? "" : "color=\"red\"";
1263 static std::string
getEdgeAttributes(const SplitGraph::Node
*N
,
1264 SplitGraphEdgeDstIterator EI
,
1265 const SplitGraph
&SG
) {
1267 switch ((*EI
.getCurrent())->Kind
) {
1268 case SplitGraph::EdgeKind::DirectCall
:
1270 case SplitGraph::EdgeKind::IndirectCall
:
1271 return "style=\"dashed\"";
1273 llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
1277 //===----------------------------------------------------------------------===//
1279 //===----------------------------------------------------------------------===//
1283 // If we didn't externalize GVs, then local GVs need to be conservatively
1284 // imported into every module (including their initializers), and then cleaned
1286 static bool needsConservativeImport(const GlobalValue
*GV
) {
1287 if (const auto *Var
= dyn_cast
<GlobalVariable
>(GV
))
1288 return Var
->hasLocalLinkage();
1289 return isa
<GlobalAlias
>(GV
);
1292 /// Prints a summary of the partition \p N, represented by module \p M, to \p
1294 static void printPartitionSummary(raw_ostream
&OS
, unsigned N
, const Module
&M
,
1295 unsigned PartCost
, unsigned ModuleCost
) {
1296 OS
<< "*** Partition P" << N
<< " ***\n";
1298 for (const auto &Fn
: M
) {
1299 if (!Fn
.isDeclaration())
1300 OS
<< " - [function] " << Fn
.getName() << "\n";
1303 for (const auto &GV
: M
.globals()) {
1304 if (GV
.hasInitializer())
1305 OS
<< " - [global] " << GV
.getName() << "\n";
1308 OS
<< "Partition contains " << formatRatioOf(PartCost
, ModuleCost
)
1309 << "% of the source\n";
1312 static void evaluateProposal(SplitProposal
&Best
, SplitProposal New
) {
1313 SplitModuleTimer
SMT("proposal_evaluation", "proposal ranking algorithm");
1316 New
.verifyCompleteness();
1317 if (DebugProposalSearch
)
1321 const double CurBScore
= Best
.getBottleneckScore();
1322 const double CurCSScore
= Best
.getCodeSizeScore();
1323 const double NewBScore
= New
.getBottleneckScore();
1324 const double NewCSScore
= New
.getCodeSizeScore();
1326 // TODO: Improve this
1327 // We can probably lower the precision of the comparison at first
1329 // - (Current): BScore: 0.489 CSCore 1.105
1330 // - (New): BScore: 0.475 CSCore 1.305
1331 // Currently we'd choose the new one because the bottleneck score is
1332 // lower, but the new one duplicates more code. It may be worth it to
1333 // discard the new proposal as the impact on build time is negligible.
1336 bool IsBest
= false;
1337 if (NewBScore
< CurBScore
)
1339 else if (NewBScore
== CurBScore
)
1340 IsBest
= (NewCSScore
< CurCSScore
); // Use code size as tie breaker.
1343 Best
= std::move(New
);
1345 LLVM_DEBUG(if (DebugProposalSearch
) {
1347 dbgs() << "[search] new best proposal!\n";
1349 dbgs() << "[search] discarding - not profitable\n";
1353 /// Trivial helper to create an identical copy of \p M.
1354 static std::unique_ptr
<Module
> cloneAll(const Module
&M
) {
1355 ValueToValueMapTy VMap
;
1356 return CloneModule(M
, VMap
, [&](const GlobalValue
*GV
) { return true; });
1359 /// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
1360 static void writeDOTGraph(const SplitGraph
&SG
) {
1361 if (ModuleDotCfgOutput
.empty())
1365 raw_fd_ostream
OS(ModuleDotCfgOutput
, EC
);
1367 errs() << "[" DEBUG_TYPE
"]: cannot open '" << ModuleDotCfgOutput
1368 << "' - DOTGraph will not be printed\n";
1370 WriteGraph(OS
, SG
, /*ShortName=*/false,
1371 /*Title=*/SG
.getModule().getName());
1374 static void splitAMDGPUModule(
1375 GetTTIFn GetTTI
, Module
&M
, unsigned NumParts
,
1376 function_ref
<void(std::unique_ptr
<Module
> MPart
)> ModuleCallback
) {
1379 // Externalize functions whose address are taken.
1381 // This is needed because partitioning is purely based on calls, but sometimes
1382 // a kernel/function may just look at the address of another local function
1383 // and not do anything (no calls). After partitioning, that local function may
1384 // end up in a different module (so it's just a declaration in the module
1385 // where its address is taken), which emits a "undefined hidden symbol" linker
1388 // Additionally, it guides partitioning to not duplicate this function if it's
1389 // called directly at some point.
1391 // TODO: Could we be smarter about this ? This makes all functions whose
1392 // addresses are taken non-copyable. We should probably model this type of
1393 // constraint in the graph and use it to guide splitting, instead of
1394 // externalizing like this. Maybe non-copyable should really mean "keep one
1395 // visible copy, then internalize all other copies" for some functions?
1396 if (!NoExternalizeOnAddrTaken
) {
1397 for (auto &Fn
: M
) {
1398 // TODO: Should aliases count? Probably not but they're so rare I'm not
1399 // sure it's worth fixing.
1400 if (Fn
.hasLocalLinkage() && Fn
.hasAddressTaken()) {
1401 LLVM_DEBUG(dbgs() << "[externalize] "; Fn
.printAsOperand(dbgs());
1402 dbgs() << " because its address is taken\n");
1408 // Externalize local GVs, which avoids duplicating their initializers, which
1409 // in turns helps keep code size in check.
1410 if (!NoExternalizeGlobals
) {
1411 for (auto &GV
: M
.globals()) {
1412 if (GV
.hasLocalLinkage())
1413 LLVM_DEBUG(dbgs() << "[externalize] GV " << GV
.getName() << '\n');
1418 // Start by calculating the cost of every function in the module, as well as
1419 // the module's overall cost.
1420 FunctionsCostMap FnCosts
;
1421 const CostType ModuleCost
= calculateFunctionCosts(GetTTI
, M
, FnCosts
);
1423 // Build the SplitGraph, which represents the module's functions and models
1424 // their dependencies accurately.
1425 SplitGraph
SG(M
, FnCosts
, ModuleCost
);
1431 << "[!] no nodes in graph, input is empty - no splitting possible\n");
1432 ModuleCallback(cloneAll(M
));
1437 dbgs() << "[graph] nodes:\n";
1438 for (const SplitGraph::Node
*N
: SG
.nodes()) {
1439 dbgs() << " - [" << N
->getID() << "]: " << N
->getName() << " "
1440 << (N
->isGraphEntryPoint() ? "(entry)" : "") << " "
1441 << (N
->isNonCopyable() ? "(noncopyable)" : "") << "\n";
1447 LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
1449 std::optional
<SplitProposal
> Proposal
;
1450 const auto EvaluateProposal
= [&](SplitProposal SP
) {
1451 SP
.calculateScores();
1453 Proposal
= std::move(SP
);
1455 evaluateProposal(*Proposal
, std::move(SP
));
1458 // TODO: It would be very easy to create new strategies by just adding a base
1459 // class to RecursiveSearchSplitting and abstracting it away.
1460 RecursiveSearchSplitting(SG
, NumParts
, EvaluateProposal
).run();
1461 LLVM_DEBUG(if (Proposal
) dbgs() << "[search done] selected proposal: "
1462 << Proposal
->getName() << "\n";);
1465 LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
1466 ModuleCallback(cloneAll(M
));
1470 LLVM_DEBUG(Proposal
->print(dbgs()););
1472 std::optional
<raw_fd_ostream
> SummariesOS
;
1473 if (!PartitionSummariesOutput
.empty()) {
1475 SummariesOS
.emplace(PartitionSummariesOutput
, EC
);
1477 errs() << "[" DEBUG_TYPE
"]: cannot open '" << PartitionSummariesOutput
1478 << "' - Partition summaries will not be printed\n";
1481 for (unsigned PID
= 0; PID
< NumParts
; ++PID
) {
1482 SplitModuleTimer
SMT2("modules_creation",
1483 "creating modules for each partition");
1484 LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
1486 DenseSet
<const Function
*> FnsInPart
;
1487 for (unsigned NodeID
: (*Proposal
)[PID
].set_bits())
1488 FnsInPart
.insert(&SG
.getNode(NodeID
).getFunction());
1490 ValueToValueMapTy VMap
;
1491 CostType PartCost
= 0;
1492 std::unique_ptr
<Module
> MPart(
1493 CloneModule(M
, VMap
, [&](const GlobalValue
*GV
) {
1494 // Functions go in their assigned partition.
1495 if (const auto *Fn
= dyn_cast
<Function
>(GV
)) {
1496 if (FnsInPart
.contains(Fn
)) {
1497 PartCost
+= SG
.getCost(*Fn
);
1503 // Everything else goes in the first partition.
1504 return needsConservativeImport(GV
) || PID
== 0;
1507 // FIXME: Aliases aren't seen often, and their handling isn't perfect so
1508 // bugs are possible.
1510 // Clean-up conservatively imported GVs without any users.
1511 for (auto &GV
: make_early_inc_range(MPart
->global_values())) {
1512 if (needsConservativeImport(&GV
) && GV
.use_empty())
1513 GV
.eraseFromParent();
1517 printPartitionSummary(*SummariesOS
, PID
, *MPart
, PartCost
, ModuleCost
);
1520 printPartitionSummary(dbgs(), PID
, *MPart
, PartCost
, ModuleCost
));
1522 ModuleCallback(std::move(MPart
));
1527 PreservedAnalyses
AMDGPUSplitModulePass::run(Module
&M
,
1528 ModuleAnalysisManager
&MAM
) {
1529 SplitModuleTimer
SMT(
1530 "total", "total pass runtime (incl. potentially waiting for lockfile)");
1532 FunctionAnalysisManager
&FAM
=
1533 MAM
.getResult
<FunctionAnalysisManagerModuleProxy
>(M
).getManager();
1534 const auto TTIGetter
= [&FAM
](Function
&F
) -> const TargetTransformInfo
& {
1535 return FAM
.getResult
<TargetIRAnalysis
>(F
);
1541 SmallString
<128> LockFilePath
;
1542 sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath
);
1543 sys::path::append(LockFilePath
, "amdgpu-split-module-debug");
1544 LLVM_DEBUG(dbgs() << DEBUG_TYPE
" using lockfile '" << LockFilePath
1548 llvm::LockFileManager
Locked(LockFilePath
.str());
1550 case LockFileManager::LFS_Error
:
1552 dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
1553 "output may be mangled by other processes\n");
1554 Locked
.unsafeRemoveLockFile();
1556 case LockFileManager::LFS_Owned
:
1558 case LockFileManager::LFS_Shared
: {
1559 switch (Locked
.waitForUnlock()) {
1560 case LockFileManager::Res_Success
:
1562 case LockFileManager::Res_OwnerDied
:
1563 continue; // try again to get the lock.
1564 case LockFileManager::Res_Timeout
:
1567 << "[amdgpu-split-module] unable to acquire lockfile, debug "
1568 "output may be mangled by other processes\n");
1569 Locked
.unsafeRemoveLockFile();
1576 splitAMDGPUModule(TTIGetter
, M
, N
, ModuleCallback
);
1584 splitAMDGPUModule(TTIGetter
, M
, N
, ModuleCallback
);
1586 // We can change linkage/visibilities in the input, consider that nothing is
1587 // preserved just to be safe. This pass runs last anyway.
1588 return PreservedAnalyses::none();