1 //===- AMDGPUSplitModule.cpp ----------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 /// \file Implements a module splitting algorithm designed to support the
10 /// FullLTO --lto-partitions option for parallel codegen.
12 /// The role of this module splitting pass is the same as
13 /// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
14 /// across a set of N partitions to allow for parallel codegen.
16 /// The similarities mostly end here, as this pass achieves load-balancing in a
17 /// more elaborate fashion which is targeted towards AMDGPU modules. It can take
18 /// advantage of the structure of AMDGPU modules (which are mostly
19 /// self-contained) to allow for more efficient splitting without affecting
20 /// codegen negatively, or causing inaccurate resource usage analysis.
22 /// High-level pass overview:
23 /// - SplitGraph & associated classes
24 /// - Graph representation of the module and of the dependencies that
25 /// matter for splitting.
26 /// - RecursiveSearchSplitting
27 /// - Core splitting algorithm.
29 /// - Represents a suggested solution for splitting the input module. These
30 /// solutions can be scored to determine the best one when multiple
31 /// solutions are available.
32 /// - Driver/pass "run" function glues everything together.
34 #include "AMDGPUSplitModule.h"
35 #include "AMDGPUTargetMachine.h"
36 #include "Utils/AMDGPUBaseInfo.h"
37 #include "llvm/ADT/DenseMap.h"
38 #include "llvm/ADT/EquivalenceClasses.h"
39 #include "llvm/ADT/GraphTraits.h"
40 #include "llvm/ADT/SmallVector.h"
41 #include "llvm/ADT/StringExtras.h"
42 #include "llvm/ADT/StringRef.h"
43 #include "llvm/Analysis/CallGraph.h"
44 #include "llvm/Analysis/TargetTransformInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/InstIterator.h"
47 #include "llvm/IR/Instruction.h"
48 #include "llvm/IR/Module.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Support/Allocator.h"
51 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/DOTGraphTraits.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/GraphWriter.h"
55 #include "llvm/Support/Path.h"
56 #include "llvm/Support/Timer.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include "llvm/Transforms/Utils/Cloning.h"
66 #include "llvm/Support/LockFileManager.h"
69 #define DEBUG_TYPE "amdgpu-split-module"
74 static cl::opt
<unsigned> MaxDepth(
75 "amdgpu-module-splitting-max-depth",
77 "maximum search depth. 0 forces a greedy approach. "
78 "warning: the algorithm is up to O(2^N), where N is the max depth."),
81 static cl::opt
<float> LargeFnFactor(
82 "amdgpu-module-splitting-large-threshold", cl::init(2.0f
), cl::Hidden
,
84 "when max depth is reached and we can no longer branch out, this "
85 "value determines if a function is worth merging into an already "
86 "existing partition to reduce code duplication. This is a factor "
87 "of the ideal partition size, e.g. 2.0 means we consider the "
88 "function for merging if its cost (including its callees) is 2x the "
89 "size of an ideal partition."));
91 static cl::opt
<float> LargeFnOverlapForMerge(
92 "amdgpu-module-splitting-merge-threshold", cl::init(0.7f
), cl::Hidden
,
93 cl::desc("when a function is considered for merging into a partition that "
94 "already contains some of its callees, do the merge if at least "
95 "n% of the code it can reach is already present inside the "
96 "partition; e.g. 0.7 means only merge >70%"));
98 static cl::opt
<bool> NoExternalizeGlobals(
99 "amdgpu-module-splitting-no-externalize-globals", cl::Hidden
,
100 cl::desc("disables externalization of global variable with local linkage; "
101 "may cause globals to be duplicated which increases binary size"));
103 static cl::opt
<bool> NoExternalizeOnAddrTaken(
104 "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden
,
106 "disables externalization of functions whose addresses are taken"));
108 static cl::opt
<std::string
>
109 ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
111 cl::desc("output file to write out the dotgraph "
112 "representation of the input module"));
114 static cl::opt
<std::string
> PartitionSummariesOutput(
115 "amdgpu-module-splitting-print-partition-summaries", cl::Hidden
,
116 cl::desc("output file to write out a summary of "
117 "the partitions created for each module"));
121 UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden
,
122 cl::desc("use a lock file so only one process in the system "
123 "can run this pass at once. useful to avoid mangled "
124 "debug output in multithreaded environments."));
127 DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
129 cl::desc("print all proposals received and whether "
130 "they were rejected or accepted"));
133 struct SplitModuleTimer
: NamedRegionTimer
{
134 SplitModuleTimer(StringRef Name
, StringRef Desc
)
135 : NamedRegionTimer(Name
, Desc
, DEBUG_TYPE
, "AMDGPU Module Splitting",
136 TimePassesIsEnabled
) {}
139 //===----------------------------------------------------------------------===//
141 //===----------------------------------------------------------------------===//
143 using CostType
= InstructionCost::CostType
;
144 using FunctionsCostMap
= DenseMap
<const Function
*, CostType
>;
145 using GetTTIFn
= function_ref
<const TargetTransformInfo
&(Function
&)>;
146 static constexpr unsigned InvalidPID
= -1;
148 /// \param Num numerator
149 /// \param Dem denominator
150 /// \returns a printable object to print (Num/Dem) using "%0.2f".
151 static auto formatRatioOf(CostType Num
, CostType Dem
) {
152 return format("%0.2f", (static_cast<double>(Num
) / Dem
) * 100);
155 /// Checks whether a given function is non-copyable.
157 /// Non-copyable functions cannot be cloned into multiple partitions, and only
158 /// one copy of the function can be present across all partitions.
160 /// External functions fall into this category. If we were to clone them, we
161 /// would end up with multiple symbol definitions and a very unhappy linker.
162 static bool isNonCopyable(const Function
&F
) {
163 assert(AMDGPU::isEntryFunctionCC(F
.getCallingConv())
164 ? F
.hasExternalLinkage()
165 : true && "Kernel w/o external linkage?");
166 return F
.hasExternalLinkage() || !F
.isDefinitionExact();
169 /// If \p GV has local linkage, make it external + hidden.
170 static void externalize(GlobalValue
&GV
) {
171 if (GV
.hasLocalLinkage()) {
172 GV
.setLinkage(GlobalValue::ExternalLinkage
);
173 GV
.setVisibility(GlobalValue::HiddenVisibility
);
176 // Unnamed entities must be named consistently between modules. setName will
177 // give a distinct name to each such entity.
179 GV
.setName("__llvmsplit_unnamed");
182 /// Cost analysis function. Calculates the cost of each function in \p M
184 /// \param GetTTI Abstract getter for TargetTransformInfo.
185 /// \param M Module to analyze.
186 /// \param CostMap[out] Resulting Function -> Cost map.
187 /// \return The module's total cost.
188 static CostType
calculateFunctionCosts(GetTTIFn GetTTI
, Module
&M
,
189 FunctionsCostMap
&CostMap
) {
190 SplitModuleTimer
SMT("calculateFunctionCosts", "cost analysis");
192 LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
193 CostType ModuleCost
= 0;
194 [[maybe_unused
]] CostType KernelCost
= 0;
197 if (Fn
.isDeclaration())
201 const auto &TTI
= GetTTI(Fn
);
202 for (const auto &BB
: Fn
) {
203 for (const auto &I
: BB
) {
205 TTI
.getInstructionCost(&I
, TargetTransformInfo::TCK_CodeSize
);
206 assert(Cost
!= InstructionCost::getMax());
207 // Assume expensive if we can't tell the cost of an instruction.
209 Cost
.getValue().value_or(TargetTransformInfo::TCC_Expensive
);
210 assert((FnCost
+ CostVal
) >= FnCost
&& "Overflow!");
217 CostMap
[&Fn
] = FnCost
;
218 assert((ModuleCost
+ FnCost
) >= ModuleCost
&& "Overflow!");
219 ModuleCost
+= FnCost
;
221 if (AMDGPU::isEntryFunctionCC(Fn
.getCallingConv()))
222 KernelCost
+= FnCost
;
230 const CostType FnCost
= ModuleCost
- KernelCost
;
231 dbgs() << " - total module cost is " << ModuleCost
<< ". kernels cost "
232 << "" << KernelCost
<< " ("
233 << format("%0.2f", (float(KernelCost
) / ModuleCost
) * 100)
234 << "% of the module), functions cost " << FnCost
<< " ("
235 << format("%0.2f", (float(FnCost
) / ModuleCost
) * 100)
236 << "% of the module)\n";
242 /// \return true if \p F can be indirectly called
243 static bool canBeIndirectlyCalled(const Function
&F
) {
244 if (F
.isDeclaration() || AMDGPU::isEntryFunctionCC(F
.getCallingConv()))
246 return !F
.hasLocalLinkage() ||
247 F
.hasAddressTaken(/*PutOffender=*/nullptr,
248 /*IgnoreCallbackUses=*/false,
249 /*IgnoreAssumeLikeCalls=*/true,
250 /*IgnoreLLVMUsed=*/true,
251 /*IgnoreARCAttachedCall=*/false,
252 /*IgnoreCastedDirectCall=*/true);
255 //===----------------------------------------------------------------------===//
256 // Graph-based Module Representation
257 //===----------------------------------------------------------------------===//
259 /// AMDGPUSplitModule's view of the source Module, as a graph of all components
260 /// that can be split into different modules.
262 /// The most trivial instance of this graph is just the CallGraph of the module,
263 /// but it is not guaranteed that the graph is strictly equal to the CG. It
264 /// currently always is but it's designed in a way that would eventually allow
265 /// us to create abstract nodes, or nodes for different entities such as global
266 /// variables or any other meaningful constraint we must consider.
268 /// The graph is only mutable by this class, and is generally not modified
269 /// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
275 enum class EdgeKind
: uint8_t {
276 /// The nodes are related through a direct call. This is a "strong" edge as
277 /// it means the Src will directly reference the Dst.
279 /// The nodes are related through an indirect call.
280 /// This is a "weaker" edge and is only considered when traversing the graph
281 /// starting from a kernel. We need this edge for resource usage analysis.
283 /// The reason why we have this edge in the first place is due to how
284 /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
285 /// the resource usage of the kernel containing the indirect call is the
286 /// max resource usage of all functions that can be indirectly called.
290 /// An edge between two nodes. Edges are directional, and tagged with a
293 Edge(Node
*Src
, Node
*Dst
, EdgeKind Kind
)
294 : Src(Src
), Dst(Dst
), Kind(Kind
) {}
296 Node
*Src
; ///< Source
297 Node
*Dst
; ///< Destination
301 using EdgesVec
= SmallVector
<const Edge
*, 0>;
302 using edges_iterator
= EdgesVec::const_iterator
;
303 using nodes_iterator
= const Node
*const *;
305 SplitGraph(const Module
&M
, const FunctionsCostMap
&CostMap
,
307 : M(M
), CostMap(CostMap
), ModuleCost(ModuleCost
) {}
309 void buildGraph(CallGraph
&CG
);
312 bool verifyGraph() const;
315 bool empty() const { return Nodes
.empty(); }
316 const iterator_range
<nodes_iterator
> nodes() const {
317 return {Nodes
.begin(), Nodes
.end()};
319 const Node
&getNode(unsigned ID
) const { return *Nodes
[ID
]; }
321 unsigned getNumNodes() const { return Nodes
.size(); }
322 BitVector
createNodesBitVector() const { return BitVector(Nodes
.size()); }
324 const Module
&getModule() const { return M
; }
326 CostType
getModuleCost() const { return ModuleCost
; }
327 CostType
getCost(const Function
&F
) const { return CostMap
.at(&F
); }
329 /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
331 CostType
calculateCost(const BitVector
&BV
) const;
334 /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
335 /// updates \p Cache.
336 Node
&getNode(DenseMap
<const GlobalValue
*, Node
*> &Cache
,
337 const GlobalValue
&GV
);
339 // Create a new edge between two nodes and add it to both nodes.
340 const Edge
&createEdge(Node
&Src
, Node
&Dst
, EdgeKind EK
);
343 const FunctionsCostMap
&CostMap
;
346 // Final list of nodes with stable ordering.
347 SmallVector
<Node
*> Nodes
;
349 SpecificBumpPtrAllocator
<Node
> NodesPool
;
351 // Edges are trivially destructible objects, so as a small optimization we
352 // use a BumpPtrAllocator which avoids destructor calls but also makes
353 // allocation faster.
355 std::is_trivially_destructible_v
<Edge
>,
356 "Edge must be trivially destructible to use the BumpPtrAllocator");
357 BumpPtrAllocator EdgesPool
;
360 /// Nodes in the SplitGraph contain both incoming, and outgoing edges.
361 /// Incoming edges have this node as their Dst, and Outgoing ones have this node
364 /// Edge objects are shared by both nodes in Src/Dst. They provide immediate
365 /// feedback on how two nodes are related, and in which direction they are
366 /// related, which is valuable information to make splitting decisions.
368 /// Nodes are fundamentally abstract, and any consumers of the graph should
369 /// treat them as such. While a node will be a function most of the time, we
370 /// could also create nodes for any other reason. In the future, we could have
371 /// single nodes for multiple functions, or nodes for GVs, etc.
372 class SplitGraph::Node
{
373 friend class SplitGraph
;
376 Node(unsigned ID
, const GlobalValue
&GV
, CostType IndividualCost
,
378 : ID(ID
), GV(GV
), IndividualCost(IndividualCost
),
379 IsNonCopyable(IsNonCopyable
), IsEntryFnCC(false), IsGraphEntry(false) {
380 if (auto *Fn
= dyn_cast
<Function
>(&GV
))
381 IsEntryFnCC
= AMDGPU::isEntryFunctionCC(Fn
->getCallingConv());
384 /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of
385 /// nodes in the graph. This ID can be used as an index in a BitVector.
386 unsigned getID() const { return ID
; }
388 const Function
&getFunction() const { return cast
<Function
>(GV
); }
390 /// \returns the cost to import this component into a given module, not
391 /// accounting for any dependencies that may need to be imported as well.
392 CostType
getIndividualCost() const { return IndividualCost
; }
394 bool isNonCopyable() const { return IsNonCopyable
; }
395 bool isEntryFunctionCC() const { return IsEntryFnCC
; }
397 /// \returns whether this is an entry point in the graph. Entry points are
398 /// defined as follows: if you take all entry points in the graph, and iterate
399 /// their dependencies, you are guaranteed to visit all nodes in the graph at
401 bool isGraphEntryPoint() const { return IsGraphEntry
; }
403 StringRef
getName() const { return GV
.getName(); }
405 bool hasAnyIncomingEdges() const { return IncomingEdges
.size(); }
406 bool hasAnyIncomingEdgesOfKind(EdgeKind EK
) const {
407 return any_of(IncomingEdges
, [&](const auto *E
) { return E
->Kind
== EK
; });
410 bool hasAnyOutgoingEdges() const { return OutgoingEdges
.size(); }
411 bool hasAnyOutgoingEdgesOfKind(EdgeKind EK
) const {
412 return any_of(OutgoingEdges
, [&](const auto *E
) { return E
->Kind
== EK
; });
415 iterator_range
<edges_iterator
> incoming_edges() const {
416 return IncomingEdges
;
419 iterator_range
<edges_iterator
> outgoing_edges() const {
420 return OutgoingEdges
;
423 bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }
425 /// Visit all children of this node in a recursive fashion. Also visits Self.
426 /// If \ref shouldFollowIndirectCalls returns false, then this only follows
427 /// DirectCall edges.
429 /// \param Visitor Visitor Function.
430 void visitAllDependencies(std::function
<void(const Node
&)> Visitor
) const;
432 /// Adds the dependencies of this node in \p BV by setting the bit
433 /// corresponding to each node.
435 /// Implemented using \ref visitAllDependencies, hence it follows the same
436 /// rules regarding dependencies traversal.
438 /// \param[out] BV The bitvector where the bits should be set.
439 void getDependencies(BitVector
&BV
) const {
440 visitAllDependencies([&](const Node
&N
) { BV
.set(N
.getID()); });
444 void markAsGraphEntry() { IsGraphEntry
= true; }
447 const GlobalValue
&GV
;
448 CostType IndividualCost
;
449 bool IsNonCopyable
: 1;
450 bool IsEntryFnCC
: 1;
451 bool IsGraphEntry
: 1;
453 // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
455 EdgesVec IncomingEdges
;
456 EdgesVec OutgoingEdges
;
459 void SplitGraph::Node::visitAllDependencies(
460 std::function
<void(const Node
&)> Visitor
) const {
461 const bool FollowIndirect
= shouldFollowIndirectCalls();
462 // FIXME: If this can access SplitGraph in the future, use a BitVector
464 DenseSet
<const Node
*> Seen
;
465 SmallVector
<const Node
*, 8> WorkList({this});
466 while (!WorkList
.empty()) {
467 const Node
*CurN
= WorkList
.pop_back_val();
468 if (auto [It
, Inserted
] = Seen
.insert(CurN
); !Inserted
)
473 for (const Edge
*E
: CurN
->outgoing_edges()) {
474 if (!FollowIndirect
&& E
->Kind
== EdgeKind::IndirectCall
)
476 WorkList
.push_back(E
->Dst
);
481 /// Checks if \p I has MD_callees and if it does, parse it and put the function
484 /// \returns true if there was metadata and it was parsed correctly. false if
485 /// there was no MD or if it contained unknown entries and parsing failed.
486 /// If this returns false, \p Callees will contain incomplete information
487 /// and must not be used.
488 static bool handleCalleesMD(const Instruction
&I
,
489 SetVector
<Function
*> &Callees
) {
490 auto *MD
= I
.getMetadata(LLVMContext::MD_callees
);
494 for (const auto &Op
: MD
->operands()) {
495 Function
*Callee
= mdconst::extract_or_null
<Function
>(Op
);
498 Callees
.insert(Callee
);
504 void SplitGraph::buildGraph(CallGraph
&CG
) {
505 SplitModuleTimer
SMT("buildGraph", "graph construction");
508 << "[build graph] constructing graph representation of the input\n");
510 // FIXME(?): Is the callgraph really worth using if we have to iterate the
511 // function again whenever it fails to give us enough information?
513 // We build the graph by just iterating all functions in the module and
514 // working on their direct callees. At the end, all nodes should be linked
515 // together as expected.
516 DenseMap
<const GlobalValue
*, Node
*> Cache
;
517 SmallVector
<const Function
*> FnsWithIndirectCalls
, IndirectlyCallableFns
;
518 for (const Function
&Fn
: M
) {
519 if (Fn
.isDeclaration())
522 // Look at direct callees and create the necessary edges in the graph.
523 SetVector
<const Function
*> DirectCallees
;
524 bool CallsExternal
= false;
525 for (auto &CGEntry
: *CG
[&Fn
]) {
526 auto *CGNode
= CGEntry
.second
;
527 if (auto *Callee
= CGNode
->getFunction()) {
528 if (!Callee
->isDeclaration())
529 DirectCallees
.insert(Callee
);
530 } else if (CGNode
== CG
.getCallsExternalNode())
531 CallsExternal
= true;
534 // Keep track of this function if it contains an indirect call and/or if it
535 // can be indirectly called.
537 LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
538 Fn
.printAsOperand(dbgs());
539 dbgs() << " - analyzing function\n");
541 SetVector
<Function
*> KnownCallees
;
542 bool HasUnknownIndirectCall
= false;
543 for (const auto &Inst
: instructions(Fn
)) {
544 // look at all calls without a direct callee.
545 const auto *CB
= dyn_cast
<CallBase
>(&Inst
);
546 if (!CB
|| CB
->getCalledFunction())
549 // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
551 if (CB
->isInlineAsm()) {
552 LLVM_DEBUG(dbgs() << " found inline assembly\n");
556 if (handleCalleesMD(Inst
, KnownCallees
))
558 // If we failed to parse any !callees MD, or some was missing,
559 // the entire KnownCallees list is now unreliable.
560 KnownCallees
.clear();
562 // Everything else is handled conservatively. If we fall into the
563 // conservative case don't bother analyzing further.
564 HasUnknownIndirectCall
= true;
568 if (HasUnknownIndirectCall
) {
569 LLVM_DEBUG(dbgs() << " indirect call found\n");
570 FnsWithIndirectCalls
.push_back(&Fn
);
571 } else if (!KnownCallees
.empty())
572 DirectCallees
.insert(KnownCallees
.begin(), KnownCallees
.end());
575 Node
&N
= getNode(Cache
, Fn
);
576 for (const auto *Callee
: DirectCallees
)
577 createEdge(N
, getNode(Cache
, *Callee
), EdgeKind::DirectCall
);
579 if (canBeIndirectlyCalled(Fn
))
580 IndirectlyCallableFns
.push_back(&Fn
);
583 // Post-process functions with indirect calls.
584 for (const Function
*Fn
: FnsWithIndirectCalls
) {
585 for (const Function
*Candidate
: IndirectlyCallableFns
) {
586 Node
&Src
= getNode(Cache
, *Fn
);
587 Node
&Dst
= getNode(Cache
, *Candidate
);
588 createEdge(Src
, Dst
, EdgeKind::IndirectCall
);
592 // Now, find all entry points.
593 SmallVector
<Node
*, 16> CandidateEntryPoints
;
594 BitVector NodesReachableByKernels
= createNodesBitVector();
595 for (Node
*N
: Nodes
) {
596 // Functions with an Entry CC are always graph entry points too.
597 if (N
->isEntryFunctionCC()) {
598 N
->markAsGraphEntry();
599 N
->getDependencies(NodesReachableByKernels
);
600 } else if (!N
->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall
))
601 CandidateEntryPoints
.push_back(N
);
604 for (Node
*N
: CandidateEntryPoints
) {
605 // This can be another entry point if it's not reachable by a kernel
606 // TODO: We could sort all of the possible new entries in a stable order
607 // (e.g. by cost), then consume them one by one until
608 // NodesReachableByKernels is all 1s. It'd allow us to avoid
609 // considering some nodes as non-entries in some specific cases.
610 if (!NodesReachableByKernels
.test(N
->getID()))
611 N
->markAsGraphEntry();
615 assert(verifyGraph());
620 bool SplitGraph::verifyGraph() const {
621 unsigned ExpectedID
= 0;
622 // Exceptionally using a set here in case IDs are messed up.
623 DenseSet
<const Node
*> SeenNodes
;
624 DenseSet
<const Function
*> SeenFunctionNodes
;
625 for (const Node
*N
: Nodes
) {
626 if (N
->getID() != (ExpectedID
++)) {
627 errs() << "Node IDs are incorrect!\n";
631 if (!SeenNodes
.insert(N
).second
) {
632 errs() << "Node seen more than once!\n";
636 if (&getNode(N
->getID()) != N
) {
637 errs() << "getNode doesn't return the right node\n";
641 for (const Edge
*E
: N
->IncomingEdges
) {
642 if (!E
->Src
|| !E
->Dst
|| (E
->Dst
!= N
) ||
643 (find(E
->Src
->OutgoingEdges
, E
) == E
->Src
->OutgoingEdges
.end())) {
644 errs() << "ill-formed incoming edges\n";
649 for (const Edge
*E
: N
->OutgoingEdges
) {
650 if (!E
->Src
|| !E
->Dst
|| (E
->Src
!= N
) ||
651 (find(E
->Dst
->IncomingEdges
, E
) == E
->Dst
->IncomingEdges
.end())) {
652 errs() << "ill-formed outgoing edges\n";
657 const Function
&Fn
= N
->getFunction();
658 if (AMDGPU::isEntryFunctionCC(Fn
.getCallingConv())) {
659 if (N
->hasAnyIncomingEdges()) {
660 errs() << "Kernels cannot have incoming edges\n";
665 if (Fn
.isDeclaration()) {
666 errs() << "declarations shouldn't have nodes!\n";
670 auto [It
, Inserted
] = SeenFunctionNodes
.insert(&Fn
);
672 errs() << "one function has multiple nodes!\n";
677 if (ExpectedID
!= Nodes
.size()) {
678 errs() << "Node IDs out of sync!\n";
682 if (createNodesBitVector().size() != getNumNodes()) {
683 errs() << "nodes bit vector doesn't have the right size!\n";
687 // Check we respect the promise of Node::isKernel
688 BitVector BV
= createNodesBitVector();
689 for (const Node
*N
: nodes()) {
690 if (N
->isGraphEntryPoint())
691 N
->getDependencies(BV
);
694 // Ensure each function in the module has an associated node.
695 for (const auto &Fn
: M
) {
696 if (!Fn
.isDeclaration()) {
697 if (!SeenFunctionNodes
.contains(&Fn
)) {
698 errs() << "Fn has no associated node in the graph!\n";
705 errs() << "not all nodes are reachable through the graph's entry points!\n";
713 CostType
SplitGraph::calculateCost(const BitVector
&BV
) const {
715 for (unsigned NodeID
: BV
.set_bits())
716 Cost
+= getNode(NodeID
).getIndividualCost();
721 SplitGraph::getNode(DenseMap
<const GlobalValue
*, Node
*> &Cache
,
722 const GlobalValue
&GV
) {
723 auto &N
= Cache
[&GV
];
728 bool NonCopyable
= false;
729 if (const Function
*Fn
= dyn_cast
<Function
>(&GV
)) {
730 NonCopyable
= isNonCopyable(*Fn
);
731 Cost
= CostMap
.at(Fn
);
733 N
= new (NodesPool
.Allocate()) Node(Nodes
.size(), GV
, Cost
, NonCopyable
);
735 assert(&getNode(N
->getID()) == N
);
739 const SplitGraph::Edge
&SplitGraph::createEdge(Node
&Src
, Node
&Dst
,
741 const Edge
*E
= new (EdgesPool
.Allocate
<Edge
>(1)) Edge(&Src
, &Dst
, EK
);
742 Src
.OutgoingEdges
.push_back(E
);
743 Dst
.IncomingEdges
.push_back(E
);
747 //===----------------------------------------------------------------------===//
749 //===----------------------------------------------------------------------===//
751 /// Represents a module splitting proposal.
753 /// Proposals are made of N BitVectors, one for each partition, where each bit
754 /// set indicates that the node is present and should be copied inside that
757 /// Proposals have several metrics attached so they can be compared/sorted,
758 /// which allows the driver to try multiple strategies, resulting in multiple proposals
759 /// and choose the best one out of them.
760 class SplitProposal
{
762 SplitProposal(const SplitGraph
&SG
, unsigned MaxPartitions
) : SG(&SG
) {
763 Partitions
.resize(MaxPartitions
, {0, SG
.createNodesBitVector()});
766 void setName(StringRef NewName
) { Name
= NewName
; }
767 StringRef
getName() const { return Name
; }
769 const BitVector
&operator[](unsigned PID
) const {
770 return Partitions
[PID
].second
;
773 void add(unsigned PID
, const BitVector
&BV
) {
774 Partitions
[PID
].second
|= BV
;
778 void print(raw_ostream
&OS
) const;
779 LLVM_DUMP_METHOD
void dump() const { print(dbgs()); }
781 // Find the cheapest partition (lowest cost). In case of ties, always returns
782 // the highest partition number.
783 unsigned findCheapestPartition() const;
785 /// Calculate the CodeSize and Bottleneck scores.
786 void calculateScores();
789 void verifyCompleteness() const;
792 /// Only available after \ref calculateScores is called.
794 /// A positive number indicating the % of code duplication that this proposal
795 /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by
796 /// duplicating some functions across partitions.
798 /// Value is always rounded up to 3 decimal places.
800 /// A perfect score would be 0.0, and anything approaching 1.0 is very bad.
801 double getCodeSizeScore() const { return CodeSizeScore
; }
803 /// Only available after \ref calculateScores is called.
805 /// A number between [0, 1] which indicates how big of a bottleneck is
806 /// expected from the largest partition.
808 /// A score of 1.0 means the biggest partition is as big as the source module,
809 /// so build time will be equal to or greater than the build time of the
812 /// Value is always rounded up to 3 decimal places.
814 /// This is one of the metrics used to estimate this proposal's build time.
815 double getBottleneckScore() const { return BottleneckScore
; }
818 void updateScore(unsigned PID
) {
820 for (auto &[PCost
, Nodes
] : Partitions
) {
822 PCost
= SG
->calculateCost(Nodes
);
827 /// \see getCodeSizeScore
828 double CodeSizeScore
= 0.0;
829 /// \see getBottleneckScore
830 double BottleneckScore
= 0.0;
831 /// Aggregated cost of all partitions
832 CostType TotalCost
= 0;
834 const SplitGraph
*SG
= nullptr;
837 std::vector
<std::pair
<CostType
, BitVector
>> Partitions
;
840 void SplitProposal::print(raw_ostream
&OS
) const {
843 OS
<< "[proposal] " << Name
<< ", total cost:" << TotalCost
844 << ", code size score:" << format("%0.3f", CodeSizeScore
)
845 << ", bottleneck score:" << format("%0.3f", BottleneckScore
) << '\n';
846 for (const auto &[PID
, Part
] : enumerate(Partitions
)) {
847 const auto &[Cost
, NodeIDs
] = Part
;
848 OS
<< " - P" << PID
<< " nodes:" << NodeIDs
.count() << " cost: " << Cost
849 << '|' << formatRatioOf(Cost
, SG
->getModuleCost()) << "%\n";
853 unsigned SplitProposal::findCheapestPartition() const {
854 assert(!Partitions
.empty());
855 CostType CurCost
= std::numeric_limits
<CostType
>::max();
856 unsigned CurPID
= InvalidPID
;
857 for (const auto &[Idx
, Part
] : enumerate(Partitions
)) {
858 if (Part
.first
<= CurCost
) {
860 CurCost
= Part
.first
;
863 assert(CurPID
!= InvalidPID
);
867 void SplitProposal::calculateScores() {
868 if (Partitions
.empty())
872 CostType LargestPCost
= 0;
873 for (auto &[PCost
, Nodes
] : Partitions
) {
874 if (PCost
> LargestPCost
)
875 LargestPCost
= PCost
;
878 CostType ModuleCost
= SG
->getModuleCost();
879 CodeSizeScore
= double(TotalCost
) / ModuleCost
;
880 assert(CodeSizeScore
>= 0.0);
882 BottleneckScore
= double(LargestPCost
) / ModuleCost
;
884 CodeSizeScore
= std::ceil(CodeSizeScore
* 100.0) / 100.0;
885 BottleneckScore
= std::ceil(BottleneckScore
* 100.0) / 100.0;
889 void SplitProposal::verifyCompleteness() const {
890 if (Partitions
.empty())
893 BitVector Result
= Partitions
[0].second
;
894 for (const auto &P
: drop_begin(Partitions
))
896 assert(Result
.all() && "some nodes are missing from this proposal!");
900 //===-- RecursiveSearchStrategy -------------------------------------------===//
902 /// Partitioning algorithm.
904 /// This is a recursive search algorithm that can explore multiple possiblities.
906 /// When a cluster of nodes can go into more than one partition, and we haven't
907 /// reached maximum search depth, we recurse and explore both options and their
908 /// consequences. Both branches will yield a proposal, and the driver will grade
909 /// both and choose the best one.
911 /// If max depth is reached, we will use some heuristics to make a choice. Most
912 /// of the time we will just use the least-pressured (cheapest) partition, but
913 /// if a cluster is particularly big and there is a good amount of overlap with
914 /// an existing partition, we will choose that partition instead.
915 class RecursiveSearchSplitting
{
917 using SubmitProposalFn
= function_ref
<void(SplitProposal
)>;
919 RecursiveSearchSplitting(const SplitGraph
&SG
, unsigned NumParts
,
920 SubmitProposalFn SubmitProposal
);
925 struct WorkListEntry
{
926 WorkListEntry(const BitVector
&BV
) : Cluster(BV
) {}
928 unsigned NumNonEntryNodes
= 0;
929 CostType TotalCost
= 0;
930 CostType CostExcludingGraphEntryPoints
= 0;
934 /// Collects all graph entry points's clusters and sort them so the most
935 /// expensive clusters are viewed first. This will merge clusters together if
936 /// they share a non-copyable dependency.
937 void setupWorkList();
939 /// Recursive function that assigns the worklist item at \p Idx into a
940 /// partition of \p SP.
942 /// \p Depth is the current search depth. When this value is equal to
943 /// \ref MaxDepth, we can no longer recurse.
945 /// This function only recurses if there is more than one possible assignment,
946 /// otherwise it is iterative to avoid creating a call stack that is as big as
948 void pickPartition(unsigned Depth
, unsigned Idx
, SplitProposal SP
);
950 /// \return A pair: first element is the PID of the partition that has the
951 /// most similarities with \p Entry, or \ref InvalidPID if no partition was
952 /// found with at least one element in common. The second element is the
953 /// aggregated cost of all dependencies in common between \p Entry and that
955 std::pair
<unsigned, CostType
>
956 findMostSimilarPartition(const WorkListEntry
&Entry
, const SplitProposal
&SP
);
958 const SplitGraph
&SG
;
960 SubmitProposalFn SubmitProposal
;
962 // A Cluster is considered large when its cost, excluding entry points,
963 // exceeds this value.
964 CostType LargeClusterThreshold
= 0;
965 unsigned NumProposalsSubmitted
= 0;
966 SmallVector
<WorkListEntry
> WorkList
;
969 RecursiveSearchSplitting::RecursiveSearchSplitting(
970 const SplitGraph
&SG
, unsigned NumParts
, SubmitProposalFn SubmitProposal
)
971 : SG(SG
), NumParts(NumParts
), SubmitProposal(SubmitProposal
) {
972 // arbitrary max value as a safeguard. Anything above 10 will already be
973 // slow, this is just a max value to prevent extreme resource exhaustion or
974 // unbounded run time.
976 report_fatal_error("[amdgpu-split-module] search depth of " +
977 Twine(MaxDepth
) + " is too high!");
978 LargeClusterThreshold
=
979 (LargeFnFactor
!= 0.0)
980 ? CostType(((SG
.getModuleCost() / NumParts
) * LargeFnFactor
))
981 : std::numeric_limits
<CostType
>::max();
982 LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
983 << LargeClusterThreshold
<< "\n");
986 void RecursiveSearchSplitting::run() {
988 SplitModuleTimer
SMT("recursive_search_prepare", "preparing worklist");
993 SplitModuleTimer
SMT("recursive_search_pick", "partitioning");
994 SplitProposal
SP(SG
, NumParts
);
995 pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP
);
999 void RecursiveSearchSplitting::setupWorkList() {
1000 // e.g. if A and B are two worklist item, and they both call a non copyable
1001 // dependency C, this does:
1004 // => NodeEC will create a single group (A, B, C) and we create a new
1005 // WorkList entry for that group.
1007 EquivalenceClasses
<unsigned> NodeEC
;
1008 for (const SplitGraph::Node
*N
: SG
.nodes()) {
1009 if (!N
->isGraphEntryPoint())
1012 NodeEC
.insert(N
->getID());
1013 N
->visitAllDependencies([&](const SplitGraph::Node
&Dep
) {
1014 if (&Dep
!= N
&& Dep
.isNonCopyable())
1015 NodeEC
.unionSets(N
->getID(), Dep
.getID());
1019 for (auto I
= NodeEC
.begin(), E
= NodeEC
.end(); I
!= E
; ++I
) {
1023 BitVector Cluster
= SG
.createNodesBitVector();
1024 for (auto MI
= NodeEC
.member_begin(I
); MI
!= NodeEC
.member_end(); ++MI
) {
1025 const SplitGraph::Node
&N
= SG
.getNode(*MI
);
1026 if (N
.isGraphEntryPoint())
1027 N
.getDependencies(Cluster
);
1029 WorkList
.emplace_back(std::move(Cluster
));
1032 // Calculate costs and other useful information.
1033 for (WorkListEntry
&Entry
: WorkList
) {
1034 for (unsigned NodeID
: Entry
.Cluster
.set_bits()) {
1035 const SplitGraph::Node
&N
= SG
.getNode(NodeID
);
1036 const CostType Cost
= N
.getIndividualCost();
1038 Entry
.TotalCost
+= Cost
;
1039 if (!N
.isGraphEntryPoint()) {
1040 Entry
.CostExcludingGraphEntryPoints
+= Cost
;
1041 ++Entry
.NumNonEntryNodes
;
1046 stable_sort(WorkList
, [](const WorkListEntry
&A
, const WorkListEntry
&B
) {
1047 if (A
.TotalCost
!= B
.TotalCost
)
1048 return A
.TotalCost
> B
.TotalCost
;
1050 if (A
.CostExcludingGraphEntryPoints
!= B
.CostExcludingGraphEntryPoints
)
1051 return A
.CostExcludingGraphEntryPoints
> B
.CostExcludingGraphEntryPoints
;
1053 if (A
.NumNonEntryNodes
!= B
.NumNonEntryNodes
)
1054 return A
.NumNonEntryNodes
> B
.NumNonEntryNodes
;
1056 return A
.Cluster
.count() > B
.Cluster
.count();
1060 dbgs() << "[recursive search] worklist:\n";
1061 for (const auto &[Idx
, Entry
] : enumerate(WorkList
)) {
1062 dbgs() << " - [" << Idx
<< "]: ";
1063 for (unsigned NodeID
: Entry
.Cluster
.set_bits())
1064 dbgs() << NodeID
<< " ";
1065 dbgs() << "(total_cost:" << Entry
.TotalCost
1066 << ", cost_excl_entries:" << Entry
.CostExcludingGraphEntryPoints
1072 void RecursiveSearchSplitting::pickPartition(unsigned Depth
, unsigned Idx
,
1074 while (Idx
< WorkList
.size()) {
1075 // Step 1: Determine candidate PIDs.
1077 const WorkListEntry
&Entry
= WorkList
[Idx
];
1078 const BitVector
&Cluster
= Entry
.Cluster
;
1080 // Default option is to do load-balancing, AKA assign to least pressured
1082 const unsigned CheapestPID
= SP
.findCheapestPartition();
1083 assert(CheapestPID
!= InvalidPID
);
1085 // Explore assigning to the kernel that contains the most dependencies in
1087 const auto [MostSimilarPID
, SimilarDepsCost
] =
1088 findMostSimilarPartition(Entry
, SP
);
1090 // We can chose to explore only one path if we only have one valid path, or
1091 // if we reached maximum search depth and can no longer branch out.
1092 unsigned SinglePIDToTry
= InvalidPID
;
1093 if (MostSimilarPID
== InvalidPID
) // no similar PID found
1094 SinglePIDToTry
= CheapestPID
;
1095 else if (MostSimilarPID
== CheapestPID
) // both landed on the same PID
1096 SinglePIDToTry
= CheapestPID
;
1097 else if (Depth
>= MaxDepth
) {
1098 // We have to choose one path. Use a heuristic to guess which one will be
1099 // more appropriate.
1100 if (Entry
.CostExcludingGraphEntryPoints
> LargeClusterThreshold
) {
1101 // Check if the amount of code in common makes it worth it.
1102 assert(SimilarDepsCost
&& Entry
.CostExcludingGraphEntryPoints
);
1103 const double Ratio
=
1104 SimilarDepsCost
/ Entry
.CostExcludingGraphEntryPoints
;
1105 assert(Ratio
>= 0.0 && Ratio
<= 1.0);
1106 if (LargeFnOverlapForMerge
> Ratio
) {
1107 // For debug, just print "L", so we'll see "L3=P3" for instance, which
1108 // will mean we reached max depth and chose P3 based on this
1110 LLVM_DEBUG(dbgs() << 'L');
1111 SinglePIDToTry
= MostSimilarPID
;
1114 SinglePIDToTry
= CheapestPID
;
1117 // Step 2: Explore candidates.
1119 // When we only explore one possible path, and thus branch depth doesn't
1120 // increase, do not recurse, iterate instead.
1121 if (SinglePIDToTry
!= InvalidPID
) {
1122 LLVM_DEBUG(dbgs() << Idx
<< "=P" << SinglePIDToTry
<< ' ');
1123 // Only one path to explore, don't clone SP, don't increase depth.
1124 SP
.add(SinglePIDToTry
, Cluster
);
1129 assert(MostSimilarPID
!= InvalidPID
);
1131 // We explore multiple paths: recurse at increased depth, then stop this
1134 LLVM_DEBUG(dbgs() << '\n');
1136 // lb = load balancing = put in cheapest partition
1138 SplitProposal BranchSP
= SP
;
1139 LLVM_DEBUG(dbgs().indent(Depth
)
1140 << " [lb] " << Idx
<< "=P" << CheapestPID
<< "? ");
1141 BranchSP
.add(CheapestPID
, Cluster
);
1142 pickPartition(Depth
+ 1, Idx
+ 1, BranchSP
);
1145 // ms = most similar = put in partition with the most in common
1147 SplitProposal BranchSP
= SP
;
1148 LLVM_DEBUG(dbgs().indent(Depth
)
1149 << " [ms] " << Idx
<< "=P" << MostSimilarPID
<< "? ");
1150 BranchSP
.add(MostSimilarPID
, Cluster
);
1151 pickPartition(Depth
+ 1, Idx
+ 1, BranchSP
);
1157 // Step 3: If we assigned all WorkList items, submit the proposal.
1159 assert(Idx
== WorkList
.size());
1160 assert(NumProposalsSubmitted
<= (2u << MaxDepth
) &&
1161 "Search got out of bounds?");
1162 SP
.setName("recursive_search (depth=" + std::to_string(Depth
) + ") #" +
1163 std::to_string(NumProposalsSubmitted
++));
1164 LLVM_DEBUG(dbgs() << '\n');
1168 std::pair
<unsigned, CostType
>
1169 RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry
&Entry
,
1170 const SplitProposal
&SP
) {
1171 if (!Entry
.NumNonEntryNodes
)
1172 return {InvalidPID
, 0};
1174 // We take the partition that is the most similar using Cost as a metric.
1175 // So we take the set of nodes in common, compute their aggregated cost, and
1176 // pick the partition with the highest cost in common.
1177 unsigned ChosenPID
= InvalidPID
;
1178 CostType ChosenCost
= 0;
1179 for (unsigned PID
= 0; PID
< NumParts
; ++PID
) {
1180 BitVector BV
= SP
[PID
];
1181 BV
&= Entry
.Cluster
; // FIXME: & doesn't work between BVs?!
1186 const CostType Cost
= SG
.calculateCost(BV
);
1188 if (ChosenPID
== InvalidPID
|| ChosenCost
< Cost
||
1189 (ChosenCost
== Cost
&& PID
> ChosenPID
)) {
1195 return {ChosenPID
, ChosenCost
};
1198 //===----------------------------------------------------------------------===//
1199 // DOTGraph Printing Support
1200 //===----------------------------------------------------------------------===//
1202 const SplitGraph::Node
*mapEdgeToDst(const SplitGraph::Edge
*E
) {
1206 using SplitGraphEdgeDstIterator
=
1207 mapped_iterator
<SplitGraph::edges_iterator
, decltype(&mapEdgeToDst
)>;
1211 template <> struct GraphTraits
<SplitGraph
> {
1212 using NodeRef
= const SplitGraph::Node
*;
1213 using nodes_iterator
= SplitGraph::nodes_iterator
;
1214 using ChildIteratorType
= SplitGraphEdgeDstIterator
;
1216 using EdgeRef
= const SplitGraph::Edge
*;
1217 using ChildEdgeIteratorType
= SplitGraph::edges_iterator
;
1219 static NodeRef
getEntryNode(NodeRef N
) { return N
; }
1221 static ChildIteratorType
child_begin(NodeRef Ref
) {
1222 return {Ref
->outgoing_edges().begin(), mapEdgeToDst
};
1224 static ChildIteratorType
child_end(NodeRef Ref
) {
1225 return {Ref
->outgoing_edges().end(), mapEdgeToDst
};
1228 static nodes_iterator
nodes_begin(const SplitGraph
&G
) {
1229 return G
.nodes().begin();
1231 static nodes_iterator
nodes_end(const SplitGraph
&G
) {
1232 return G
.nodes().end();
1236 template <> struct DOTGraphTraits
<SplitGraph
> : public DefaultDOTGraphTraits
{
1237 DOTGraphTraits(bool IsSimple
= false) : DefaultDOTGraphTraits(IsSimple
) {}
1239 static std::string
getGraphName(const SplitGraph
&SG
) {
1240 return SG
.getModule().getName().str();
1243 std::string
getNodeLabel(const SplitGraph::Node
*N
, const SplitGraph
&SG
) {
1244 return N
->getName().str();
1247 static std::string
getNodeDescription(const SplitGraph::Node
*N
,
1248 const SplitGraph
&SG
) {
1250 if (N
->isEntryFunctionCC())
1251 Result
+= "entry-fn-cc ";
1252 if (N
->isNonCopyable())
1253 Result
+= "non-copyable ";
1254 Result
+= "cost:" + std::to_string(N
->getIndividualCost());
1258 static std::string
getNodeAttributes(const SplitGraph::Node
*N
,
1259 const SplitGraph
&SG
) {
1260 return N
->hasAnyIncomingEdges() ? "" : "color=\"red\"";
1263 static std::string
getEdgeAttributes(const SplitGraph::Node
*N
,
1264 SplitGraphEdgeDstIterator EI
,
1265 const SplitGraph
&SG
) {
1267 switch ((*EI
.getCurrent())->Kind
) {
1268 case SplitGraph::EdgeKind::DirectCall
:
1270 case SplitGraph::EdgeKind::IndirectCall
:
1271 return "style=\"dashed\"";
1273 llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
1277 //===----------------------------------------------------------------------===//
1279 //===----------------------------------------------------------------------===//
1283 // If we didn't externalize GVs, then local GVs need to be conservatively
1284 // imported into every module (including their initializers), and then cleaned
1286 static bool needsConservativeImport(const GlobalValue
*GV
) {
1287 if (const auto *Var
= dyn_cast
<GlobalVariable
>(GV
))
1288 return Var
->hasLocalLinkage();
1289 return isa
<GlobalAlias
>(GV
);
1292 /// Prints a summary of the partition \p N, represented by module \p M, to \p
1294 static void printPartitionSummary(raw_ostream
&OS
, unsigned N
, const Module
&M
,
1295 unsigned PartCost
, unsigned ModuleCost
) {
1296 OS
<< "*** Partition P" << N
<< " ***\n";
1298 for (const auto &Fn
: M
) {
1299 if (!Fn
.isDeclaration())
1300 OS
<< " - [function] " << Fn
.getName() << "\n";
1303 for (const auto &GV
: M
.globals()) {
1304 if (GV
.hasInitializer())
1305 OS
<< " - [global] " << GV
.getName() << "\n";
1308 OS
<< "Partition contains " << formatRatioOf(PartCost
, ModuleCost
)
1309 << "% of the source\n";
1312 static void evaluateProposal(SplitProposal
&Best
, SplitProposal New
) {
1313 SplitModuleTimer
SMT("proposal_evaluation", "proposal ranking algorithm");
1316 New
.verifyCompleteness();
1317 if (DebugProposalSearch
)
1321 const double CurBScore
= Best
.getBottleneckScore();
1322 const double CurCSScore
= Best
.getCodeSizeScore();
1323 const double NewBScore
= New
.getBottleneckScore();
1324 const double NewCSScore
= New
.getCodeSizeScore();
1326 // TODO: Improve this
1327 // We can probably lower the precision of the comparison at first
1329 // - (Current): BScore: 0.489 CSCore 1.105
1330 // - (New): BScore: 0.475 CSCore 1.305
1331 // Currently we'd choose the new one because the bottleneck score is
1332 // lower, but the new one duplicates more code. It may be worth it to
1333 // discard the new proposal as the impact on build time is negligible.
1336 bool IsBest
= false;
1337 if (NewBScore
< CurBScore
)
1339 else if (NewBScore
== CurBScore
)
1340 IsBest
= (NewCSScore
< CurCSScore
); // Use code size as tie breaker.
1343 Best
= std::move(New
);
1345 LLVM_DEBUG(if (DebugProposalSearch
) {
1347 dbgs() << "[search] new best proposal!\n";
1349 dbgs() << "[search] discarding - not profitable\n";
1353 /// Trivial helper to create an identical copy of \p M.
1354 static std::unique_ptr
<Module
> cloneAll(const Module
&M
) {
1355 ValueToValueMapTy VMap
;
1356 return CloneModule(M
, VMap
, [&](const GlobalValue
*GV
) { return true; });
1359 /// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
1360 static void writeDOTGraph(const SplitGraph
&SG
) {
1361 if (ModuleDotCfgOutput
.empty())
1365 raw_fd_ostream
OS(ModuleDotCfgOutput
, EC
);
1367 errs() << "[" DEBUG_TYPE
"]: cannot open '" << ModuleDotCfgOutput
1368 << "' - DOTGraph will not be printed\n";
1370 WriteGraph(OS
, SG
, /*ShortName=*/false,
1371 /*Title=*/SG
.getModule().getName());
1374 static void splitAMDGPUModule(
1375 GetTTIFn GetTTI
, Module
&M
, unsigned NumParts
,
1376 function_ref
<void(std::unique_ptr
<Module
> MPart
)> ModuleCallback
) {
1379 // Externalize functions whose address are taken.
1381 // This is needed because partitioning is purely based on calls, but sometimes
1382 // a kernel/function may just look at the address of another local function
1383 // and not do anything (no calls). After partitioning, that local function may
1384 // end up in a different module (so it's just a declaration in the module
1385 // where its address is taken), which emits a "undefined hidden symbol" linker
1388 // Additionally, it guides partitioning to not duplicate this function if it's
1389 // called directly at some point.
1391 // TODO: Could we be smarter about this ? This makes all functions whose
1392 // addresses are taken non-copyable. We should probably model this type of
1393 // constraint in the graph and use it to guide splitting, instead of
1394 // externalizing like this. Maybe non-copyable should really mean "keep one
1395 // visible copy, then internalize all other copies" for some functions?
1396 if (!NoExternalizeOnAddrTaken
) {
1397 for (auto &Fn
: M
) {
1398 // TODO: Should aliases count? Probably not but they're so rare I'm not
1399 // sure it's worth fixing.
1400 if (Fn
.hasLocalLinkage() && Fn
.hasAddressTaken()) {
1401 LLVM_DEBUG(dbgs() << "[externalize] "; Fn
.printAsOperand(dbgs());
1402 dbgs() << " because its address is taken\n");
1408 // Externalize local GVs, which avoids duplicating their initializers, which
1409 // in turns helps keep code size in check.
1410 if (!NoExternalizeGlobals
) {
1411 for (auto &GV
: M
.globals()) {
1412 if (GV
.hasLocalLinkage())
1413 LLVM_DEBUG(dbgs() << "[externalize] GV " << GV
.getName() << '\n');
1418 // Start by calculating the cost of every function in the module, as well as
1419 // the module's overall cost.
1420 FunctionsCostMap FnCosts
;
1421 const CostType ModuleCost
= calculateFunctionCosts(GetTTI
, M
, FnCosts
);
1423 // Build the SplitGraph, which represents the module's functions and models
1424 // their dependencies accurately.
1425 SplitGraph
SG(M
, FnCosts
, ModuleCost
);
1431 << "[!] no nodes in graph, input is empty - no splitting possible\n");
1432 ModuleCallback(cloneAll(M
));
1437 dbgs() << "[graph] nodes:\n";
1438 for (const SplitGraph::Node
*N
: SG
.nodes()) {
1439 dbgs() << " - [" << N
->getID() << "]: " << N
->getName() << " "
1440 << (N
->isGraphEntryPoint() ? "(entry)" : "") << " "
1441 << (N
->isNonCopyable() ? "(noncopyable)" : "") << "\n";
1447 LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
1449 std::optional
<SplitProposal
> Proposal
;
1450 const auto EvaluateProposal
= [&](SplitProposal SP
) {
1451 SP
.calculateScores();
1453 Proposal
= std::move(SP
);
1455 evaluateProposal(*Proposal
, std::move(SP
));
1458 // TODO: It would be very easy to create new strategies by just adding a base
1459 // class to RecursiveSearchSplitting and abstracting it away.
1460 RecursiveSearchSplitting(SG
, NumParts
, EvaluateProposal
).run();
1461 LLVM_DEBUG(if (Proposal
) dbgs() << "[search done] selected proposal: "
1462 << Proposal
->getName() << "\n";);
1465 LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
1466 ModuleCallback(cloneAll(M
));
1470 LLVM_DEBUG(Proposal
->print(dbgs()););
1472 std::optional
<raw_fd_ostream
> SummariesOS
;
1473 if (!PartitionSummariesOutput
.empty()) {
1475 SummariesOS
.emplace(PartitionSummariesOutput
, EC
);
1477 errs() << "[" DEBUG_TYPE
"]: cannot open '" << PartitionSummariesOutput
1478 << "' - Partition summaries will not be printed\n";
1481 for (unsigned PID
= 0; PID
< NumParts
; ++PID
) {
1482 SplitModuleTimer
SMT2("modules_creation",
1483 "creating modules for each partition");
1484 LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
1486 DenseSet
<const Function
*> FnsInPart
;
1487 for (unsigned NodeID
: (*Proposal
)[PID
].set_bits())
1488 FnsInPart
.insert(&SG
.getNode(NodeID
).getFunction());
1490 ValueToValueMapTy VMap
;
1491 CostType PartCost
= 0;
1492 std::unique_ptr
<Module
> MPart(
1493 CloneModule(M
, VMap
, [&](const GlobalValue
*GV
) {
1494 // Functions go in their assigned partition.
1495 if (const auto *Fn
= dyn_cast
<Function
>(GV
)) {
1496 if (FnsInPart
.contains(Fn
)) {
1497 PartCost
+= SG
.getCost(*Fn
);
1503 // Everything else goes in the first partition.
1504 return needsConservativeImport(GV
) || PID
== 0;
1507 // FIXME: Aliases aren't seen often, and their handling isn't perfect so
1508 // bugs are possible.
1510 // Clean-up conservatively imported GVs without any users.
1511 for (auto &GV
: make_early_inc_range(MPart
->global_values())) {
1512 if (needsConservativeImport(&GV
) && GV
.use_empty())
1513 GV
.eraseFromParent();
1517 printPartitionSummary(*SummariesOS
, PID
, *MPart
, PartCost
, ModuleCost
);
1520 printPartitionSummary(dbgs(), PID
, *MPart
, PartCost
, ModuleCost
));
1522 ModuleCallback(std::move(MPart
));
1527 PreservedAnalyses
AMDGPUSplitModulePass::run(Module
&M
,
1528 ModuleAnalysisManager
&MAM
) {
1529 SplitModuleTimer
SMT(
1530 "total", "total pass runtime (incl. potentially waiting for lockfile)");
1532 FunctionAnalysisManager
&FAM
=
1533 MAM
.getResult
<FunctionAnalysisManagerModuleProxy
>(M
).getManager();
1534 const auto TTIGetter
= [&FAM
](Function
&F
) -> const TargetTransformInfo
& {
1535 return FAM
.getResult
<TargetIRAnalysis
>(F
);
1541 SmallString
<128> LockFilePath
;
1542 sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath
);
1543 sys::path::append(LockFilePath
, "amdgpu-split-module-debug");
1544 LLVM_DEBUG(dbgs() << DEBUG_TYPE
" using lockfile '" << LockFilePath
1548 llvm::LockFileManager
Locked(LockFilePath
.str());
1550 case LockFileManager::LFS_Error
:
1552 dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
1553 "output may be mangled by other processes\n");
1554 Locked
.unsafeRemoveLockFile();
1556 case LockFileManager::LFS_Owned
:
1558 case LockFileManager::LFS_Shared
: {
1559 switch (Locked
.waitForUnlock()) {
1560 case LockFileManager::Res_Success
:
1562 case LockFileManager::Res_OwnerDied
:
1563 continue; // try again to get the lock.
1564 case LockFileManager::Res_Timeout
:
1567 << "[amdgpu-split-module] unable to acquire lockfile, debug "
1568 "output may be mangled by other processes\n");
1569 Locked
.unsafeRemoveLockFile();
1576 splitAMDGPUModule(TTIGetter
, M
, N
, ModuleCallback
);
1584 splitAMDGPUModule(TTIGetter
, M
, N
, ModuleCallback
);
1586 // We can change linkage/visibilities in the input, consider that nothing is
1587 // preserved just to be safe. This pass runs last anyway.
1588 return PreservedAnalyses::none();