1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
12 //===----------------------------------------------------------------------===//
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
66 #define DEBUG_TYPE "partial-inlining"
68 STATISTIC(NumPartialInlined
,
69 "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined
, "Number of times functions with "
71 "cold outlined regions were partially "
72 "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound
,
74 "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined
,
76 "Number of cold single entry/exit regions outlined.");
78 // Command line option to disable partial-inlining. The default is false:
80 DisablePartialInlining("disable-partial-inlining", cl::init(false),
81 cl::Hidden
, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
84 static cl::opt
<bool> DisableMultiRegionPartialInline(
85 "disable-mr-partial-inlining", cl::init(false), cl::Hidden
,
86 cl::desc("Disable multi-region partial inlining"));
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
91 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden
,
92 cl::desc("Force outline regions with live exits"));
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
97 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden
,
98 cl::desc("Mark outline function calls with ColdCC"));
100 // This is an option used by testing:
101 static cl::opt
<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
102 cl::init(false), cl::ZeroOrMore
,
104 cl::desc("Skip Cost Analysis"));
105 // Used to determine if a cold region is worth outlining based on
106 // its inlining cost compared to the original function. Default is set at 10%.
107 // ie. if the cold region reduces the inlining cost of the original function by
109 static cl::opt
<float> MinRegionSizeRatio(
110 "min-region-size-ratio", cl::init(0.1), cl::Hidden
,
111 cl::desc("Minimum ratio comparing relative sizes of each "
112 "outline candidate and original function"));
113 // Used to tune the minimum number of execution counts needed in the predecessor
114 // block to the cold edge. ie. confidence interval.
115 static cl::opt
<unsigned>
116 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden
,
117 cl::desc("Minimum block executions to consider "
118 "its BranchProbabilityInfo valid"));
119 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
120 // if the branch probability is 10% or less, then it is deemed as 'cold'.
121 static cl::opt
<float> ColdBranchRatio(
122 "cold-branch-ratio", cl::init(0.1), cl::Hidden
,
123 cl::desc("Minimum BranchProbability to consider a region cold."));
125 static cl::opt
<unsigned> MaxNumInlineBlocks(
126 "max-num-inline-blocks", cl::init(5), cl::Hidden
,
127 cl::desc("Max number of blocks to be partially inlined"));
129 // Command line option to set the maximum number of partial inlining allowed
130 // for the module. The default value of -1 means no limit.
131 static cl::opt
<int> MaxNumPartialInlining(
132 "max-partial-inlining", cl::init(-1), cl::Hidden
, cl::ZeroOrMore
,
133 cl::desc("Max number of partial inlining. The default is unlimited"));
135 // Used only when PGO or user annotated branch data is absent. It is
136 // the least value that is used to weigh the outline region. If BFI
137 // produces larger value, the BFI value will be used.
139 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
140 cl::Hidden
, cl::ZeroOrMore
,
141 cl::desc("Relative frequency of outline region to "
144 static cl::opt
<unsigned> ExtraOutliningPenalty(
145 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden
,
146 cl::desc("A debug option to add additional penalty to the computed one."));
150 struct FunctionOutliningInfo
{
151 FunctionOutliningInfo() = default;
153 // Returns the number of blocks to be inlined including all blocks
154 // in Entries and one return block.
155 unsigned getNumInlinedBlocks() const { return Entries
.size() + 1; }
157 // A set of blocks including the function entry that guard
158 // the region to be outlined.
159 SmallVector
<BasicBlock
*, 4> Entries
;
161 // The return block that is not included in the outlined region.
162 BasicBlock
*ReturnBlock
= nullptr;
164 // The dominating block of the region to be outlined.
165 BasicBlock
*NonReturnBlock
= nullptr;
167 // The set of blocks in Entries that that are predecessors to ReturnBlock
168 SmallVector
<BasicBlock
*, 4> ReturnBlockPreds
;
171 struct FunctionOutliningMultiRegionInfo
{
172 FunctionOutliningMultiRegionInfo()
175 // Container for outline regions
176 struct OutlineRegionInfo
{
177 OutlineRegionInfo(ArrayRef
<BasicBlock
*> Region
,
178 BasicBlock
*EntryBlock
, BasicBlock
*ExitBlock
,
179 BasicBlock
*ReturnBlock
)
180 : Region(Region
.begin(), Region
.end()), EntryBlock(EntryBlock
),
181 ExitBlock(ExitBlock
), ReturnBlock(ReturnBlock
) {}
182 SmallVector
<BasicBlock
*, 8> Region
;
183 BasicBlock
*EntryBlock
;
184 BasicBlock
*ExitBlock
;
185 BasicBlock
*ReturnBlock
;
188 SmallVector
<OutlineRegionInfo
, 4> ORI
;
191 struct PartialInlinerImpl
{
194 function_ref
<AssumptionCache
&(Function
&)> GetAC
,
195 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
196 function_ref
<TargetTransformInfo
&(Function
&)> GTTI
,
197 function_ref
<const TargetLibraryInfo
&(Function
&)> GTLI
,
198 ProfileSummaryInfo
&ProfSI
,
199 function_ref
<BlockFrequencyInfo
&(Function
&)> GBFI
= nullptr)
200 : GetAssumptionCache(GetAC
), LookupAssumptionCache(LookupAC
),
201 GetTTI(GTTI
), GetBFI(GBFI
), GetTLI(GTLI
), PSI(ProfSI
) {}
204 // Main part of the transformation that calls helper functions to find
205 // outlining candidates, clone & outline the function, and attempt to
206 // partially inline the resulting function. Returns true if
207 // inlining was successful, false otherwise. Also returns the outline
208 // function (only if we partially inlined early returns) as there is a
209 // possibility to further "peel" early return statements that were left in the
210 // outline function due to code size.
211 std::pair
<bool, Function
*> unswitchFunction(Function
&F
);
213 // This class speculatively clones the function to be partial inlined.
214 // At the end of partial inlining, the remaining callsites to the cloned
215 // function that are not partially inlined will be fixed up to reference
216 // the original function, and the cloned function will be erased.
217 struct FunctionCloner
{
218 // Two constructors, one for single region outlining, the other for
219 // multi-region outlining.
220 FunctionCloner(Function
*F
, FunctionOutliningInfo
*OI
,
221 OptimizationRemarkEmitter
&ORE
,
222 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
223 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
);
224 FunctionCloner(Function
*F
, FunctionOutliningMultiRegionInfo
*OMRI
,
225 OptimizationRemarkEmitter
&ORE
,
226 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
227 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
);
231 // Prepare for function outlining: making sure there is only
232 // one incoming edge from the extracted/outlined region to
234 void normalizeReturnBlock() const;
236 // Do function outlining for cold regions.
237 bool doMultiRegionFunctionOutlining();
238 // Do function outlining for region after early return block(s).
239 // NOTE: For vararg functions that do the vararg handling in the outlined
240 // function, we temporarily generate IR that does not properly
241 // forward varargs to the outlined function. Calling InlineFunction
242 // will update calls to the outlined functions to properly forward
244 Function
*doSingleRegionFunctionOutlining();
246 Function
*OrigFunc
= nullptr;
247 Function
*ClonedFunc
= nullptr;
249 typedef std::pair
<Function
*, BasicBlock
*> FuncBodyCallerPair
;
250 // Keep track of Outlined Functions and the basic block they're called from.
251 SmallVector
<FuncBodyCallerPair
, 4> OutlinedFunctions
;
253 // ClonedFunc is inlined in one of its callers after function
255 bool IsFunctionInlined
= false;
256 // The cost of the region to be outlined.
257 InstructionCost OutlinedRegionCost
= 0;
258 // ClonedOI is specific to outlining non-early return blocks.
259 std::unique_ptr
<FunctionOutliningInfo
> ClonedOI
= nullptr;
260 // ClonedOMRI is specific to outlining cold regions.
261 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> ClonedOMRI
= nullptr;
262 std::unique_ptr
<BlockFrequencyInfo
> ClonedFuncBFI
= nullptr;
263 OptimizationRemarkEmitter
&ORE
;
264 function_ref
<AssumptionCache
*(Function
&)> LookupAC
;
265 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
;
269 int NumPartialInlining
= 0;
270 function_ref
<AssumptionCache
&(Function
&)> GetAssumptionCache
;
271 function_ref
<AssumptionCache
*(Function
&)> LookupAssumptionCache
;
272 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
;
273 function_ref
<BlockFrequencyInfo
&(Function
&)> GetBFI
;
274 function_ref
<const TargetLibraryInfo
&(Function
&)> GetTLI
;
275 ProfileSummaryInfo
&PSI
;
277 // Return the frequency of the OutliningBB relative to F's entry point.
278 // The result is no larger than 1 and is represented using BP.
279 // (Note that the outlined region's 'head' block can only have incoming
280 // edges from the guarding entry blocks).
282 getOutliningCallBBRelativeFreq(FunctionCloner
&Cloner
) const;
284 // Return true if the callee of CB should be partially inlined with
286 bool shouldPartialInline(CallBase
&CB
, FunctionCloner
&Cloner
,
287 BlockFrequency WeightedOutliningRcost
,
288 OptimizationRemarkEmitter
&ORE
) const;
290 // Try to inline DuplicateFunction (cloned from F with call to
291 // the OutlinedFunction into its callers. Return true
292 // if there is any successful inlining.
293 bool tryPartialInline(FunctionCloner
&Cloner
);
295 // Compute the mapping from use site of DuplicationFunction to the enclosing
296 // BB's profile count.
298 computeCallsiteToProfCountMap(Function
*DuplicateFunction
,
299 DenseMap
<User
*, uint64_t> &SiteCountMap
) const;
301 bool isLimitReached() const {
302 return (MaxNumPartialInlining
!= -1 &&
303 NumPartialInlining
>= MaxNumPartialInlining
);
306 static CallBase
*getSupportedCallBase(User
*U
) {
307 if (isa
<CallInst
>(U
) || isa
<InvokeInst
>(U
))
308 return cast
<CallBase
>(U
);
309 llvm_unreachable("All uses must be calls");
313 static CallBase
*getOneCallSiteTo(Function
&F
) {
314 User
*User
= *F
.user_begin();
315 return getSupportedCallBase(User
);
318 std::tuple
<DebugLoc
, BasicBlock
*> getOneDebugLoc(Function
&F
) const {
319 CallBase
*CB
= getOneCallSiteTo(F
);
320 DebugLoc DLoc
= CB
->getDebugLoc();
321 BasicBlock
*Block
= CB
->getParent();
322 return std::make_tuple(DLoc
, Block
);
325 // Returns the costs associated with function outlining:
326 // - The first value is the non-weighted runtime cost for making the call
327 // to the outlined function, including the additional setup cost in the
328 // outlined function itself;
329 // - The second value is the estimated size of the new call sequence in
330 // basic block Cloner.OutliningCallBB;
331 std::tuple
<InstructionCost
, InstructionCost
>
332 computeOutliningCosts(FunctionCloner
&Cloner
) const;
334 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
335 // approximate both the size and runtime cost (Note that in the current
336 // inline cost analysis, there is no clear distinction there either).
337 static InstructionCost
computeBBInlineCost(BasicBlock
*BB
,
338 TargetTransformInfo
*TTI
);
340 std::unique_ptr
<FunctionOutliningInfo
>
341 computeOutliningInfo(Function
&F
) const;
343 std::unique_ptr
<FunctionOutliningMultiRegionInfo
>
344 computeOutliningColdRegionsInfo(Function
&F
,
345 OptimizationRemarkEmitter
&ORE
) const;
348 struct PartialInlinerLegacyPass
: public ModulePass
{
349 static char ID
; // Pass identification, replacement for typeid
351 PartialInlinerLegacyPass() : ModulePass(ID
) {
352 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
355 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
356 AU
.addRequired
<AssumptionCacheTracker
>();
357 AU
.addRequired
<ProfileSummaryInfoWrapperPass
>();
358 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
359 AU
.addRequired
<TargetLibraryInfoWrapperPass
>();
362 bool runOnModule(Module
&M
) override
{
366 AssumptionCacheTracker
*ACT
= &getAnalysis
<AssumptionCacheTracker
>();
367 TargetTransformInfoWrapperPass
*TTIWP
=
368 &getAnalysis
<TargetTransformInfoWrapperPass
>();
369 ProfileSummaryInfo
&PSI
=
370 getAnalysis
<ProfileSummaryInfoWrapperPass
>().getPSI();
372 auto GetAssumptionCache
= [&ACT
](Function
&F
) -> AssumptionCache
& {
373 return ACT
->getAssumptionCache(F
);
376 auto LookupAssumptionCache
= [ACT
](Function
&F
) -> AssumptionCache
* {
377 return ACT
->lookupAssumptionCache(F
);
380 auto GetTTI
= [&TTIWP
](Function
&F
) -> TargetTransformInfo
& {
381 return TTIWP
->getTTI(F
);
384 auto GetTLI
= [this](Function
&F
) -> TargetLibraryInfo
& {
385 return this->getAnalysis
<TargetLibraryInfoWrapperPass
>().getTLI(F
);
388 return PartialInlinerImpl(GetAssumptionCache
, LookupAssumptionCache
, GetTTI
,
394 } // end anonymous namespace
396 std::unique_ptr
<FunctionOutliningMultiRegionInfo
>
397 PartialInlinerImpl::computeOutliningColdRegionsInfo(
398 Function
&F
, OptimizationRemarkEmitter
&ORE
) const {
399 BasicBlock
*EntryBlock
= &F
.front();
403 BranchProbabilityInfo
BPI(F
, LI
);
404 std::unique_ptr
<BlockFrequencyInfo
> ScopedBFI
;
405 BlockFrequencyInfo
*BFI
;
407 ScopedBFI
.reset(new BlockFrequencyInfo(F
, BPI
, LI
));
408 BFI
= ScopedBFI
.get();
412 // Return if we don't have profiling information.
413 if (!PSI
.hasInstrumentationProfile())
414 return std::unique_ptr
<FunctionOutliningMultiRegionInfo
>();
416 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> OutliningInfo
=
417 std::make_unique
<FunctionOutliningMultiRegionInfo
>();
420 [&ORE
](SmallVectorImpl
<BasicBlock
*> &BlockList
) -> BasicBlock
* {
421 BasicBlock
*ExitBlock
= nullptr;
422 for (auto *Block
: BlockList
) {
423 for (BasicBlock
*Succ
: successors(Block
)) {
424 if (!is_contained(BlockList
, Succ
)) {
427 return OptimizationRemarkMissed(DEBUG_TYPE
, "MultiExitRegion",
429 << "Region dominated by "
430 << ore::NV("Block", BlockList
.front()->getName())
431 << " has more than one region exit edge.";
443 auto BBProfileCount
= [BFI
](BasicBlock
*BB
) {
444 return BFI
->getBlockProfileCount(BB
)
445 ? BFI
->getBlockProfileCount(BB
).getValue()
449 // Use the same computeBBInlineCost function to compute the cost savings of
450 // the outlining the candidate region.
451 TargetTransformInfo
*FTTI
= &GetTTI(F
);
452 InstructionCost OverallFunctionCost
= 0;
454 OverallFunctionCost
+= computeBBInlineCost(&BB
, FTTI
);
456 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
459 InstructionCost MinOutlineRegionCost
= OverallFunctionCost
.map(
460 [&](auto Cost
) { return Cost
* MinRegionSizeRatio
; });
462 BranchProbability
MinBranchProbability(
463 static_cast<int>(ColdBranchRatio
* MinBlockCounterExecution
),
464 MinBlockCounterExecution
);
465 bool ColdCandidateFound
= false;
466 BasicBlock
*CurrEntry
= EntryBlock
;
467 std::vector
<BasicBlock
*> DFS
;
468 DenseMap
<BasicBlock
*, bool> VisitedMap
;
469 DFS
.push_back(CurrEntry
);
470 VisitedMap
[CurrEntry
] = true;
472 // Use Depth First Search on the basic blocks to find CFG edges that are
474 // Cold regions considered must also have its inline cost compared to the
475 // overall inline cost of the original function. The region is outlined only
476 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
478 while (!DFS
.empty()) {
479 auto *ThisBB
= DFS
.back();
481 // Only consider regions with predecessor blocks that are considered
482 // not-cold (default: part of the top 99.99% of all block counters)
483 // AND greater than our minimum block execution count (default: 100).
484 if (PSI
.isColdBlock(ThisBB
, BFI
) ||
485 BBProfileCount(ThisBB
) < MinBlockCounterExecution
)
487 for (auto SI
= succ_begin(ThisBB
); SI
!= succ_end(ThisBB
); ++SI
) {
490 VisitedMap
[*SI
] = true;
492 // If branch isn't cold, we skip to the next one.
493 BranchProbability SuccProb
= BPI
.getEdgeProbability(ThisBB
, *SI
);
494 if (SuccProb
> MinBranchProbability
)
497 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB
->getName() << "->"
499 << "\nBranch Probability = " << SuccProb
<< "\n";);
501 SmallVector
<BasicBlock
*, 8> DominateVector
;
502 DT
.getDescendants(*SI
, DominateVector
);
503 assert(!DominateVector
.empty() &&
504 "SI should be reachable and have at least itself as descendant");
506 // We can only outline single entry regions (for now).
507 if (!DominateVector
.front()->hasNPredecessors(1)) {
508 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI
->getName()
509 << " doesn't have a single predecessor in the "
510 "dominator tree\n";);
514 BasicBlock
*ExitBlock
= nullptr;
515 // We can only outline single exit regions (for now).
516 if (!(ExitBlock
= IsSingleExit(DominateVector
))) {
517 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI
->getName()
518 << " doesn't have a unique successor\n";);
522 InstructionCost OutlineRegionCost
= 0;
523 for (auto *BB
: DominateVector
)
524 OutlineRegionCost
+= computeBBInlineCost(BB
, &GetTTI(*BB
->getParent()));
526 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
529 if (!SkipCostAnalysis
&& OutlineRegionCost
< MinOutlineRegionCost
) {
531 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "TooCostly",
533 << ore::NV("Callee", &F
)
534 << " inline cost-savings smaller than "
535 << ore::NV("Cost", MinOutlineRegionCost
);
538 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
539 << MinOutlineRegionCost
<< "\n";);
543 // For now, ignore blocks that belong to a SISE region that is a
544 // candidate for outlining. In the future, we may want to look
545 // at inner regions because the outer region may have live-exit
547 for (auto *BB
: DominateVector
)
548 VisitedMap
[BB
] = true;
550 // ReturnBlock here means the block after the outline call
551 BasicBlock
*ReturnBlock
= ExitBlock
->getSingleSuccessor();
552 FunctionOutliningMultiRegionInfo::OutlineRegionInfo
RegInfo(
553 DominateVector
, DominateVector
.front(), ExitBlock
, ReturnBlock
);
554 OutliningInfo
->ORI
.push_back(RegInfo
);
555 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
556 << DominateVector
.front()->getName() << "\n";);
557 ColdCandidateFound
= true;
558 NumColdRegionsFound
++;
562 if (ColdCandidateFound
)
563 return OutliningInfo
;
565 return std::unique_ptr
<FunctionOutliningMultiRegionInfo
>();
568 std::unique_ptr
<FunctionOutliningInfo
>
569 PartialInlinerImpl::computeOutliningInfo(Function
&F
) const {
570 BasicBlock
*EntryBlock
= &F
.front();
571 BranchInst
*BR
= dyn_cast
<BranchInst
>(EntryBlock
->getTerminator());
572 if (!BR
|| BR
->isUnconditional())
573 return std::unique_ptr
<FunctionOutliningInfo
>();
575 // Returns true if Succ is BB's successor
576 auto IsSuccessor
= [](BasicBlock
*Succ
, BasicBlock
*BB
) {
577 return is_contained(successors(BB
), Succ
);
580 auto IsReturnBlock
= [](BasicBlock
*BB
) {
581 Instruction
*TI
= BB
->getTerminator();
582 return isa
<ReturnInst
>(TI
);
585 auto GetReturnBlock
= [&](BasicBlock
*Succ1
, BasicBlock
*Succ2
) {
586 if (IsReturnBlock(Succ1
))
587 return std::make_tuple(Succ1
, Succ2
);
588 if (IsReturnBlock(Succ2
))
589 return std::make_tuple(Succ2
, Succ1
);
591 return std::make_tuple
<BasicBlock
*, BasicBlock
*>(nullptr, nullptr);
594 // Detect a triangular shape:
595 auto GetCommonSucc
= [&](BasicBlock
*Succ1
, BasicBlock
*Succ2
) {
596 if (IsSuccessor(Succ1
, Succ2
))
597 return std::make_tuple(Succ1
, Succ2
);
598 if (IsSuccessor(Succ2
, Succ1
))
599 return std::make_tuple(Succ2
, Succ1
);
601 return std::make_tuple
<BasicBlock
*, BasicBlock
*>(nullptr, nullptr);
604 std::unique_ptr
<FunctionOutliningInfo
> OutliningInfo
=
605 std::make_unique
<FunctionOutliningInfo
>();
607 BasicBlock
*CurrEntry
= EntryBlock
;
608 bool CandidateFound
= false;
610 // The number of blocks to be inlined has already reached
611 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
612 // disables partial inlining for the function.
613 if (OutliningInfo
->getNumInlinedBlocks() >= MaxNumInlineBlocks
)
616 if (succ_size(CurrEntry
) != 2)
619 BasicBlock
*Succ1
= *succ_begin(CurrEntry
);
620 BasicBlock
*Succ2
= *(succ_begin(CurrEntry
) + 1);
622 BasicBlock
*ReturnBlock
, *NonReturnBlock
;
623 std::tie(ReturnBlock
, NonReturnBlock
) = GetReturnBlock(Succ1
, Succ2
);
626 OutliningInfo
->Entries
.push_back(CurrEntry
);
627 OutliningInfo
->ReturnBlock
= ReturnBlock
;
628 OutliningInfo
->NonReturnBlock
= NonReturnBlock
;
629 CandidateFound
= true;
633 BasicBlock
*CommSucc
, *OtherSucc
;
634 std::tie(CommSucc
, OtherSucc
) = GetCommonSucc(Succ1
, Succ2
);
639 OutliningInfo
->Entries
.push_back(CurrEntry
);
640 CurrEntry
= OtherSucc
;
644 return std::unique_ptr
<FunctionOutliningInfo
>();
646 // Do sanity check of the entries: there should not
647 // be any successors (not in the entry set) other than
648 // {ReturnBlock, NonReturnBlock}
649 assert(OutliningInfo
->Entries
[0] == &F
.front() &&
650 "Function Entry must be the first in Entries vector");
651 DenseSet
<BasicBlock
*> Entries
;
652 for (BasicBlock
*E
: OutliningInfo
->Entries
)
655 // Returns true if BB has a predecessor which is not
657 auto HasNonEntryPred
= [Entries
](BasicBlock
*BB
) {
658 for (auto *Pred
: predecessors(BB
)) {
659 if (!Entries
.count(Pred
))
664 auto CheckAndNormalizeCandidate
=
665 [Entries
, HasNonEntryPred
](FunctionOutliningInfo
*OutliningInfo
) {
666 for (BasicBlock
*E
: OutliningInfo
->Entries
) {
667 for (auto *Succ
: successors(E
)) {
668 if (Entries
.count(Succ
))
670 if (Succ
== OutliningInfo
->ReturnBlock
)
671 OutliningInfo
->ReturnBlockPreds
.push_back(E
);
672 else if (Succ
!= OutliningInfo
->NonReturnBlock
)
675 // There should not be any outside incoming edges either:
676 if (HasNonEntryPred(E
))
682 if (!CheckAndNormalizeCandidate(OutliningInfo
.get()))
683 return std::unique_ptr
<FunctionOutliningInfo
>();
685 // Now further growing the candidate's inlining region by
686 // peeling off dominating blocks from the outlining region:
687 while (OutliningInfo
->getNumInlinedBlocks() < MaxNumInlineBlocks
) {
688 BasicBlock
*Cand
= OutliningInfo
->NonReturnBlock
;
689 if (succ_size(Cand
) != 2)
692 if (HasNonEntryPred(Cand
))
695 BasicBlock
*Succ1
= *succ_begin(Cand
);
696 BasicBlock
*Succ2
= *(succ_begin(Cand
) + 1);
698 BasicBlock
*ReturnBlock
, *NonReturnBlock
;
699 std::tie(ReturnBlock
, NonReturnBlock
) = GetReturnBlock(Succ1
, Succ2
);
700 if (!ReturnBlock
|| ReturnBlock
!= OutliningInfo
->ReturnBlock
)
703 if (NonReturnBlock
->getSinglePredecessor() != Cand
)
706 // Now grow and update OutliningInfo:
707 OutliningInfo
->Entries
.push_back(Cand
);
708 OutliningInfo
->NonReturnBlock
= NonReturnBlock
;
709 OutliningInfo
->ReturnBlockPreds
.push_back(Cand
);
710 Entries
.insert(Cand
);
713 return OutliningInfo
;
716 // Check if there is PGO data or user annotated branch data:
717 static bool hasProfileData(const Function
&F
, const FunctionOutliningInfo
&OI
) {
718 if (F
.hasProfileData())
720 // Now check if any of the entry block has MD_prof data:
721 for (auto *E
: OI
.Entries
) {
722 BranchInst
*BR
= dyn_cast
<BranchInst
>(E
->getTerminator());
723 if (!BR
|| BR
->isUnconditional())
726 if (BR
->extractProfMetadata(T
, F
))
732 BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(
733 FunctionCloner
&Cloner
) const {
734 BasicBlock
*OutliningCallBB
= Cloner
.OutlinedFunctions
.back().second
;
736 Cloner
.ClonedFuncBFI
->getBlockFreq(&Cloner
.ClonedFunc
->getEntryBlock());
737 auto OutliningCallFreq
=
738 Cloner
.ClonedFuncBFI
->getBlockFreq(OutliningCallBB
);
739 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
740 // we outlined any regions, so we may encounter situations where the
741 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
742 if (OutliningCallFreq
.getFrequency() > EntryFreq
.getFrequency())
743 OutliningCallFreq
= EntryFreq
;
745 auto OutlineRegionRelFreq
= BranchProbability::getBranchProbability(
746 OutliningCallFreq
.getFrequency(), EntryFreq
.getFrequency());
748 if (hasProfileData(*Cloner
.OrigFunc
, *Cloner
.ClonedOI
.get()))
749 return OutlineRegionRelFreq
;
751 // When profile data is not available, we need to be conservative in
752 // estimating the overall savings. Static branch prediction can usually
753 // guess the branch direction right (taken/non-taken), but the guessed
754 // branch probability is usually not biased enough. In case when the
755 // outlined region is predicted to be likely, its probability needs
756 // to be made higher (more biased) to not under-estimate the cost of
757 // function outlining. On the other hand, if the outlined region
758 // is predicted to be less likely, the predicted probablity is usually
759 // higher than the actual. For instance, the actual probability of the
760 // less likely target is only 5%, but the guessed probablity can be
761 // 40%. In the latter case, there is no need for further adjustement.
762 // FIXME: add an option for this.
763 if (OutlineRegionRelFreq
< BranchProbability(45, 100))
764 return OutlineRegionRelFreq
;
766 OutlineRegionRelFreq
= std::max(
767 OutlineRegionRelFreq
, BranchProbability(OutlineRegionFreqPercent
, 100));
769 return OutlineRegionRelFreq
;
772 bool PartialInlinerImpl::shouldPartialInline(
773 CallBase
&CB
, FunctionCloner
&Cloner
, BlockFrequency WeightedOutliningRcost
,
774 OptimizationRemarkEmitter
&ORE
) const {
777 Function
*Callee
= CB
.getCalledFunction();
778 assert(Callee
== Cloner
.ClonedFunc
);
780 if (SkipCostAnalysis
)
781 return isInlineViable(*Callee
).isSuccess();
783 Function
*Caller
= CB
.getCaller();
784 auto &CalleeTTI
= GetTTI(*Callee
);
785 bool RemarksEnabled
=
786 Callee
->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
789 getInlineCost(CB
, getInlineParams(), CalleeTTI
, GetAssumptionCache
,
790 GetTLI
, GetBFI
, &PSI
, RemarksEnabled
? &ORE
: nullptr);
794 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "AlwaysInline", &CB
)
795 << NV("Callee", Cloner
.OrigFunc
)
796 << " should always be fully inlined, not partially";
803 return OptimizationRemarkMissed(DEBUG_TYPE
, "NeverInline", &CB
)
804 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
805 << NV("Caller", Caller
)
806 << " because it should never be inlined (cost=never)";
813 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "TooCostly", &CB
)
814 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
815 << NV("Caller", Caller
) << " because too costly to inline (cost="
816 << NV("Cost", IC
.getCost()) << ", threshold="
817 << NV("Threshold", IC
.getCostDelta() + IC
.getCost()) << ")";
821 const DataLayout
&DL
= Caller
->getParent()->getDataLayout();
823 // The savings of eliminating the call:
824 int NonWeightedSavings
= getCallsiteCost(CB
, DL
);
825 BlockFrequency
NormWeightedSavings(NonWeightedSavings
);
827 // Weighted saving is smaller than weighted cost, return false
828 if (NormWeightedSavings
< WeightedOutliningRcost
) {
830 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "OutliningCallcostTooHigh",
832 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
833 << NV("Caller", Caller
) << " runtime overhead (overhead="
834 << NV("Overhead", (unsigned)WeightedOutliningRcost
.getFrequency())
836 << NV("Savings", (unsigned)NormWeightedSavings
.getFrequency())
838 << " of making the outlined call is too high";
845 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "CanBePartiallyInlined", &CB
)
846 << NV("Callee", Cloner
.OrigFunc
) << " can be partially inlined into "
847 << NV("Caller", Caller
) << " with cost=" << NV("Cost", IC
.getCost())
849 << NV("Threshold", IC
.getCostDelta() + IC
.getCost()) << ")";
854 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
855 // For now use a simplified version. The returned 'InlineCost' will be used
856 // to estimate the size cost as well as runtime cost of the BB.
858 PartialInlinerImpl::computeBBInlineCost(BasicBlock
*BB
,
859 TargetTransformInfo
*TTI
) {
860 InstructionCost InlineCost
= 0;
861 const DataLayout
&DL
= BB
->getParent()->getParent()->getDataLayout();
862 for (Instruction
&I
: BB
->instructionsWithoutDebug()) {
863 // Skip free instructions.
864 switch (I
.getOpcode()) {
865 case Instruction::BitCast
:
866 case Instruction::PtrToInt
:
867 case Instruction::IntToPtr
:
868 case Instruction::Alloca
:
869 case Instruction::PHI
:
871 case Instruction::GetElementPtr
:
872 if (cast
<GetElementPtrInst
>(&I
)->hasAllZeroIndices())
879 if (I
.isLifetimeStartOrEnd())
882 if (auto *II
= dyn_cast
<IntrinsicInst
>(&I
)) {
883 Intrinsic::ID IID
= II
->getIntrinsicID();
884 SmallVector
<Type
*, 4> Tys
;
886 for (Value
*Val
: II
->args())
887 Tys
.push_back(Val
->getType());
889 if (auto *FPMO
= dyn_cast
<FPMathOperator
>(II
))
890 FMF
= FPMO
->getFastMathFlags();
892 IntrinsicCostAttributes
ICA(IID
, II
->getType(), Tys
, FMF
);
893 InlineCost
+= TTI
->getIntrinsicInstrCost(ICA
, TTI::TCK_SizeAndLatency
);
897 if (CallInst
*CI
= dyn_cast
<CallInst
>(&I
)) {
898 InlineCost
+= getCallsiteCost(*CI
, DL
);
902 if (InvokeInst
*II
= dyn_cast
<InvokeInst
>(&I
)) {
903 InlineCost
+= getCallsiteCost(*II
, DL
);
907 if (SwitchInst
*SI
= dyn_cast
<SwitchInst
>(&I
)) {
908 InlineCost
+= (SI
->getNumCases() + 1) * InlineConstants::InstrCost
;
911 InlineCost
+= InlineConstants::InstrCost
;
// Returns the pair (size cost of the call sequences to the outlined
// functions, estimated runtime overhead of outlining). The runtime overhead
// is call cost + (outlined function cost - original region cost) + a
// tunable penalty.
// NOTE(review): extraction gaps -- original lines 928/932/935/941/946
// (presumably blank lines) and 948 (the function's closing brace) are
// missing from this paste.
917 std::tuple
<InstructionCost
, InstructionCost
>
918 PartialInlinerImpl::computeOutliningCosts(FunctionCloner
&Cloner
) const {
919 InstructionCost OutliningFuncCallCost
= 0, OutlinedFunctionCost
= 0;
920 for (auto FuncBBPair
: Cloner
.OutlinedFunctions
) {
921 Function
*OutlinedFunc
= FuncBBPair
.first
;
922 BasicBlock
* OutliningCallBB
= FuncBBPair
.second
;
923 // Now compute the cost of the call sequence to the outlined function
924 // 'OutlinedFunction' in BB 'OutliningCallBB':
925 auto *OutlinedFuncTTI
= &GetTTI(*OutlinedFunc
);
926 OutliningFuncCallCost
+=
927 computeBBInlineCost(OutliningCallBB
, OutlinedFuncTTI
);
929 // Now compute the cost of the extracted/outlined function itself:
930 for (BasicBlock
&BB
: *OutlinedFunc
)
931 OutlinedFunctionCost
+= computeBBInlineCost(&BB
, OutlinedFuncTTI
);
// Sanity check: extraction can only add code (stubs, arg passing), never
// shrink the region.
933 assert(OutlinedFunctionCost
>= Cloner
.OutlinedRegionCost
&&
934 "Outlined function cost should be no less than the outlined region");
936 // The code extractor introduces a new root and exit stub blocks with
937 // additional unconditional branches. Those branches will be eliminated
938 // later with bb layout. The cost should be adjusted accordingly:
939 OutlinedFunctionCost
-=
940 2 * InlineConstants::InstrCost
* Cloner
.OutlinedFunctions
.size();
942 InstructionCost OutliningRuntimeOverhead
=
943 OutliningFuncCallCost
+
944 (OutlinedFunctionCost
- Cloner
.OutlinedRegionCost
) +
945 ExtraOutliningPenalty
.getValue();
947 return std::make_tuple(OutliningFuncCallCost
, OutliningRuntimeOverhead
);
950 // Create the callsite to profile count map which is
951 // used to update the original function's entry count,
952 // after the function is partially inlined into the callsite.
// For each user (call site) of the cloned function, records the profile
// count of the containing block (0 when no count is available). Caches the
// caller's BFI across consecutive call sites from the same caller.
// NOTE(review): extraction gaps -- original lines 964/966 (the old-pass-
// manager branch with its LoopInfo), 970-975 (the 'else' branch closing and
// the lambda terminator), and 987-992 (the 'if (Count)' / 'else' lines and
// closing braces) are missing from this paste.
953 void PartialInlinerImpl::computeCallsiteToProfCountMap(
954 Function
*DuplicateFunction
,
955 DenseMap
<User
*, uint64_t> &CallSiteToProfCountMap
) const {
// Snapshot the users first; the map is filled while iterating this copy.
956 std::vector
<User
*> Users(DuplicateFunction
->user_begin(),
957 DuplicateFunction
->user_end());
958 Function
*CurrentCaller
= nullptr;
959 std::unique_ptr
<BlockFrequencyInfo
> TempBFI
;
960 BlockFrequencyInfo
*CurrentCallerBFI
= nullptr;
// Computes (or fetches) BFI for the given caller and stashes it in
// CurrentCallerBFI; TempBFI owns the locally-built analysis.
962 auto ComputeCurrBFI
= [&,this](Function
*Caller
) {
963 // For the old pass manager:
965 DominatorTree
DT(*Caller
);
967 BranchProbabilityInfo
BPI(*Caller
, LI
);
968 TempBFI
.reset(new BlockFrequencyInfo(*Caller
, BPI
, LI
));
969 CurrentCallerBFI
= TempBFI
.get();
// New pass manager path: ask the cached analysis getter instead.
972 CurrentCallerBFI
= &(GetBFI(*Caller
));
976 for (User
*User
: Users
) {
977 CallBase
*CB
= getSupportedCallBase(User
);
978 Function
*Caller
= CB
->getCaller();
// Only recompute BFI when the caller changes between consecutive sites.
979 if (CurrentCaller
!= Caller
) {
980 CurrentCaller
= Caller
;
981 ComputeCurrBFI(Caller
);
983 assert(CurrentCallerBFI
&& "CallerBFI is not set");
985 BasicBlock
*CallBB
= CB
->getParent();
986 auto Count
= CurrentCallerBFI
->getBlockProfileCount(CallBB
);
988 CallSiteToProfCountMap
[User
] = *Count
;
// No profile count available for this block: record zero.
990 CallSiteToProfCountMap
[User
] = 0;
// Single-region cloner: clones F, remaps the FunctionOutliningInfo's blocks
// through the clone's value map, and redirects all of F's uses to the clone
// so the inliner can operate on it.
// NOTE(review): extraction gaps -- original lines 1000/1004/1009 (blank
// lines), 1013 (closing brace of the preds loop) and 1017 (the
// constructor's closing brace) are missing from this paste.
994 PartialInlinerImpl::FunctionCloner::FunctionCloner(
995 Function
*F
, FunctionOutliningInfo
*OI
, OptimizationRemarkEmitter
&ORE
,
996 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
997 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
)
998 : OrigFunc(F
), ORE(ORE
), LookupAC(LookupAC
), GetTTI(GetTTI
) {
999 ClonedOI
= std::make_unique
<FunctionOutliningInfo
>();
1001 // Clone the function, so that we can hack away on it.
1002 ValueToValueMapTy VMap
;
1003 ClonedFunc
= CloneFunction(F
, VMap
);
// Translate every block recorded in OI into its counterpart in the clone.
1005 ClonedOI
->ReturnBlock
= cast
<BasicBlock
>(VMap
[OI
->ReturnBlock
]);
1006 ClonedOI
->NonReturnBlock
= cast
<BasicBlock
>(VMap
[OI
->NonReturnBlock
]);
1007 for (BasicBlock
*BB
: OI
->Entries
)
1008 ClonedOI
->Entries
.push_back(cast
<BasicBlock
>(VMap
[BB
]));
1010 for (BasicBlock
*E
: OI
->ReturnBlockPreds
) {
1011 BasicBlock
*NewE
= cast
<BasicBlock
>(VMap
[E
]);
1012 ClonedOI
->ReturnBlockPreds
.push_back(NewE
);
1014 // Go ahead and update all uses to the duplicate, so that we can just
1015 // use the inliner functionality when we're done hacking.
1016 F
->replaceAllUsesWith(ClonedFunc
);
// Multi-region cloner: clones F, remaps every cold outline-region's blocks
// through the clone's value map into ClonedOMRI, and redirects all of F's
// uses to the clone.
// NOTE(review): extraction gaps -- original lines 1026/1030 (blank lines),
// 1032 ("// references from OI to the new blocks" continuation comment),
// 1034 (the range expression "OI->ORI) {" of the region loop), 1038, 1047
// (loop closing brace) and 1051 (constructor closing brace) are missing
// from this paste.
1019 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1020 Function
*F
, FunctionOutliningMultiRegionInfo
*OI
,
1021 OptimizationRemarkEmitter
&ORE
,
1022 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
1023 function_ref
<TargetTransformInfo
&(Function
&)> GetTTI
)
1024 : OrigFunc(F
), ORE(ORE
), LookupAC(LookupAC
), GetTTI(GetTTI
) {
1025 ClonedOMRI
= std::make_unique
<FunctionOutliningMultiRegionInfo
>();
1027 // Clone the function, so that we can hack away on it.
1028 ValueToValueMapTy VMap
;
1029 ClonedFunc
= CloneFunction(F
, VMap
);
1031 // Go through all Outline Candidate Regions and update all BasicBlock
1033 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo
:
// Remap the region's block list into the clone's blocks.
1035 SmallVector
<BasicBlock
*, 8> Region
;
1036 for (BasicBlock
*BB
: RegionInfo
.Region
)
1037 Region
.push_back(cast
<BasicBlock
>(VMap
[BB
]));
1039 BasicBlock
*NewEntryBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.EntryBlock
]);
1040 BasicBlock
*NewExitBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.ExitBlock
]);
// ReturnBlock is optional for a region; remap only when present.
1041 BasicBlock
*NewReturnBlock
= nullptr;
1042 if (RegionInfo
.ReturnBlock
)
1043 NewReturnBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.ReturnBlock
]);
1044 FunctionOutliningMultiRegionInfo::OutlineRegionInfo
MappedRegionInfo(
1045 Region
, NewEntryBlock
, NewExitBlock
, NewReturnBlock
);
1046 ClonedOMRI
->ORI
.push_back(MappedRegionInfo
);
1048 // Go ahead and update all uses to the duplicate, so that we can just
1049 // use the inliner functionality when we're done hacking.
1050 F
->replaceAllUsesWith(ClonedFunc
);
// Splits PHI nodes in the (cloned) return block into a two-level sequence
// so that extraction boundaries stay clean: one PHI level inside the region
// to be extracted, one outside. Only runs when a PHI has more incoming
// values than entry predecessors + 1.
// NOTE(review): extraction gaps are severe in this function -- original
// lines 1059-1068 (the rest of the GetFirstPHI lambda: loop break,
// FirstPhi assignment and 'return FirstPhi;'), 1070-1073, 1082, 1084-1085
// ('return;'), 1090-1093 (the IsTrivialPhi returns), 1101-1104 (loop break
// and the 'PHINode *RetPhi =' declaration used at 1105), 1108, 1113-1114,
// 1122-1124, 1127 and 1130-1131 (closing braces) are all missing from this
// paste. Do not build from this text as-is.
1053 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
// Returns the first PHI of BB, or nullptr when BB starts with a non-PHI.
1054 auto GetFirstPHI
= [](BasicBlock
*BB
) {
1055 BasicBlock::iterator I
= BB
->begin();
1056 PHINode
*FirstPhi
= nullptr;
1057 while (I
!= BB
->end()) {
1058 PHINode
*Phi
= dyn_cast
<PHINode
>(I
);
1069 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1074 // Special hackery is needed with PHI nodes that have inputs from more than
1075 // one extracted block. For simplicity, just split the PHIs into a two-level
1076 // sequence of PHIs, some of which will go in the extracted region, and some
1077 // of which will go outside.
1078 BasicBlock
*PreReturn
= ClonedOI
->ReturnBlock
;
1079 // only split block when necessary:
1080 PHINode
*FirstPhi
= GetFirstPHI(PreReturn
);
1081 unsigned NumPredsFromEntries
= ClonedOI
->ReturnBlockPreds
.size();
// No splitting needed when each PHI input maps to a distinct entry pred.
1083 if (!FirstPhi
|| FirstPhi
->getNumIncomingValues() <= NumPredsFromEntries
+ 1)
// Returns the common incoming value if PN is trivial, per the all_of check.
1086 auto IsTrivialPhi
= [](PHINode
*PN
) -> Value
* {
1087 Value
*CommonValue
= PN
->getIncomingValue(0);
1088 if (all_of(PN
->incoming_values(),
1089 [&](Value
*V
) { return V
== CommonValue
; }))
// Split the return block after its PHIs; the new block becomes ReturnBlock.
1094 ClonedOI
->ReturnBlock
= ClonedOI
->ReturnBlock
->splitBasicBlock(
1095 ClonedOI
->ReturnBlock
->getFirstNonPHI()->getIterator());
1096 BasicBlock::iterator I
= PreReturn
->begin();
1097 Instruction
*Ins
= &ClonedOI
->ReturnBlock
->front();
1098 SmallVector
<Instruction
*, 4> DeadPhis
;
1099 while (I
!= PreReturn
->end()) {
1100 PHINode
*OldPhi
= dyn_cast
<PHINode
>(I
);
// New second-level PHI in the split-off return block mirrors OldPhi.
1105 PHINode::Create(OldPhi
->getType(), NumPredsFromEntries
+ 1, "", Ins
);
1106 OldPhi
->replaceAllUsesWith(RetPhi
);
1107 Ins
= ClonedOI
->ReturnBlock
->getFirstNonPHI();
1109 RetPhi
->addIncoming(&*I
, PreReturn
);
// Move each entry-predecessor input up to the new PHI level.
1110 for (BasicBlock
*E
: ClonedOI
->ReturnBlockPreds
) {
1111 RetPhi
->addIncoming(OldPhi
->getIncomingValueForBlock(E
), E
);
1112 OldPhi
->removeIncomingValue(E
);
1115 // After incoming values splitting, the old phi may become trivial.
1116 // Keeping the trivial phi can introduce definition inside the outline
1117 // region which is live-out, causing necessary overhead (load, store
1118 // arg passing etc).
1119 if (auto *OldPhiVal
= IsTrivialPhi(OldPhi
)) {
1120 OldPhi
->replaceAllUsesWith(OldPhiVal
);
1121 DeadPhis
.push_back(OldPhi
);
// Erase outside the iteration to avoid invalidating the iterator.
1125 for (auto *DP
: DeadPhis
)
1126 DP
->eraseFromParent();
// Retarget entry predecessors from the old block to the split-off one.
1128 for (auto *E
: ClonedOI
->ReturnBlockPreds
)
1129 E
->getTerminator()->replaceUsesOfWith(PreReturn
, ClonedOI
->ReturnBlock
);
// Extracts every recorded cold region of the cloned function into its own
// outlined function via CodeExtractor; records (function, call-block) pairs
// and accumulates the original region cost. Returns whether any region was
// outlined.
// NOTE(review): extraction gaps -- original lines 1139-1141 ('return Cost;'
// and the lambda terminator), 1145-1146 ('return false;'), 1148
// ('DominatorTree DT;'), 1152 ('LoopInfo LI(DT);' -- LI is used below but
// never declared in the visible text), 1161 (the loop's range expression
// 'ClonedOMRI->ORI) {'), the LLVM_DEBUG wrappers around 1173-1178, 1183
// ('continue;'), and the closing braces at 1196-1205 are missing from this
// paste.
1132 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
// Helper pricing one region as the sum of its blocks' inline costs.
1134 auto ComputeRegionCost
=
1135 [&](SmallVectorImpl
<BasicBlock
*> &Region
) -> InstructionCost
{
1136 InstructionCost Cost
= 0;
1137 for (BasicBlock
* BB
: Region
)
1138 Cost
+= computeBBInlineCost(BB
, &GetTTI(*BB
->getParent()));
1142 assert(ClonedOMRI
&& "Expecting OutlineInfo for multi region outline");
1144 if (ClonedOMRI
->ORI
.empty())
1147 // The CodeExtractor needs a dominator tree.
1149 DT
.recalculate(*ClonedFunc
);
1151 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1153 BranchProbabilityInfo
BPI(*ClonedFunc
, LI
);
1154 ClonedFuncBFI
.reset(new BlockFrequencyInfo(*ClonedFunc
, BPI
, LI
));
1156 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1157 CodeExtractorAnalysisCache
CEAC(*ClonedFunc
);
1159 SetVector
<Value
*> Inputs
, Outputs
, Sinks
;
1160 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo
:
// Price the region before extraction mutates the blocks.
1162 InstructionCost CurrentOutlinedRegionCost
=
1163 ComputeRegionCost(RegionInfo
.Region
);
1165 CodeExtractor
CE(RegionInfo
.Region
, &DT
, /*AggregateArgs*/ false,
1166 ClonedFuncBFI
.get(), &BPI
,
1167 LookupAC(*RegionInfo
.EntryBlock
->getParent()),
1168 /* AllowVarargs */ false);
1170 CE
.findInputsOutputs(Inputs
, Outputs
, Sinks
);
// Debug dump of the region's live-in/live-out values (presumably wrapped in
// LLVM_DEBUG upstream -- the wrapper lines are missing here).
1173 dbgs() << "inputs: " << Inputs
.size() << "\n";
1174 dbgs() << "outputs: " << Outputs
.size() << "\n";
1175 for (Value
*value
: Inputs
)
1176 dbgs() << "value used in func: " << *value
<< "\n";
1177 for (Value
*output
: Outputs
)
1178 dbgs() << "instr used in func: " << *output
<< "\n";
1181 // Do not extract regions that have live exit variables.
1182 if (Outputs
.size() > 0 && !ForceLiveExit
)
1185 if (Function
*OutlinedFunc
= CE
.extractCodeRegion(CEAC
)) {
1186 CallBase
*OCS
= PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc
);
1187 BasicBlock
*OutliningCallBB
= OCS
->getParent();
1188 assert(OutliningCallBB
->getParent() == ClonedFunc
);
1189 OutlinedFunctions
.push_back(std::make_pair(OutlinedFunc
,OutliningCallBB
));
1190 NumColdRegionsOutlined
++;
1191 OutlinedRegionCost
+= CurrentOutlinedRegionCost
;
// Optionally mark the cold path with the cold calling convention.
1193 if (MarkOutlinedColdCC
) {
1194 OutlinedFunc
->setCallingConv(CallingConv::Cold
);
1195 OCS
->setCallingConv(CallingConv::Cold
);
// Extraction failure is reported as a missed-optimization remark.
1199 return OptimizationRemarkMissed(DEBUG_TYPE
, "ExtractFailed",
1200 &RegionInfo
.Region
.front()->front())
1201 << "Failed to extract region at block "
1202 << ore::NV("Block", RegionInfo
.Region
.front());
1206 return !OutlinedFunctions
.empty();
// Extracts everything except the entry blocks and the return block of the
// cloned function into a single outlined function. Returns the outlined
// function (nullptr on extraction failure).
// NOTE(review): extraction gaps -- original line 1209 (the 'Function *'
// return type), 1216-1217 (lambda terminator), 1220 ('DominatorTree DT;'),
// 1224 ('LoopInfo LI(DT);' -- LI is used below but never declared in the
// visible text), 1242-1243, 1251-1252 ('if (OutlinedFunc) {'), 1257-1258
// ('} else' and the ORE.emit wrapper) and 1263-1264 are missing from this
// paste.
1210 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1211 // Returns true if the block is to be partial inlined into the caller
1212 // (i.e. not to be extracted to the out of line function)
1213 auto ToBeInlined
= [&, this](BasicBlock
*BB
) {
1214 return BB
== ClonedOI
->ReturnBlock
||
1215 llvm::is_contained(ClonedOI
->Entries
, BB
);
1218 assert(ClonedOI
&& "Expecting OutlineInfo for single region outline");
1219 // The CodeExtractor needs a dominator tree.
1221 DT
.recalculate(*ClonedFunc
);
1223 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1225 BranchProbabilityInfo
BPI(*ClonedFunc
, LI
);
1226 ClonedFuncBFI
.reset(new BlockFrequencyInfo(*ClonedFunc
, BPI
, LI
));
1228 // Gather up the blocks that we're going to extract.
1229 std::vector
<BasicBlock
*> ToExtract
;
1230 auto *ClonedFuncTTI
= &GetTTI(*ClonedFunc
);
// The non-return block is always part of the extracted region.
1231 ToExtract
.push_back(ClonedOI
->NonReturnBlock
);
1232 OutlinedRegionCost
+= PartialInlinerImpl::computeBBInlineCost(
1233 ClonedOI
->NonReturnBlock
, ClonedFuncTTI
);
1234 for (BasicBlock
&BB
: *ClonedFunc
)
1235 if (!ToBeInlined(&BB
) && &BB
!= ClonedOI
->NonReturnBlock
) {
1236 ToExtract
.push_back(&BB
);
1237 // FIXME: the code extractor may hoist/sink more code
1238 // into the outlined function which may make the outlining
1239 // overhead (the difference of the outlined function cost
1240 // and OutliningRegionCost) look larger.
1241 OutlinedRegionCost
+= computeBBInlineCost(&BB
, ClonedFuncTTI
);
1244 // Extract the body of the if.
1245 CodeExtractorAnalysisCache
CEAC(*ClonedFunc
);
// AllowVarargs is true here (unlike the multi-region path) because a single
// extracted region can forward varargs.
1246 Function
*OutlinedFunc
=
1247 CodeExtractor(ToExtract
, &DT
, /*AggregateArgs*/ false,
1248 ClonedFuncBFI
.get(), &BPI
, LookupAC(*ClonedFunc
),
1249 /* AllowVarargs */ true)
1250 .extractCodeRegion(CEAC
);
1253 BasicBlock
*OutliningCallBB
=
1254 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc
)->getParent();
1255 assert(OutliningCallBB
->getParent() == ClonedFunc
);
1256 OutlinedFunctions
.push_back(std::make_pair(OutlinedFunc
, OutliningCallBB
));
// Failure path: emit a missed-optimization remark.
1259 return OptimizationRemarkMissed(DEBUG_TYPE
, "ExtractFailed",
1260 &ToExtract
.front()->front())
1261 << "Failed to extract region at block "
1262 << ore::NV("Block", ToExtract
.front());
1265 return OutlinedFunc
;
// Destructor: restores all uses back to the original function, deletes the
// clone, and -- when partial inlining never happened -- deletes the
// speculatively-created outlined functions too.
// NOTE(review): extraction gaps -- original line 1275 (continuation of the
// comment at 1274) and 1279-1281 (closing braces) are missing from this
// paste.
1268 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1269 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1270 // users (function pointers, etc.) back to the original function.
1271 ClonedFunc
->replaceAllUsesWith(OrigFunc
);
1272 ClonedFunc
->eraseFromParent();
1273 if (!IsFunctionInlined
) {
1274 // Remove each function that was speculatively created if there is no
1276 for (auto FuncBBPair
: OutlinedFunctions
) {
1277 Function
*Func
= FuncBBPair
.first
;
1278 Func
->eraseFromParent();
// Top-level driver for one function: bail out on address-taken, always-
// inline, noinline, entry-cold, or unused functions; try multi-region
// (cold) outlining first when profile data exists, then fall back to
// single-region partial inlining. Returns {changed, outlined-function}.
// NOTE(review): extraction gaps -- original lines 1310-1316 (the LLVM_DEBUG
// wrapper around the threshold dump), 1318-1325 ('if (DidOutline) {' and
// LLVM_DEBUG wrappers), 1328-1331 (closing braces), 1336 ('if (!OI)'),
// 1338, 1341, 1343, 1346 and 1349 (blank lines / braces) are missing from
// this paste.
1283 std::pair
<bool, Function
*> PartialInlinerImpl::unswitchFunction(Function
&F
) {
1284 if (F
.hasAddressTaken())
1285 return {false, nullptr};
1287 // Let inliner handle it
1288 if (F
.hasFnAttribute(Attribute::AlwaysInline
))
1289 return {false, nullptr};
1291 if (F
.hasFnAttribute(Attribute::NoInline
))
1292 return {false, nullptr};
1294 if (PSI
.isFunctionEntryCold(&F
))
1295 return {false, nullptr};
1297 if (F
.users().empty())
1298 return {false, nullptr};
1300 OptimizationRemarkEmitter
ORE(&F
);
1302 // Only try to outline cold regions if we have a profile summary, which
1303 // implies we have profiling information.
1304 if (PSI
.hasProfileSummary() && F
.hasProfileData() &&
1305 !DisableMultiRegionPartialInline
) {
1306 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> OMRI
=
1307 computeOutliningColdRegionsInfo(F
, ORE
);
1309 FunctionCloner
Cloner(&F
, OMRI
.get(), ORE
, LookupAssumptionCache
, GetTTI
);
// Debug dump of profile thresholds (LLVM_DEBUG wrapper lines missing here).
1312 dbgs() << "HotCountThreshold = " << PSI
.getHotCountThreshold() << "\n";
1313 dbgs() << "ColdCountThreshold = " << PSI
.getColdCountThreshold()
1317 bool DidOutline
= Cloner
.doMultiRegionFunctionOutlining();
1321 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1322 Cloner
.ClonedFunc
->print(dbgs());
1323 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
// Success on the cold-region path returns no outlined function to enqueue.
1326 if (tryPartialInline(Cloner
))
1327 return {true, nullptr};
1332 // Fall-thru to regular partial inlining if we:
1333 // i) can't find any cold regions to outline, or
1334 // ii) can't inline the outlined function anywhere.
1335 std::unique_ptr
<FunctionOutliningInfo
> OI
= computeOutliningInfo(F
);
1337 return {false, nullptr};
1339 FunctionCloner
Cloner(&F
, OI
.get(), ORE
, LookupAssumptionCache
, GetTTI
);
1340 Cloner
.normalizeReturnBlock();
1342 Function
*OutlinedFunction
= Cloner
.doSingleRegionFunctionOutlining();
1344 if (!OutlinedFunction
)
1345 return {false, nullptr};
1347 if (tryPartialInline(Cloner
))
1348 return {true, OutlinedFunction
};
1350 return {false, nullptr};
// Attempts to inline the (partially outlined) cloned function into each of
// its call sites, guarded by a cost model, then updates the original
// function's entry count from the per-callsite profile counts. Returns
// whether at least one site was inlined.
// NOTE(review): extraction gaps -- original lines 1355-1357 ('return
// false;'), 1364, 1389-1390 (DebugLoc/BasicBlock declarations feeding the
// std::tie at 1391), 1394, 1400-1403 ('}); return false; }'), 1421/1423-
// 1424 ('break;'), 1427-1428 ('continue;'), 1440-1445 (the InlineFunction
// null-callee alternative and 'continue;'), 1450-1452, 1454, 1457
// ('else'), 1459-1461 ('if (!AnyInline) return false;') and the tail
// 1470-1474 ('}); return AnyInline; }') are missing from this paste.
1353 bool PartialInlinerImpl::tryPartialInline(FunctionCloner
&Cloner
) {
1354 if (Cloner
.OutlinedFunctions
.empty())
1358 BlockFrequency WeightedRcost
;
1359 int NonWeightedRcost
;
1361 auto OutliningCosts
= computeOutliningCosts(Cloner
);
1362 assert(std::get
<0>(OutliningCosts
).isValid() &&
1363 std::get
<1>(OutliningCosts
).isValid() && "Expected valid costs");
1365 SizeCost
= *std::get
<0>(OutliningCosts
).getValue();
1366 NonWeightedRcost
= *std::get
<1>(OutliningCosts
).getValue();
1368 // Only calculate RelativeToEntryFreq when we are doing single region
1370 BranchProbability RelativeToEntryFreq
;
1371 if (Cloner
.ClonedOI
)
1372 RelativeToEntryFreq
= getOutliningCallBBRelativeFreq(Cloner
);
1374 // RelativeToEntryFreq doesn't make sense when we have more than one
1375 // outlined call because each call will have a different relative frequency
1376 // to the entry block. We can consider using the average, but the
1377 // usefulness of that information is questionable. For now, assume we never
1378 // execute the calls to outlined functions.
1379 RelativeToEntryFreq
= BranchProbability(0, 1);
1381 WeightedRcost
= BlockFrequency(NonWeightedRcost
) * RelativeToEntryFreq
;
1383 // The call sequence(s) to the outlined function(s) are larger than the sum of
1384 // the original outlined region size(s), it does not increase the chances of
1385 // inlining the function with outlining (The inliner uses the size increase to
1386 // model the cost of inlining a callee).
1387 if (!SkipCostAnalysis
&& Cloner
.OutlinedRegionCost
< SizeCost
) {
1388 OptimizationRemarkEmitter
OrigFuncORE(Cloner
.OrigFunc
);
1391 std::tie(DLoc
, Block
) = getOneDebugLoc(*Cloner
.ClonedFunc
);
1392 OrigFuncORE
.emit([&]() {
1393 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "OutlineRegionTooSmall",
1395 << ore::NV("Function", Cloner
.OrigFunc
)
1396 << " not partially inlined into callers (Original Size = "
1397 << ore::NV("OutlinedRegionOriginalSize", Cloner
.OutlinedRegionCost
)
1398 << ", Size of call sequence to outlined function = "
1399 << ore::NV("NewSize", SizeCost
) << ")";
// At this point the clone has taken over all of the original's uses.
1404 assert(Cloner
.OrigFunc
->users().empty() &&
1405 "F's users should all be replaced!");
1407 std::vector
<User
*> Users(Cloner
.ClonedFunc
->user_begin(),
1408 Cloner
.ClonedFunc
->user_end());
1410 DenseMap
<User
*, uint64_t> CallSiteToProfCountMap
;
1411 auto CalleeEntryCount
= Cloner
.OrigFunc
->getEntryCount();
1412 if (CalleeEntryCount
)
1413 computeCallsiteToProfCountMap(Cloner
.ClonedFunc
, CallSiteToProfCountMap
);
1415 uint64_t CalleeEntryCountV
=
1416 (CalleeEntryCount
? CalleeEntryCount
.getCount() : 0);
1418 bool AnyInline
= false;
1419 for (User
*User
: Users
) {
1420 CallBase
*CB
= getSupportedCallBase(User
);
// Stop once the global partial-inlining limit is hit.
1422 if (isLimitReached())
1425 OptimizationRemarkEmitter
CallerORE(CB
->getCaller());
1426 if (!shouldPartialInline(*CB
, Cloner
, WeightedRcost
, CallerORE
))
1429 // Construct remark before doing the inlining, as after successful inlining
1430 // the callsite is removed.
1431 OptimizationRemark
OR(DEBUG_TYPE
, "PartiallyInlined", CB
);
1432 OR
<< ore::NV("Callee", Cloner
.OrigFunc
) << " partially inlined into "
1433 << ore::NV("Caller", CB
->getCaller());
1435 InlineFunctionInfo
IFI(nullptr, GetAssumptionCache
, &PSI
);
1436 // We can only forward varargs when we outlined a single region, else we
1437 // bail on vararg functions.
1438 if (!InlineFunction(*CB
, IFI
, nullptr, true,
1439 (Cloner
.ClonedOI
? Cloner
.OutlinedFunctions
.back().first
1446 // Now update the entry count:
1447 if (CalleeEntryCountV
&& CallSiteToProfCountMap
.count(User
)) {
1448 uint64_t CallSiteCount
= CallSiteToProfCountMap
[User
];
// Subtract this site's count from the callee's remaining entry count.
1449 CalleeEntryCountV
-= std::min(CalleeEntryCountV
, CallSiteCount
);
1453 NumPartialInlining
++;
// Separate statistics for single-region vs cold-region partial inlining.
1455 if (Cloner
.ClonedOI
)
1456 NumPartialInlined
++;
1458 NumColdOutlinePartialInlined
++;
// Record success so the destructor keeps the outlined functions, and write
// the adjusted entry count back to the original function.
1462 Cloner
.IsFunctionInlined
= true;
1463 if (CalleeEntryCount
)
1464 Cloner
.OrigFunc
->setEntryCount(
1465 CalleeEntryCount
.setCount(CalleeEntryCountV
));
1466 OptimizationRemarkEmitter
OrigFuncORE(Cloner
.OrigFunc
);
1467 OrigFuncORE
.emit([&]() {
1468 return OptimizationRemark(DEBUG_TYPE
, "PartiallyInlined", Cloner
.OrigFunc
)
1469 << "Partially inlined into at least one caller";
// Module driver: builds a worklist of defined, used functions and calls
// unswitchFunction on each; a successfully produced outlined function is
// pushed back onto the worklist for further processing.
// NOTE(review): extraction gaps -- original lines 1478-1479 ('return
// false;'), 1485/1490, 1492-1493 ('continue;'), 1498-1503 (the 'Recursive =
// true; break;' body and the 'if (Recursive) continue;' check), 1505 ('if
// (Result.second)') and the tail 1508-1511 ('}' / 'return Changed;' / '}')
// are missing from this paste.
1476 bool PartialInlinerImpl::run(Module
&M
) {
1477 if (DisablePartialInlining
)
1480 std::vector
<Function
*> Worklist
;
1481 Worklist
.reserve(M
.size());
// Only functions that are both defined here and actually used are candidates.
1482 for (Function
&F
: M
)
1483 if (!F
.use_empty() && !F
.isDeclaration())
1484 Worklist
.push_back(&F
);
1486 bool Changed
= false;
1487 while (!Worklist
.empty()) {
1488 Function
*CurrFunc
= Worklist
.back();
1489 Worklist
.pop_back();
// A function may have lost its last use since it was enqueued.
1491 if (CurrFunc
->use_empty())
// Detect self-recursion: a user instruction inside the function itself.
1494 bool Recursive
= false;
1495 for (User
*U
: CurrFunc
->users())
1496 if (Instruction
*I
= dyn_cast
<Instruction
>(U
))
1497 if (I
->getParent()->getParent() == CurrFunc
) {
1504 std::pair
<bool, Function
*> Result
= unswitchFunction(*CurrFunc
);
// The freshly outlined function gets a chance to be partially inlined too.
1506 Worklist
.push_back(Result
.second
);
1507 Changed
|= Result
.first
;
// Legacy pass-manager registration: pass ID definition plus the
// INITIALIZE_PASS macros declaring the analyses this pass depends on.
1513 char PartialInlinerLegacyPass::ID
= 0;
1515 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass
, "partial-inliner",
1516 "Partial Inliner", false, false)
1517 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker
)
1518 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass
)
1519 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
)
1520 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
1521 INITIALIZE_PASS_END(PartialInlinerLegacyPass
, "partial-inliner",
1522 "Partial Inliner", false, false)
// Factory for the legacy pass-manager wrapper pass.
// NOTE(review): the function's closing brace (original line 1526) is
// missing from this paste.
1524 ModulePass
*llvm::createPartialInliningPass() {
1525 return new PartialInlinerLegacyPass();
1528 PreservedAnalyses
PartialInlinerPass::run(Module
&M
,
1529 ModuleAnalysisManager
&AM
) {
1530 auto &FAM
= AM
.getResult
<FunctionAnalysisManagerModuleProxy
>(M
).getManager();
1532 auto GetAssumptionCache
= [&FAM
](Function
&F
) -> AssumptionCache
& {
1533 return FAM
.getResult
<AssumptionAnalysis
>(F
);
1536 auto LookupAssumptionCache
= [&FAM
](Function
&F
) -> AssumptionCache
* {
1537 return FAM
.getCachedResult
<AssumptionAnalysis
>(F
);
1540 auto GetBFI
= [&FAM
](Function
&F
) -> BlockFrequencyInfo
& {
1541 return FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
1544 auto GetTTI
= [&FAM
](Function
&F
) -> TargetTransformInfo
& {
1545 return FAM
.getResult
<TargetIRAnalysis
>(F
);
1548 auto GetTLI
= [&FAM
](Function
&F
) -> TargetLibraryInfo
& {
1549 return FAM
.getResult
<TargetLibraryAnalysis
>(F
);
1552 ProfileSummaryInfo
&PSI
= AM
.getResult
<ProfileSummaryAnalysis
>(M
);
1554 if (PartialInlinerImpl(GetAssumptionCache
, LookupAssumptionCache
, GetTTI
,
1555 GetTLI
, PSI
, GetBFI
)
1557 return PreservedAnalyses::none();
1558 return PreservedAnalyses::all();