1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
12 //===----------------------------------------------------------------------===//
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/CallSite.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/DiagnosticInfo.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
66 #define DEBUG_TYPE "partial-inlining"
68 STATISTIC(NumPartialInlined
,
69 "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined
, "Number of times functions with "
71 "cold outlined regions were partially "
72 "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound
,
74 "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined
,
76 "Number of cold single entry/exit regions outlined.");
78 // Command line option to disable partial-inlining. The default is false:
80 DisablePartialInlining("disable-partial-inlining", cl::init(false),
81 cl::Hidden
, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
84 static cl::opt
<bool> DisableMultiRegionPartialInline(
85 "disable-mr-partial-inlining", cl::init(false), cl::Hidden
,
86 cl::desc("Disable multi-region partial inlining"));
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
91 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden
,
92 cl::desc("Force outline regions with live exits"));
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
97 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden
,
98 cl::desc("Mark outline function calls with ColdCC"));
101 // Command line option to debug partial-inlining. The default is none:
102 static cl::opt
<bool> TracePartialInlining("trace-partial-inlining",
103 cl::init(false), cl::Hidden
,
104 cl::desc("Trace partial inlining."));
107 // This is an option used by testing:
108 static cl::opt
<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
109 cl::init(false), cl::ZeroOrMore
,
111 cl::desc("Skip Cost Analysis"));
112 // Used to determine if a cold region is worth outlining based on
113 // its inlining cost compared to the original function. Default is set at 10%.
114 // ie. if the cold region reduces the inlining cost of the original function by
116 static cl::opt
<float> MinRegionSizeRatio(
117 "min-region-size-ratio", cl::init(0.1), cl::Hidden
,
118 cl::desc("Minimum ratio comparing relative sizes of each "
119 "outline candidate and original function"));
120 // Used to tune the minimum number of execution counts needed in the predecessor
121 // block to the cold edge. ie. confidence interval.
122 static cl::opt
<unsigned>
123 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden
,
124 cl::desc("Minimum block executions to consider "
125 "its BranchProbabilityInfo valid"));
126 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
127 // if the branch probability is 10% or less, then it is deemed as 'cold'.
128 static cl::opt
<float> ColdBranchRatio(
129 "cold-branch-ratio", cl::init(0.1), cl::Hidden
,
130 cl::desc("Minimum BranchProbability to consider a region cold."));
132 static cl::opt
<unsigned> MaxNumInlineBlocks(
133 "max-num-inline-blocks", cl::init(5), cl::Hidden
,
134 cl::desc("Max number of blocks to be partially inlined"));
136 // Command line option to set the maximum number of partial inlining allowed
137 // for the module. The default value of -1 means no limit.
138 static cl::opt
<int> MaxNumPartialInlining(
139 "max-partial-inlining", cl::init(-1), cl::Hidden
, cl::ZeroOrMore
,
140 cl::desc("Max number of partial inlining. The default is unlimited"));
142 // Used only when PGO or user annotated branch data is absent. It is
143 // the least value that is used to weigh the outline region. If BFI
144 // produces larger value, the BFI value will be used.
146 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
147 cl::Hidden
, cl::ZeroOrMore
,
148 cl::desc("Relative frequency of outline region to "
151 static cl::opt
<unsigned> ExtraOutliningPenalty(
152 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden
,
153 cl::desc("A debug option to add additional penalty to the computed one."));
157 struct FunctionOutliningInfo
{
158 FunctionOutliningInfo() = default;
160 // Returns the number of blocks to be inlined including all blocks
161 // in Entries and one return block.
162 unsigned GetNumInlinedBlocks() const { return Entries
.size() + 1; }
164 // A set of blocks including the function entry that guard
165 // the region to be outlined.
166 SmallVector
<BasicBlock
*, 4> Entries
;
168 // The return block that is not included in the outlined region.
169 BasicBlock
*ReturnBlock
= nullptr;
171 // The dominating block of the region to be outlined.
172 BasicBlock
*NonReturnBlock
= nullptr;
174 // The set of blocks in Entries that that are predecessors to ReturnBlock
175 SmallVector
<BasicBlock
*, 4> ReturnBlockPreds
;
178 struct FunctionOutliningMultiRegionInfo
{
179 FunctionOutliningMultiRegionInfo()
182 // Container for outline regions
183 struct OutlineRegionInfo
{
184 OutlineRegionInfo(ArrayRef
<BasicBlock
*> Region
,
185 BasicBlock
*EntryBlock
, BasicBlock
*ExitBlock
,
186 BasicBlock
*ReturnBlock
)
187 : Region(Region
.begin(), Region
.end()), EntryBlock(EntryBlock
),
188 ExitBlock(ExitBlock
), ReturnBlock(ReturnBlock
) {}
189 SmallVector
<BasicBlock
*, 8> Region
;
190 BasicBlock
*EntryBlock
;
191 BasicBlock
*ExitBlock
;
192 BasicBlock
*ReturnBlock
;
195 SmallVector
<OutlineRegionInfo
, 4> ORI
;
// Implementation object of the partial inliner. It stores the analysis
// getters handed to its constructor and declares the routines that compute
// outlining candidates, clone/outline a function, and decide whether the
// result should be inlined at each call site.
// NOTE(review): this listing has gaps. The constructor's name line, the
// closing braces of the nested struct and helpers, and the assignments/return
// inside getCallSite are not present. Restore them before building.
198 struct PartialInlinerImpl
{
// Constructor parameter list: stores each getter/analysis in the member of
// the corresponding name below.
201 std::function
<AssumptionCache
&(Function
&)> *GetAC
,
202 function_ref
<AssumptionCache
*(Function
&)> LookupAC
,
203 std::function
<TargetTransformInfo
&(Function
&)> *GTTI
,
204 Optional
<function_ref
<BlockFrequencyInfo
&(Function
&)>> GBFI
,
205 ProfileSummaryInfo
*ProfSI
)
206 : GetAssumptionCache(GetAC
), LookupAssumptionCache(LookupAC
),
207 GetTTI(GTTI
), GetBFI(GBFI
), PSI(ProfSI
) {}
210 // Main part of the transformation that calls helper functions to find
211 // outlining candidates, clone & outline the function, and attempt to
212 // partially inline the resulting function. Returns true if
213 // inlining was successful, false otherwise. Also returns the outline
214 // function (only if we partially inlined early returns) as there is a
215 // possibility to further "peel" early return statements that were left in the
216 // outline function due to code size.
217 std::pair
<bool, Function
*> unswitchFunction(Function
*F
);
219 // This class speculatively clones the function to be partial inlined.
220 // At the end of partial inlining, the remaining callsites to the cloned
221 // function that are not partially inlined will be fixed up to reference
222 // the original function, and the cloned function will be erased.
223 struct FunctionCloner
{
224 // Two constructors, one for single region outlining, the other for
225 // multi-region outlining.
226 FunctionCloner(Function
*F
, FunctionOutliningInfo
*OI
,
227 OptimizationRemarkEmitter
&ORE
,
228 function_ref
<AssumptionCache
*(Function
&)> LookupAC
);
229 FunctionCloner(Function
*F
, FunctionOutliningMultiRegionInfo
*OMRI
,
230 OptimizationRemarkEmitter
&ORE
,
231 function_ref
<AssumptionCache
*(Function
&)> LookupAC
);
234 // Prepare for function outlining: making sure there is only
235 // one incoming edge from the extracted/outlined region to
// (presumably) the return block — the end of this comment is missing.
237 void NormalizeReturnBlock();
239 // Do function outlining for cold regions.
240 bool doMultiRegionFunctionOutlining();
241 // Do function outlining for region after early return block(s).
242 // NOTE: For vararg functions that do the vararg handling in the outlined
243 // function, we temporarily generate IR that does not properly
244 // forward varargs to the outlined function. Calling InlineFunction
245 // will update calls to the outlined functions to properly forward
// the varargs.
247 Function
*doSingleRegionFunctionOutlining();
// The function being partially inlined and its speculative clone.
249 Function
*OrigFunc
= nullptr;
250 Function
*ClonedFunc
= nullptr;
252 typedef std::pair
<Function
*, BasicBlock
*> FuncBodyCallerPair
;
253 // Keep track of Outlined Functions and the basic block they're called from.
254 SmallVector
<FuncBodyCallerPair
, 4> OutlinedFunctions
;
256 // Whether ClonedFunc has been inlined into one of its callers.
258 bool IsFunctionInlined
= false;
259 // The cost of the region to be outlined.
260 int OutlinedRegionCost
= 0;
261 // ClonedOI is specific to outlining non-early return blocks.
262 std::unique_ptr
<FunctionOutliningInfo
> ClonedOI
= nullptr;
263 // ClonedOMRI is specific to outlining cold regions.
264 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> ClonedOMRI
= nullptr;
// Block frequency info associated with ClonedFunc.
265 std::unique_ptr
<BlockFrequencyInfo
> ClonedFuncBFI
= nullptr;
266 OptimizationRemarkEmitter
&ORE
;
267 function_ref
<AssumptionCache
*(Function
&)> LookupAC
;
// Number of partial inlinings counted so far; compared against
// MaxNumPartialInlining in IsLimitReached().
271 int NumPartialInlining
= 0;
// Analysis getters and profile summary supplied through the constructor.
272 std::function
<AssumptionCache
&(Function
&)> *GetAssumptionCache
;
273 function_ref
<AssumptionCache
*(Function
&)> LookupAssumptionCache
;
274 std::function
<TargetTransformInfo
&(Function
&)> *GetTTI
;
275 Optional
<function_ref
<BlockFrequencyInfo
&(Function
&)>> GetBFI
;
276 ProfileSummaryInfo
*PSI
;
278 // Return the frequency of the outlining call BB relative to F's entry point.
279 // The result is no larger than 1 and is represented using BP.
280 // (Note that the outlined region's 'head' block can only have incoming
281 // edges from the guarding entry blocks).
282 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner
&Cloner
);
284 // Return true if the callee of CS should be partially inlined.
286 bool shouldPartialInline(CallSite CS
, FunctionCloner
&Cloner
,
287 BlockFrequency WeightedOutliningRcost
,
288 OptimizationRemarkEmitter
&ORE
);
290 // Try to inline DuplicateFunction (cloned from F with call to
291 // the OutlinedFunction) into its callers. Return true
292 // if there is any successful inlining.
293 bool tryPartialInline(FunctionCloner
&Cloner
);
295 // Compute the mapping from use site of DuplicationFunction to the enclosing
296 // BB's profile count.
297 void computeCallsiteToProfCountMap(Function
*DuplicateFunction
,
298 DenseMap
<User
*, uint64_t> &SiteCountMap
);
// True when a limit is configured (MaxNumPartialInlining != -1) and the
// running count has reached it.
300 bool IsLimitReached() {
301 return (MaxNumPartialInlining
!= -1 &&
302 NumPartialInlining
>= MaxNumPartialInlining
);
// Wraps a user of a function as a CallSite; any user that is neither a
// CallInst nor an InvokeInst hits llvm_unreachable.
// NOTE(review): the statements that build and return the CallSite are missing.
305 static CallSite
getCallSite(User
*U
) {
307 if (CallInst
*CI
= dyn_cast
<CallInst
>(U
))
309 else if (InvokeInst
*II
= dyn_cast
<InvokeInst
>(U
))
312 llvm_unreachable("All uses must be calls");
// Returns the call site of the first user of F; dereferences user_begin(),
// so F is expected to have at least one user.
316 static CallSite
getOneCallSiteTo(Function
*F
) {
317 User
*User
= *F
->user_begin();
318 return getCallSite(User
);
// Returns the debug location and parent block of one call site of F.
321 std::tuple
<DebugLoc
, BasicBlock
*> getOneDebugLoc(Function
*F
) {
322 CallSite CS
= getOneCallSiteTo(F
);
323 DebugLoc DLoc
= CS
.getInstruction()->getDebugLoc();
324 BasicBlock
*Block
= CS
.getParent();
325 return std::make_tuple(DLoc
, Block
);
328 // Returns the costs associated with function outlining:
329 // - The first value is the estimated size of the new call sequence(s) in
330 // the block(s) that call the outlined function(s);
331 // - The second value is the non-weighted runtime cost for making the call
332 // to the outlined function, including the additional setup cost in the
333 // outlined function itself.
334 std::tuple
<int, int> computeOutliningCosts(FunctionCloner
&Cloner
);
336 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
337 // approximate both the size and runtime cost (Note that in the current
338 // inline cost analysis, there is no clear distinction there either).
339 static int computeBBInlineCost(BasicBlock
*BB
);
// Candidate discovery for single-region (early return) and multi-region
// (cold region) outlining respectively.
341 std::unique_ptr
<FunctionOutliningInfo
> computeOutliningInfo(Function
*F
);
342 std::unique_ptr
<FunctionOutliningMultiRegionInfo
>
343 computeOutliningColdRegionsInfo(Function
*F
, OptimizationRemarkEmitter
&ORE
);
// Legacy pass manager wrapper around PartialInlinerImpl. It requires the
// assumption cache tracker, the profile summary and target transform info,
// wraps them in callbacks, and constructs a PartialInlinerImpl with no
// block-frequency getter (NoneType::None).
// NOTE(review): the closing braces of the lambdas and member functions, and
// the remainder of the final return expression, are missing from this listing.
346 struct PartialInlinerLegacyPass
: public ModulePass
{
347 static char ID
; // Pass identification, replacement for typeid
// Registers the pass with the global PassRegistry on construction.
349 PartialInlinerLegacyPass() : ModulePass(ID
) {
350 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
// Declares the analyses runOnModule fetches through getAnalysis.
353 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
354 AU
.addRequired
<AssumptionCacheTracker
>();
355 AU
.addRequired
<ProfileSummaryInfoWrapperPass
>();
356 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
// Fetches the required analyses and builds the per-function callbacks
// expected by PartialInlinerImpl.
359 bool runOnModule(Module
&M
) override
{
363 AssumptionCacheTracker
*ACT
= &getAnalysis
<AssumptionCacheTracker
>();
364 TargetTransformInfoWrapperPass
*TTIWP
=
365 &getAnalysis
<TargetTransformInfoWrapperPass
>();
366 ProfileSummaryInfo
*PSI
=
367 &getAnalysis
<ProfileSummaryInfoWrapperPass
>().getPSI();
// Captures the local ACT pointer by reference; the callback is only passed
// (by address) to the PartialInlinerImpl constructed below in this function.
369 std::function
<AssumptionCache
&(Function
&)> GetAssumptionCache
=
370 [&ACT
](Function
&F
) -> AssumptionCache
& {
371 return ACT
->getAssumptionCache(F
);
// Pointer-returning lookup variant (uses lookupAssumptionCache).
374 auto LookupAssumptionCache
= [ACT
](Function
&F
) -> AssumptionCache
* {
375 return ACT
->lookupAssumptionCache(F
);
378 std::function
<TargetTransformInfo
&(Function
&)> GetTTI
=
379 [&TTIWP
](Function
&F
) -> TargetTransformInfo
& {
380 return TTIWP
->getTTI(F
);
// No BFI getter is supplied here (NoneType::None).
383 return PartialInlinerImpl(&GetAssumptionCache
, LookupAssumptionCache
,
384 &GetTTI
, NoneType::None
, PSI
)
389 } // end anonymous namespace
// Scans F for cold regions that are candidates for multi-region outlining.
// Builds a DominatorTree, BranchProbabilityInfo and BlockFrequencyInfo for F,
// returns null when PSI reports no instrumentation profile, then walks the
// CFG from the entry block looking for edges whose probability does not
// exceed MinBranchProbability. The blocks dominated by such an edge's target
// are checked for single entry, single exit and sufficient inline cost before
// being appended to the result's ORI list. Returns null when no candidate was
// found.
// NOTE(review): several lines of this listing are missing (e.g. the
// declaration of `LI`, the branch choosing between ScopedBFI and GetBFI, and
// the `continue`/closing-brace lines), so the control flow shown is partial.
391 std::unique_ptr
<FunctionOutliningMultiRegionInfo
>
392 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function
*F
,
393 OptimizationRemarkEmitter
&ORE
) {
394 BasicBlock
*EntryBlock
= &F
->front();
396 DominatorTree
DT(*F
);
398 BranchProbabilityInfo
BPI(*F
, LI
);
399 std::unique_ptr
<BlockFrequencyInfo
> ScopedBFI
;
400 BlockFrequencyInfo
*BFI
;
// Two sources for BFI: one computed locally and owned by ScopedBFI, the other
// obtained through the GetBFI callback.
402 ScopedBFI
.reset(new BlockFrequencyInfo(*F
, BPI
, LI
));
403 BFI
= ScopedBFI
.get();
405 BFI
= &(*GetBFI
)(*F
);
407 // Return if we don't have profiling information.
408 if (!PSI
->hasInstrumentationProfile())
409 return std::unique_ptr
<FunctionOutliningMultiRegionInfo
>();
411 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> OutliningInfo
=
412 std::make_unique
<FunctionOutliningMultiRegionInfo
>();
// A block list qualifies as single-entry when it has more than one block and
// its first block has exactly one predecessor.
414 auto IsSingleEntry
= [](SmallVectorImpl
<BasicBlock
*> &BlockList
) {
415 BasicBlock
*Dom
= BlockList
.front();
416 return BlockList
.size() > 1 && Dom
->hasNPredecessors(1);
// Looks for blocks with a successor outside BlockList; a missed-optimization
// remark is emitted when the region has more than one exit edge.
420 [&ORE
](SmallVectorImpl
<BasicBlock
*> &BlockList
) -> BasicBlock
* {
421 BasicBlock
*ExitBlock
= nullptr;
422 for (auto *Block
: BlockList
) {
423 for (auto SI
= succ_begin(Block
); SI
!= succ_end(Block
); ++SI
) {
424 if (!is_contained(BlockList
, *SI
)) {
427 return OptimizationRemarkMissed(DEBUG_TYPE
, "MultiExitRegion",
429 << "Region dominated by "
430 << ore::NV("Block", BlockList
.front()->getName())
431 << " has more than one region exit edge.";
// Profile count of BB as reported by BFI (the fallback arm of the conditional
// is missing from this listing).
442 auto BBProfileCount
= [BFI
](BasicBlock
*BB
) {
443 return BFI
->getBlockProfileCount(BB
)
444 ? BFI
->getBlockProfileCount(BB
).getValue()
448 // Use the same computeBBInlineCost function to compute the cost savings of
449 // the outlining the candidate region.
450 int OverallFunctionCost
= 0;
452 OverallFunctionCost
+= computeBBInlineCost(&BB
);
455 if (TracePartialInlining
)
456 dbgs() << "OverallFunctionCost = " << OverallFunctionCost
<< "\n";
// Smallest region cost worth outlining: MinRegionSizeRatio of the whole
// function's cost.
458 int MinOutlineRegionCost
=
459 static_cast<int>(OverallFunctionCost
* MinRegionSizeRatio
);
// ColdBranchRatio expressed as a fraction with MinBlockCounterExecution as
// the denominator.
460 BranchProbability
MinBranchProbability(
461 static_cast<int>(ColdBranchRatio
* MinBlockCounterExecution
),
462 MinBlockCounterExecution
);
463 bool ColdCandidateFound
= false;
464 BasicBlock
*CurrEntry
= EntryBlock
;
465 std::vector
<BasicBlock
*> DFS
;
466 DenseMap
<BasicBlock
*, bool> VisitedMap
;
467 DFS
.push_back(CurrEntry
);
468 VisitedMap
[CurrEntry
] = true;
469 // Use Depth First Search on the basic blocks to find CFG edges that are
// considered cold.
471 // Cold regions considered must also have its inline cost compared to the
472 // overall inline cost of the original function. The region is outlined only
473 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
// more.
475 while (!DFS
.empty()) {
476 auto *thisBB
= DFS
.back();
478 // Only consider regions with predecessor blocks that are considered
479 // not-cold (default: part of the top 99.99% of all block counters)
480 // AND greater than our minimum block execution count (default: 100).
481 if (PSI
->isColdBlock(thisBB
, BFI
) ||
482 BBProfileCount(thisBB
) < MinBlockCounterExecution
)
484 for (auto SI
= succ_begin(thisBB
); SI
!= succ_end(thisBB
); ++SI
) {
487 VisitedMap
[*SI
] = true;
489 // If branch isn't cold, we skip to the next one.
490 BranchProbability SuccProb
= BPI
.getEdgeProbability(thisBB
, *SI
);
491 if (SuccProb
> MinBranchProbability
)
494 if (TracePartialInlining
) {
495 dbgs() << "Found cold edge: " << thisBB
->getName() << "->"
496 << (*SI
)->getName() << "\nBranch Probability = " << SuccProb
// Candidate region: every block dominated by the cold edge's target.
500 SmallVector
<BasicBlock
*, 8> DominateVector
;
501 DT
.getDescendants(*SI
, DominateVector
);
502 // We can only outline single entry regions (for now).
503 if (!IsSingleEntry(DominateVector
))
505 BasicBlock
*ExitBlock
= nullptr;
506 // We can only outline single exit regions (for now).
507 if (!(ExitBlock
= IsSingleExit(DominateVector
)))
509 int OutlineRegionCost
= 0;
510 for (auto *BB
: DominateVector
)
511 OutlineRegionCost
+= computeBBInlineCost(BB
);
514 if (TracePartialInlining
)
515 dbgs() << "OutlineRegionCost = " << OutlineRegionCost
<< "\n";
// Regions cheaper than the minimum are reported with a "TooCostly" analysis
// remark.
518 if (OutlineRegionCost
< MinOutlineRegionCost
) {
520 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "TooCostly",
522 << ore::NV("Callee", F
) << " inline cost-savings smaller than "
523 << ore::NV("Cost", MinOutlineRegionCost
);
527 // For now, ignore blocks that belong to a SISE region that is a
528 // candidate for outlining. In the future, we may want to look
529 // at inner regions because the outer region may have live-exit
// variables.
531 for (auto *BB
: DominateVector
)
532 VisitedMap
[BB
] = true;
533 // ReturnBlock here means the block after the outline call
// NOTE(review): the null check below is commented out, so ReturnBlock is
// stored without validation.
534 BasicBlock
*ReturnBlock
= ExitBlock
->getSingleSuccessor();
535 // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
536 FunctionOutliningMultiRegionInfo::OutlineRegionInfo
RegInfo(
537 DominateVector
, DominateVector
.front(), ExitBlock
, ReturnBlock
);
538 OutliningInfo
->ORI
.push_back(RegInfo
);
540 if (TracePartialInlining
) {
541 dbgs() << "Found Cold Candidate starting at block: "
542 << DominateVector
.front()->getName() << "\n";
545 ColdCandidateFound
= true;
546 NumColdRegionsFound
++;
// Hand back the collected regions only if at least one was recorded.
549 if (ColdCandidateFound
)
550 return OutliningInfo
;
552 return std::unique_ptr
<FunctionOutliningMultiRegionInfo
>();
// Computes the single-region ("early return") outlining candidate for F: the
// chain of guarding entry blocks (Entries), the ReturnBlock they share, and
// the NonReturnBlock heading the region to be outlined. Returns null when the
// entry block does not end in a conditional branch or when the candidate
// fails CheckAndNormalizeCandidate.
// NOTE(review): many lines of this listing are missing (loop headers,
// `break`/`return` statements, closing braces), so the control flow shown is
// partial.
555 std::unique_ptr
<FunctionOutliningInfo
>
556 PartialInlinerImpl::computeOutliningInfo(Function
*F
) {
557 BasicBlock
*EntryBlock
= &F
->front();
558 BranchInst
*BR
= dyn_cast
<BranchInst
>(EntryBlock
->getTerminator());
559 if (!BR
|| BR
->isUnconditional())
560 return std::unique_ptr
<FunctionOutliningInfo
>();
562 // Returns true if Succ is BB's successor
563 auto IsSuccessor
= [](BasicBlock
*Succ
, BasicBlock
*BB
) {
564 return is_contained(successors(BB
), Succ
);
// A block counts as a return block when its terminator is a ReturnInst.
567 auto IsReturnBlock
= [](BasicBlock
*BB
) {
568 Instruction
*TI
= BB
->getTerminator();
569 return isa
<ReturnInst
>(TI
);
// Orders the two successors as (return block, other block); yields
// (nullptr, nullptr) when neither is a return block.
572 auto GetReturnBlock
= [&](BasicBlock
*Succ1
, BasicBlock
*Succ2
) {
573 if (IsReturnBlock(Succ1
))
574 return std::make_tuple(Succ1
, Succ2
);
575 if (IsReturnBlock(Succ2
))
576 return std::make_tuple(Succ2
, Succ1
);
578 return std::make_tuple
<BasicBlock
*, BasicBlock
*>(nullptr, nullptr);
581 // Detect a triangular shape:
// returns (common successor, other successor) when one of the two blocks is
// itself a successor of the other, else (nullptr, nullptr).
582 auto GetCommonSucc
= [&](BasicBlock
*Succ1
, BasicBlock
*Succ2
) {
583 if (IsSuccessor(Succ1
, Succ2
))
584 return std::make_tuple(Succ1
, Succ2
);
585 if (IsSuccessor(Succ2
, Succ1
))
586 return std::make_tuple(Succ2
, Succ1
);
588 return std::make_tuple
<BasicBlock
*, BasicBlock
*>(nullptr, nullptr);
591 std::unique_ptr
<FunctionOutliningInfo
> OutliningInfo
=
592 std::make_unique
<FunctionOutliningInfo
>();
594 BasicBlock
*CurrEntry
= EntryBlock
;
595 bool CandidateFound
= false;
597 // The number of blocks to be inlined has already reached
598 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
599 // disables partial inlining for the function.
600 if (OutliningInfo
->GetNumInlinedBlocks() >= MaxNumInlineBlocks
)
// Only two-way branches are considered as guarding entries.
603 if (succ_size(CurrEntry
) != 2)
606 BasicBlock
*Succ1
= *succ_begin(CurrEntry
);
607 BasicBlock
*Succ2
= *(succ_begin(CurrEntry
) + 1);
609 BasicBlock
*ReturnBlock
, *NonReturnBlock
;
610 std::tie(ReturnBlock
, NonReturnBlock
) = GetReturnBlock(Succ1
, Succ2
);
// Record the candidate: current entry plus its return / non-return
// successors.
613 OutliningInfo
->Entries
.push_back(CurrEntry
);
614 OutliningInfo
->ReturnBlock
= ReturnBlock
;
615 OutliningInfo
->NonReturnBlock
= NonReturnBlock
;
616 CandidateFound
= true;
620 BasicBlock
*CommSucc
;
621 BasicBlock
*OtherSucc
;
622 std::tie(CommSucc
, OtherSucc
) = GetCommonSucc(Succ1
, Succ2
);
// Triangle case: keep the current entry and continue from the other
// successor.
627 OutliningInfo
->Entries
.push_back(CurrEntry
);
628 CurrEntry
= OtherSucc
;
632 return std::unique_ptr
<FunctionOutliningInfo
>();
634 // Do sanity check of the entries: there should not
635 // be any successors (not in the entry set) other than
636 // {ReturnBlock, NonReturnBlock}
637 assert(OutliningInfo
->Entries
[0] == &F
->front() &&
638 "Function Entry must be the first in Entries vector");
639 DenseSet
<BasicBlock
*> Entries
;
640 for (BasicBlock
*E
: OutliningInfo
->Entries
)
643 // Returns true if BB has a predecessor which is not
// in the Entries set.
// NOTE(review): Entries is captured by value here, so blocks added later via
// Entries.insert(Cand) are not visible to this lambda — confirm intended.
645 auto HasNonEntryPred
= [Entries
](BasicBlock
*BB
) {
646 for (auto Pred
: predecessors(BB
)) {
647 if (!Entries
.count(Pred
))
// Validates the entry set and records which entries branch directly to
// ReturnBlock (ReturnBlockPreds).
652 auto CheckAndNormalizeCandidate
=
653 [Entries
, HasNonEntryPred
](FunctionOutliningInfo
*OutliningInfo
) {
654 for (BasicBlock
*E
: OutliningInfo
->Entries
) {
655 for (auto Succ
: successors(E
)) {
656 if (Entries
.count(Succ
))
658 if (Succ
== OutliningInfo
->ReturnBlock
)
659 OutliningInfo
->ReturnBlockPreds
.push_back(E
);
660 else if (Succ
!= OutliningInfo
->NonReturnBlock
)
663 // There should not be any outside incoming edges either:
664 if (HasNonEntryPred(E
))
670 if (!CheckAndNormalizeCandidate(OutliningInfo
.get()))
671 return std::unique_ptr
<FunctionOutliningInfo
>();
673 // Now further growing the candidate's inlining region by
674 // peeling off dominating blocks from the outlining region:
675 while (OutliningInfo
->GetNumInlinedBlocks() < MaxNumInlineBlocks
) {
676 BasicBlock
*Cand
= OutliningInfo
->NonReturnBlock
;
677 if (succ_size(Cand
) != 2)
680 if (HasNonEntryPred(Cand
))
683 BasicBlock
*Succ1
= *succ_begin(Cand
);
684 BasicBlock
*Succ2
= *(succ_begin(Cand
) + 1);
686 BasicBlock
*ReturnBlock
, *NonReturnBlock
;
687 std::tie(ReturnBlock
, NonReturnBlock
) = GetReturnBlock(Succ1
, Succ2
);
// The peeled block must return through the same ReturnBlock as the candidate.
688 if (!ReturnBlock
|| ReturnBlock
!= OutliningInfo
->ReturnBlock
)
// The new region head must be reachable only from Cand.
691 if (NonReturnBlock
->getSinglePredecessor() != Cand
)
694 // Now grow and update OutliningInfo:
695 OutliningInfo
->Entries
.push_back(Cand
);
696 OutliningInfo
->NonReturnBlock
= NonReturnBlock
;
697 OutliningInfo
->ReturnBlockPreds
.push_back(Cand
);
698 Entries
.insert(Cand
);
701 return OutliningInfo
;
704 // Check if there is PGO data or user annotated branch data:
// first asks F itself, then inspects the conditional branch terminating each
// block in OI->Entries for profile metadata.
// NOTE(review): the return statements, the `continue`, and the declaration of
// the variables passed to extractProfMetadata are missing from this listing.
705 static bool hasProfileData(Function
*F
, FunctionOutliningInfo
*OI
) {
706 if (F
->hasProfileData())
708 // Now check if any of the entry block has MD_prof data:
709 for (auto *E
: OI
->Entries
) {
710 BranchInst
*BR
= dyn_cast
<BranchInst
>(E
->getTerminator());
// Only conditional branches are inspected.
711 if (!BR
|| BR
->isUnconditional())
// NOTE(review): `T` is not declared in the visible lines and `F` would name
// the Function parameter unless a dropped local shadows it — confirm.
714 if (BR
->extractProfMetadata(T
, F
))
721 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner
&Cloner
) {
722 BasicBlock
*OutliningCallBB
= Cloner
.OutlinedFunctions
.back().second
;
724 Cloner
.ClonedFuncBFI
->getBlockFreq(&Cloner
.ClonedFunc
->getEntryBlock());
725 auto OutliningCallFreq
=
726 Cloner
.ClonedFuncBFI
->getBlockFreq(OutliningCallBB
);
727 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
728 // we outlined any regions, so we may encounter situations where the
729 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
730 if (OutliningCallFreq
.getFrequency() > EntryFreq
.getFrequency()) {
731 OutliningCallFreq
= EntryFreq
;
733 auto OutlineRegionRelFreq
= BranchProbability::getBranchProbability(
734 OutliningCallFreq
.getFrequency(), EntryFreq
.getFrequency());
736 if (hasProfileData(Cloner
.OrigFunc
, Cloner
.ClonedOI
.get()))
737 return OutlineRegionRelFreq
;
739 // When profile data is not available, we need to be conservative in
740 // estimating the overall savings. Static branch prediction can usually
741 // guess the branch direction right (taken/non-taken), but the guessed
742 // branch probability is usually not biased enough. In case when the
743 // outlined region is predicted to be likely, its probability needs
744 // to be made higher (more biased) to not under-estimate the cost of
745 // function outlining. On the other hand, if the outlined region
746 // is predicted to be less likely, the predicted probablity is usually
747 // higher than the actual. For instance, the actual probability of the
748 // less likely target is only 5%, but the guessed probablity can be
749 // 40%. In the latter case, there is no need for further adjustement.
750 // FIXME: add an option for this.
751 if (OutlineRegionRelFreq
< BranchProbability(45, 100))
752 return OutlineRegionRelFreq
;
754 OutlineRegionRelFreq
= std::max(
755 OutlineRegionRelFreq
, BranchProbability(OutlineRegionFreqPercent
, 100));
757 return OutlineRegionRelFreq
;
// Decides whether the call site CS (whose callee must be Cloner.ClonedFunc)
// should be partially inlined. With SkipCostAnalysis set, the answer is just
// isInlineViable(*Callee). Otherwise the inline cost of the call is computed
// and remarks are emitted for the "always", "never" and "too costly"
// outcomes, the saving from removing the call is compared with
// WeightedOutliningRcost, and a final remark reports that the callee can be
// partially inlined.
// NOTE(review): the conditions guarding each remark, the ORE.emit wrappers
// and the return statements are missing from this listing.
760 bool PartialInlinerImpl::shouldPartialInline(
761 CallSite CS
, FunctionCloner
&Cloner
,
762 BlockFrequency WeightedOutliningRcost
,
763 OptimizationRemarkEmitter
&ORE
) {
766 Instruction
*Call
= CS
.getInstruction();
767 Function
*Callee
= CS
.getCalledFunction();
768 assert(Callee
== Cloner
.ClonedFunc
);
// Testing shortcut: skip the cost model entirely.
770 if (SkipCostAnalysis
)
771 return isInlineViable(*Callee
);
773 Function
*Caller
= CS
.getCaller();
774 auto &CalleeTTI
= (*GetTTI
)(*Callee
);
775 bool RemarksEnabled
=
776 Callee
->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
778 assert(Call
&& "invalid callsite for partial inline");
// ORE is only forwarded to the cost analysis when remarks are enabled.
779 InlineCost IC
= getInlineCost(cast
<CallBase
>(*Call
), getInlineParams(),
780 CalleeTTI
, *GetAssumptionCache
, GetBFI
, PSI
,
781 RemarksEnabled
? &ORE
: nullptr);
// Remark: the callee should be fully inlined instead.
785 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "AlwaysInline", Call
)
786 << NV("Callee", Cloner
.OrigFunc
)
787 << " should always be fully inlined, not partially";
// Remark: the callee must never be inlined.
794 return OptimizationRemarkMissed(DEBUG_TYPE
, "NeverInline", Call
)
795 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
796 << NV("Caller", Caller
)
797 << " because it should never be inlined (cost=never)";
// Remark: inline cost exceeds the threshold (threshold is reported as
// getCostDelta() + getCost()).
804 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "TooCostly", Call
)
805 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
806 << NV("Caller", Caller
) << " because too costly to inline (cost="
807 << NV("Cost", IC
.getCost()) << ", threshold="
808 << NV("Threshold", IC
.getCostDelta() + IC
.getCost()) << ")";
812 const DataLayout
&DL
= Caller
->getParent()->getDataLayout();
814 // The savings of eliminating the call:
815 int NonWeightedSavings
= getCallsiteCost(cast
<CallBase
>(*Call
), DL
);
816 BlockFrequency
NormWeightedSavings(NonWeightedSavings
);
818 // Weighted saving is smaller than weighted cost, return false
819 if (NormWeightedSavings
< WeightedOutliningRcost
) {
821 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "OutliningCallcostTooHigh",
823 << NV("Callee", Cloner
.OrigFunc
) << " not partially inlined into "
824 << NV("Caller", Caller
) << " runtime overhead (overhead="
825 << NV("Overhead", (unsigned)WeightedOutliningRcost
.getFrequency())
827 << NV("Savings", (unsigned)NormWeightedSavings
.getFrequency())
829 << " of making the outlined call is too high";
// Remark: all checks passed.
836 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "CanBePartiallyInlined", Call
)
837 << NV("Callee", Cloner
.OrigFunc
) << " can be partially inlined into "
838 << NV("Caller", Caller
) << " with cost=" << NV("Cost", IC
.getCost())
840 << NV("Threshold", IC
.getCostDelta() + IC
.getCost()) << ")";
845 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
846 // For now use a simplified version. The returned 'InlineCost' will be used
847 // to estimate the size cost as well as runtime cost of the BB.
// Walks the non-debug instructions of BB: call and invoke instructions add
// getCallsiteCost, a switch adds (number of cases + 1) * InstrCost, and other
// counted instructions add one InstrCost.
// NOTE(review): the declaration of the InlineCost accumulator, the
// `continue`/`break`/`default` lines and the final return are missing from
// this listing.
848 int PartialInlinerImpl::computeBBInlineCost(BasicBlock
*BB
) {
850 const DataLayout
&DL
= BB
->getParent()->getParent()->getDataLayout();
851 for (Instruction
&I
: BB
->instructionsWithoutDebug()) {
852 // Skip free instructions.
853 switch (I
.getOpcode()) {
854 case Instruction::BitCast
:
855 case Instruction::PtrToInt
:
856 case Instruction::IntToPtr
:
857 case Instruction::Alloca
:
858 case Instruction::PHI
:
// GEPs are special-cased on whether all their indices are zero.
860 case Instruction::GetElementPtr
:
861 if (cast
<GetElementPtrInst
>(&I
)->hasAllZeroIndices())
// Lifetime start/end markers get their own check.
868 if (I
.isLifetimeStartOrEnd())
871 if (CallInst
*CI
= dyn_cast
<CallInst
>(&I
)) {
872 InlineCost
+= getCallsiteCost(*CI
, DL
);
876 if (InvokeInst
*II
= dyn_cast
<InvokeInst
>(&I
)) {
877 InlineCost
+= getCallsiteCost(*II
, DL
);
881 if (SwitchInst
*SI
= dyn_cast
<SwitchInst
>(&I
)) {
882 InlineCost
+= (SI
->getNumCases() + 1) * InlineConstants::InstrCost
;
// Everything else costs one instruction.
885 InlineCost
+= InlineConstants::InstrCost
;
891 PartialInlinerImpl::computeOutliningCosts(FunctionCloner
&Cloner
) {
892 int OutliningFuncCallCost
= 0, OutlinedFunctionCost
= 0;
893 for (auto FuncBBPair
: Cloner
.OutlinedFunctions
) {
894 Function
*OutlinedFunc
= FuncBBPair
.first
;
895 BasicBlock
* OutliningCallBB
= FuncBBPair
.second
;
896 // Now compute the cost of the call sequence to the outlined function
897 // 'OutlinedFunction' in BB 'OutliningCallBB':
898 OutliningFuncCallCost
+= computeBBInlineCost(OutliningCallBB
);
900 // Now compute the cost of the extracted/outlined function itself:
901 for (BasicBlock
&BB
: *OutlinedFunc
)
902 OutlinedFunctionCost
+= computeBBInlineCost(&BB
);
904 assert(OutlinedFunctionCost
>= Cloner
.OutlinedRegionCost
&&
905 "Outlined function cost should be no less than the outlined region");
907 // The code extractor introduces a new root and exit stub blocks with
908 // additional unconditional branches. Those branches will be eliminated
909 // later with bb layout. The cost should be adjusted accordingly:
910 OutlinedFunctionCost
-=
911 2 * InlineConstants::InstrCost
* Cloner
.OutlinedFunctions
.size();
913 int OutliningRuntimeOverhead
=
914 OutliningFuncCallCost
+
915 (OutlinedFunctionCost
- Cloner
.OutlinedRegionCost
) +
916 ExtraOutliningPenalty
;
918 return std::make_tuple(OutliningFuncCallCost
, OutliningRuntimeOverhead
);
921 // Create the callsite to profile count map which is
922 // used to update the original function's entry count,
923 // after the function is partially inlined into the callsite.
924 void PartialInlinerImpl::computeCallsiteToProfCountMap(
925 Function
*DuplicateFunction
,
926 DenseMap
<User
*, uint64_t> &CallSiteToProfCountMap
) {
927 std::vector
<User
*> Users(DuplicateFunction
->user_begin(),
928 DuplicateFunction
->user_end());
929 Function
*CurrentCaller
= nullptr;
930 std::unique_ptr
<BlockFrequencyInfo
> TempBFI
;
931 BlockFrequencyInfo
*CurrentCallerBFI
= nullptr;
933 auto ComputeCurrBFI
= [&,this](Function
*Caller
) {
934 // For the old pass manager:
936 DominatorTree
DT(*Caller
);
938 BranchProbabilityInfo
BPI(*Caller
, LI
);
939 TempBFI
.reset(new BlockFrequencyInfo(*Caller
, BPI
, LI
));
940 CurrentCallerBFI
= TempBFI
.get();
943 CurrentCallerBFI
= &(*GetBFI
)(*Caller
);
947 for (User
*User
: Users
) {
948 CallSite CS
= getCallSite(User
);
949 Function
*Caller
= CS
.getCaller();
950 if (CurrentCaller
!= Caller
) {
951 CurrentCaller
= Caller
;
952 ComputeCurrBFI(Caller
);
954 assert(CurrentCallerBFI
&& "CallerBFI is not set");
956 BasicBlock
*CallBB
= CS
.getInstruction()->getParent();
957 auto Count
= CurrentCallerBFI
->getBlockProfileCount(CallBB
);
959 CallSiteToProfCountMap
[User
] = *Count
;
961 CallSiteToProfCountMap
[User
] = 0;
965 PartialInlinerImpl::FunctionCloner::FunctionCloner(
966 Function
*F
, FunctionOutliningInfo
*OI
, OptimizationRemarkEmitter
&ORE
,
967 function_ref
<AssumptionCache
*(Function
&)> LookupAC
)
968 : OrigFunc(F
), ORE(ORE
), LookupAC(LookupAC
) {
969 ClonedOI
= std::make_unique
<FunctionOutliningInfo
>();
971 // Clone the function, so that we can hack away on it.
972 ValueToValueMapTy VMap
;
973 ClonedFunc
= CloneFunction(F
, VMap
);
975 ClonedOI
->ReturnBlock
= cast
<BasicBlock
>(VMap
[OI
->ReturnBlock
]);
976 ClonedOI
->NonReturnBlock
= cast
<BasicBlock
>(VMap
[OI
->NonReturnBlock
]);
977 for (BasicBlock
*BB
: OI
->Entries
) {
978 ClonedOI
->Entries
.push_back(cast
<BasicBlock
>(VMap
[BB
]));
980 for (BasicBlock
*E
: OI
->ReturnBlockPreds
) {
981 BasicBlock
*NewE
= cast
<BasicBlock
>(VMap
[E
]);
982 ClonedOI
->ReturnBlockPreds
.push_back(NewE
);
984 // Go ahead and update all uses to the duplicate, so that we can just
985 // use the inliner functionality when we're done hacking.
986 F
->replaceAllUsesWith(ClonedFunc
);
989 PartialInlinerImpl::FunctionCloner::FunctionCloner(
990 Function
*F
, FunctionOutliningMultiRegionInfo
*OI
,
991 OptimizationRemarkEmitter
&ORE
,
992 function_ref
<AssumptionCache
*(Function
&)> LookupAC
)
993 : OrigFunc(F
), ORE(ORE
), LookupAC(LookupAC
) {
994 ClonedOMRI
= std::make_unique
<FunctionOutliningMultiRegionInfo
>();
996 // Clone the function, so that we can hack away on it.
997 ValueToValueMapTy VMap
;
998 ClonedFunc
= CloneFunction(F
, VMap
);
1000 // Go through all Outline Candidate Regions and update all BasicBlock
1002 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo
:
1004 SmallVector
<BasicBlock
*, 8> Region
;
1005 for (BasicBlock
*BB
: RegionInfo
.Region
) {
1006 Region
.push_back(cast
<BasicBlock
>(VMap
[BB
]));
1008 BasicBlock
*NewEntryBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.EntryBlock
]);
1009 BasicBlock
*NewExitBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.ExitBlock
]);
1010 BasicBlock
*NewReturnBlock
= nullptr;
1011 if (RegionInfo
.ReturnBlock
)
1012 NewReturnBlock
= cast
<BasicBlock
>(VMap
[RegionInfo
.ReturnBlock
]);
1013 FunctionOutliningMultiRegionInfo::OutlineRegionInfo
MappedRegionInfo(
1014 Region
, NewEntryBlock
, NewExitBlock
, NewReturnBlock
);
1015 ClonedOMRI
->ORI
.push_back(MappedRegionInfo
);
1017 // Go ahead and update all uses to the duplicate, so that we can just
1018 // use the inliner functionality when we're done hacking.
1019 F
->replaceAllUsesWith(ClonedFunc
);
1022 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1023 auto getFirstPHI
= [](BasicBlock
*BB
) {
1024 BasicBlock::iterator I
= BB
->begin();
1025 PHINode
*FirstPhi
= nullptr;
1026 while (I
!= BB
->end()) {
1027 PHINode
*Phi
= dyn_cast
<PHINode
>(I
);
1038 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1043 // Special hackery is needed with PHI nodes that have inputs from more than
1044 // one extracted block. For simplicity, just split the PHIs into a two-level
1045 // sequence of PHIs, some of which will go in the extracted region, and some
1046 // of which will go outside.
1047 BasicBlock
*PreReturn
= ClonedOI
->ReturnBlock
;
1048 // only split block when necessary:
1049 PHINode
*FirstPhi
= getFirstPHI(PreReturn
);
1050 unsigned NumPredsFromEntries
= ClonedOI
->ReturnBlockPreds
.size();
1052 if (!FirstPhi
|| FirstPhi
->getNumIncomingValues() <= NumPredsFromEntries
+ 1)
1055 auto IsTrivialPhi
= [](PHINode
*PN
) -> Value
* {
1056 Value
*CommonValue
= PN
->getIncomingValue(0);
1057 if (all_of(PN
->incoming_values(),
1058 [&](Value
*V
) { return V
== CommonValue
; }))
1063 ClonedOI
->ReturnBlock
= ClonedOI
->ReturnBlock
->splitBasicBlock(
1064 ClonedOI
->ReturnBlock
->getFirstNonPHI()->getIterator());
1065 BasicBlock::iterator I
= PreReturn
->begin();
1066 Instruction
*Ins
= &ClonedOI
->ReturnBlock
->front();
1067 SmallVector
<Instruction
*, 4> DeadPhis
;
1068 while (I
!= PreReturn
->end()) {
1069 PHINode
*OldPhi
= dyn_cast
<PHINode
>(I
);
1074 PHINode::Create(OldPhi
->getType(), NumPredsFromEntries
+ 1, "", Ins
);
1075 OldPhi
->replaceAllUsesWith(RetPhi
);
1076 Ins
= ClonedOI
->ReturnBlock
->getFirstNonPHI();
1078 RetPhi
->addIncoming(&*I
, PreReturn
);
1079 for (BasicBlock
*E
: ClonedOI
->ReturnBlockPreds
) {
1080 RetPhi
->addIncoming(OldPhi
->getIncomingValueForBlock(E
), E
);
1081 OldPhi
->removeIncomingValue(E
);
1084 // After incoming values splitting, the old phi may become trivial.
1085 // Keeping the trivial phi can introduce definition inside the outline
1086 // region which is live-out, causing necessary overhead (load, store
1087 // arg passing etc).
1088 if (auto *OldPhiVal
= IsTrivialPhi(OldPhi
)) {
1089 OldPhi
->replaceAllUsesWith(OldPhiVal
);
1090 DeadPhis
.push_back(OldPhi
);
1094 for (auto *DP
: DeadPhis
)
1095 DP
->eraseFromParent();
1097 for (auto E
: ClonedOI
->ReturnBlockPreds
) {
1098 E
->getTerminator()->replaceUsesOfWith(PreReturn
, ClonedOI
->ReturnBlock
);
1102 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1104 auto ComputeRegionCost
= [](SmallVectorImpl
<BasicBlock
*> &Region
) {
1106 for (BasicBlock
* BB
: Region
)
1107 Cost
+= computeBBInlineCost(BB
);
1111 assert(ClonedOMRI
&& "Expecting OutlineInfo for multi region outline");
1113 if (ClonedOMRI
->ORI
.empty())
1116 // The CodeExtractor needs a dominator tree.
1118 DT
.recalculate(*ClonedFunc
);
1120 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1122 BranchProbabilityInfo
BPI(*ClonedFunc
, LI
);
1123 ClonedFuncBFI
.reset(new BlockFrequencyInfo(*ClonedFunc
, BPI
, LI
));
1125 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1126 CodeExtractorAnalysisCache
CEAC(*ClonedFunc
);
1128 SetVector
<Value
*> Inputs
, Outputs
, Sinks
;
1129 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo
:
1131 int CurrentOutlinedRegionCost
= ComputeRegionCost(RegionInfo
.Region
);
1133 CodeExtractor
CE(RegionInfo
.Region
, &DT
, /*AggregateArgs*/ false,
1134 ClonedFuncBFI
.get(), &BPI
,
1135 LookupAC(*RegionInfo
.EntryBlock
->getParent()),
1136 /* AllowVarargs */ false);
1138 CE
.findInputsOutputs(Inputs
, Outputs
, Sinks
);
1141 if (TracePartialInlining
) {
1142 dbgs() << "inputs: " << Inputs
.size() << "\n";
1143 dbgs() << "outputs: " << Outputs
.size() << "\n";
1144 for (Value
*value
: Inputs
)
1145 dbgs() << "value used in func: " << *value
<< "\n";
1146 for (Value
*output
: Outputs
)
1147 dbgs() << "instr used in func: " << *output
<< "\n";
1150 // Do not extract regions that have live exit variables.
1151 if (Outputs
.size() > 0 && !ForceLiveExit
)
1154 Function
*OutlinedFunc
= CE
.extractCodeRegion(CEAC
);
1157 CallSite OCS
= PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc
);
1158 BasicBlock
*OutliningCallBB
= OCS
.getInstruction()->getParent();
1159 assert(OutliningCallBB
->getParent() == ClonedFunc
);
1160 OutlinedFunctions
.push_back(std::make_pair(OutlinedFunc
,OutliningCallBB
));
1161 NumColdRegionsOutlined
++;
1162 OutlinedRegionCost
+= CurrentOutlinedRegionCost
;
1164 if (MarkOutlinedColdCC
) {
1165 OutlinedFunc
->setCallingConv(CallingConv::Cold
);
1166 OCS
.setCallingConv(CallingConv::Cold
);
1170 return OptimizationRemarkMissed(DEBUG_TYPE
, "ExtractFailed",
1171 &RegionInfo
.Region
.front()->front())
1172 << "Failed to extract region at block "
1173 << ore::NV("Block", RegionInfo
.Region
.front());
1177 return !OutlinedFunctions
.empty();
1181 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1182 // Returns true if the block is to be partial inlined into the caller
1183 // (i.e. not to be extracted to the out of line function)
1184 auto ToBeInlined
= [&, this](BasicBlock
*BB
) {
1185 return BB
== ClonedOI
->ReturnBlock
||
1186 (std::find(ClonedOI
->Entries
.begin(), ClonedOI
->Entries
.end(), BB
) !=
1187 ClonedOI
->Entries
.end());
1190 assert(ClonedOI
&& "Expecting OutlineInfo for single region outline");
1191 // The CodeExtractor needs a dominator tree.
1193 DT
.recalculate(*ClonedFunc
);
1195 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1197 BranchProbabilityInfo
BPI(*ClonedFunc
, LI
);
1198 ClonedFuncBFI
.reset(new BlockFrequencyInfo(*ClonedFunc
, BPI
, LI
));
1200 // Gather up the blocks that we're going to extract.
1201 std::vector
<BasicBlock
*> ToExtract
;
1202 ToExtract
.push_back(ClonedOI
->NonReturnBlock
);
1203 OutlinedRegionCost
+=
1204 PartialInlinerImpl::computeBBInlineCost(ClonedOI
->NonReturnBlock
);
1205 for (BasicBlock
&BB
: *ClonedFunc
)
1206 if (!ToBeInlined(&BB
) && &BB
!= ClonedOI
->NonReturnBlock
) {
1207 ToExtract
.push_back(&BB
);
1208 // FIXME: the code extractor may hoist/sink more code
1209 // into the outlined function which may make the outlining
1210 // overhead (the difference of the outlined function cost
1211 // and OutliningRegionCost) look larger.
1212 OutlinedRegionCost
+= computeBBInlineCost(&BB
);
1215 // Extract the body of the if.
1216 CodeExtractorAnalysisCache
CEAC(*ClonedFunc
);
1217 Function
*OutlinedFunc
=
1218 CodeExtractor(ToExtract
, &DT
, /*AggregateArgs*/ false,
1219 ClonedFuncBFI
.get(), &BPI
, LookupAC(*ClonedFunc
),
1220 /* AllowVarargs */ true)
1221 .extractCodeRegion(CEAC
);
1224 BasicBlock
*OutliningCallBB
=
1225 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc
)
1228 assert(OutliningCallBB
->getParent() == ClonedFunc
);
1229 OutlinedFunctions
.push_back(std::make_pair(OutlinedFunc
, OutliningCallBB
));
1232 return OptimizationRemarkMissed(DEBUG_TYPE
, "ExtractFailed",
1233 &ToExtract
.front()->front())
1234 << "Failed to extract region at block "
1235 << ore::NV("Block", ToExtract
.front());
1238 return OutlinedFunc
;
1241 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1242 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1243 // users (function pointers, etc.) back to the original function.
1244 ClonedFunc
->replaceAllUsesWith(OrigFunc
);
1245 ClonedFunc
->eraseFromParent();
1246 if (!IsFunctionInlined
) {
1247 // Remove each function that was speculatively created if there is no
1249 for (auto FuncBBPair
: OutlinedFunctions
) {
1250 Function
*Func
= FuncBBPair
.first
;
1251 Func
->eraseFromParent();
1256 std::pair
<bool, Function
*> PartialInlinerImpl::unswitchFunction(Function
*F
) {
1258 if (F
->hasAddressTaken())
1259 return {false, nullptr};
1261 // Let inliner handle it
1262 if (F
->hasFnAttribute(Attribute::AlwaysInline
))
1263 return {false, nullptr};
1265 if (F
->hasFnAttribute(Attribute::NoInline
))
1266 return {false, nullptr};
1268 if (PSI
->isFunctionEntryCold(F
))
1269 return {false, nullptr};
1271 if (F
->users().empty())
1272 return {false, nullptr};
1274 OptimizationRemarkEmitter
ORE(F
);
1276 // Only try to outline cold regions if we have a profile summary, which
1277 // implies we have profiling information.
1278 if (PSI
->hasProfileSummary() && F
->hasProfileData() &&
1279 !DisableMultiRegionPartialInline
) {
1280 std::unique_ptr
<FunctionOutliningMultiRegionInfo
> OMRI
=
1281 computeOutliningColdRegionsInfo(F
, ORE
);
1283 FunctionCloner
Cloner(F
, OMRI
.get(), ORE
, LookupAssumptionCache
);
1286 if (TracePartialInlining
) {
1287 dbgs() << "HotCountThreshold = " << PSI
->getHotCountThreshold() << "\n";
1288 dbgs() << "ColdCountThreshold = " << PSI
->getColdCountThreshold()
1292 bool DidOutline
= Cloner
.doMultiRegionFunctionOutlining();
1296 if (TracePartialInlining
) {
1297 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1298 Cloner
.ClonedFunc
->print(dbgs());
1299 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1303 if (tryPartialInline(Cloner
))
1304 return {true, nullptr};
1309 // Fall-thru to regular partial inlining if we:
1310 // i) can't find any cold regions to outline, or
1311 // ii) can't inline the outlined function anywhere.
1312 std::unique_ptr
<FunctionOutliningInfo
> OI
= computeOutliningInfo(F
);
1314 return {false, nullptr};
1316 FunctionCloner
Cloner(F
, OI
.get(), ORE
, LookupAssumptionCache
);
1317 Cloner
.NormalizeReturnBlock();
1319 Function
*OutlinedFunction
= Cloner
.doSingleRegionFunctionOutlining();
1321 if (!OutlinedFunction
)
1322 return {false, nullptr};
1324 bool AnyInline
= tryPartialInline(Cloner
);
1327 return {true, OutlinedFunction
};
1329 return {false, nullptr};
1332 bool PartialInlinerImpl::tryPartialInline(FunctionCloner
&Cloner
) {
1333 if (Cloner
.OutlinedFunctions
.empty())
1337 BlockFrequency WeightedRcost
;
1338 int NonWeightedRcost
;
1339 std::tie(SizeCost
, NonWeightedRcost
) = computeOutliningCosts(Cloner
);
1341 // Only calculate RelativeToEntryFreq when we are doing single region
1343 BranchProbability RelativeToEntryFreq
;
1344 if (Cloner
.ClonedOI
) {
1345 RelativeToEntryFreq
= getOutliningCallBBRelativeFreq(Cloner
);
1347 // RelativeToEntryFreq doesn't make sense when we have more than one
1348 // outlined call because each call will have a different relative frequency
1349 // to the entry block. We can consider using the average, but the
1350 // usefulness of that information is questionable. For now, assume we never
1351 // execute the calls to outlined functions.
1352 RelativeToEntryFreq
= BranchProbability(0, 1);
1354 WeightedRcost
= BlockFrequency(NonWeightedRcost
) * RelativeToEntryFreq
;
1356 // The call sequence(s) to the outlined function(s) are larger than the sum of
1357 // the original outlined region size(s), it does not increase the chances of
1358 // inlining the function with outlining (The inliner uses the size increase to
1359 // model the cost of inlining a callee).
1360 if (!SkipCostAnalysis
&& Cloner
.OutlinedRegionCost
< SizeCost
) {
1361 OptimizationRemarkEmitter
OrigFuncORE(Cloner
.OrigFunc
);
1364 std::tie(DLoc
, Block
) = getOneDebugLoc(Cloner
.ClonedFunc
);
1365 OrigFuncORE
.emit([&]() {
1366 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "OutlineRegionTooSmall",
1368 << ore::NV("Function", Cloner
.OrigFunc
)
1369 << " not partially inlined into callers (Original Size = "
1370 << ore::NV("OutlinedRegionOriginalSize", Cloner
.OutlinedRegionCost
)
1371 << ", Size of call sequence to outlined function = "
1372 << ore::NV("NewSize", SizeCost
) << ")";
1377 assert(Cloner
.OrigFunc
->users().empty() &&
1378 "F's users should all be replaced!");
1380 std::vector
<User
*> Users(Cloner
.ClonedFunc
->user_begin(),
1381 Cloner
.ClonedFunc
->user_end());
1383 DenseMap
<User
*, uint64_t> CallSiteToProfCountMap
;
1384 auto CalleeEntryCount
= Cloner
.OrigFunc
->getEntryCount();
1385 if (CalleeEntryCount
)
1386 computeCallsiteToProfCountMap(Cloner
.ClonedFunc
, CallSiteToProfCountMap
);
1388 uint64_t CalleeEntryCountV
=
1389 (CalleeEntryCount
? CalleeEntryCount
.getCount() : 0);
1391 bool AnyInline
= false;
1392 for (User
*User
: Users
) {
1393 CallSite CS
= getCallSite(User
);
1395 if (IsLimitReached())
1398 OptimizationRemarkEmitter
CallerORE(CS
.getCaller());
1399 if (!shouldPartialInline(CS
, Cloner
, WeightedRcost
, CallerORE
))
1402 // Construct remark before doing the inlining, as after successful inlining
1403 // the callsite is removed.
1404 OptimizationRemark
OR(DEBUG_TYPE
, "PartiallyInlined", CS
.getInstruction());
1405 OR
<< ore::NV("Callee", Cloner
.OrigFunc
) << " partially inlined into "
1406 << ore::NV("Caller", CS
.getCaller());
1408 InlineFunctionInfo
IFI(nullptr, GetAssumptionCache
, PSI
);
1409 // We can only forward varargs when we outlined a single region, else we
1410 // bail on vararg functions.
1411 if (!InlineFunction(CS
, IFI
, nullptr, true,
1412 (Cloner
.ClonedOI
? Cloner
.OutlinedFunctions
.back().first
1418 // Now update the entry count:
1419 if (CalleeEntryCountV
&& CallSiteToProfCountMap
.count(User
)) {
1420 uint64_t CallSiteCount
= CallSiteToProfCountMap
[User
];
1421 CalleeEntryCountV
-= std::min(CalleeEntryCountV
, CallSiteCount
);
1425 NumPartialInlining
++;
1427 if (Cloner
.ClonedOI
)
1428 NumPartialInlined
++;
1430 NumColdOutlinePartialInlined
++;
1435 Cloner
.IsFunctionInlined
= true;
1436 if (CalleeEntryCount
)
1437 Cloner
.OrigFunc
->setEntryCount(
1438 CalleeEntryCount
.setCount(CalleeEntryCountV
));
1439 OptimizationRemarkEmitter
OrigFuncORE(Cloner
.OrigFunc
);
1440 OrigFuncORE
.emit([&]() {
1441 return OptimizationRemark(DEBUG_TYPE
, "PartiallyInlined", Cloner
.OrigFunc
)
1442 << "Partially inlined into at least one caller";
1450 bool PartialInlinerImpl::run(Module
&M
) {
1451 if (DisablePartialInlining
)
1454 std::vector
<Function
*> Worklist
;
1455 Worklist
.reserve(M
.size());
1456 for (Function
&F
: M
)
1457 if (!F
.use_empty() && !F
.isDeclaration())
1458 Worklist
.push_back(&F
);
1460 bool Changed
= false;
1461 while (!Worklist
.empty()) {
1462 Function
*CurrFunc
= Worklist
.back();
1463 Worklist
.pop_back();
1465 if (CurrFunc
->use_empty())
1468 bool Recursive
= false;
1469 for (User
*U
: CurrFunc
->users())
1470 if (Instruction
*I
= dyn_cast
<Instruction
>(U
))
1471 if (I
->getParent()->getParent() == CurrFunc
) {
1478 std::pair
<bool, Function
* > Result
= unswitchFunction(CurrFunc
);
1480 Worklist
.push_back(Result
.second
);
1481 Changed
|= Result
.first
;
// Pass identification storage for the legacy pass.
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass under the name "partial-inliner" and declare the
// analyses listed between BEGIN and END as its dependencies.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
/// Factory for the legacy partial inliner pass. Returns a newly allocated
/// PartialInlinerLegacyPass as a raw pointer.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1501 PreservedAnalyses
PartialInlinerPass::run(Module
&M
,
1502 ModuleAnalysisManager
&AM
) {
1503 auto &FAM
= AM
.getResult
<FunctionAnalysisManagerModuleProxy
>(M
).getManager();
1505 std::function
<AssumptionCache
&(Function
&)> GetAssumptionCache
=
1506 [&FAM
](Function
&F
) -> AssumptionCache
& {
1507 return FAM
.getResult
<AssumptionAnalysis
>(F
);
1510 auto LookupAssumptionCache
= [&FAM
](Function
&F
) -> AssumptionCache
* {
1511 return FAM
.getCachedResult
<AssumptionAnalysis
>(F
);
1514 std::function
<BlockFrequencyInfo
&(Function
&)> GetBFI
=
1515 [&FAM
](Function
&F
) -> BlockFrequencyInfo
& {
1516 return FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
1519 std::function
<TargetTransformInfo
&(Function
&)> GetTTI
=
1520 [&FAM
](Function
&F
) -> TargetTransformInfo
& {
1521 return FAM
.getResult
<TargetIRAnalysis
>(F
);
1524 ProfileSummaryInfo
*PSI
= &AM
.getResult
<ProfileSummaryAnalysis
>(M
);
1526 if (PartialInlinerImpl(&GetAssumptionCache
, LookupAssumptionCache
, &GetTTI
,
1529 return PreservedAnalyses::none();
1530 return PreservedAnalyses::all();