//===- FunctionSpecialization.cpp - Function Specialization --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/FunctionSpecialization.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/SCCPSolver.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <numeric>

using namespace llvm;

#define DEBUG_TYPE "function-specialization"
STATISTIC(NumSpecsCreated, "Number of specializations created");
static cl::opt<bool> ForceSpecialization(
    "force-specialization", cl::init(false), cl::Hidden,
    cl::desc(
        "Force function specialization for every call site with a constant "
        "argument"));
static cl::opt<unsigned> MaxClones(
    "funcspec-max-clones", cl::init(3), cl::Hidden,
    cl::desc(
        "The maximum number of clones allowed for a single function "
        "specialization"));
static cl::opt<unsigned> MaxIncomingPhiValues(
    "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden,
    cl::desc(
        "The maximum number of incoming values a PHI node can have to be "
        "considered during the specialization bonus estimation"));
static cl::opt<unsigned> MaxBlockPredecessors(
    "funcspec-max-block-predecessors", cl::init(2), cl::Hidden,
    cl::desc(
        "The maximum number of predecessors a basic block can have to be "
        "considered during the estimation of dead code"));
static cl::opt<unsigned> MinFunctionSize(
    "funcspec-min-function-size", cl::init(300), cl::Hidden,
    cl::desc(
        "Don't specialize functions that have less than this number of "
        "instructions"));
static cl::opt<unsigned> MaxCodeSizeGrowth(
    "funcspec-max-codesize-growth", cl::init(3), cl::Hidden,
    cl::desc("Maximum codesize growth allowed per function"));
static cl::opt<unsigned> MinCodeSizeSavings(
    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden,
    cl::desc("Reject specializations whose codesize savings are less than this "
             "much percent of the original function size"));
static cl::opt<unsigned> MinLatencySavings(
    "funcspec-min-latency-savings", cl::init(70), cl::Hidden,
    cl::desc("Reject specializations whose latency savings are less than this "
             "much percent of the original function size"));
static cl::opt<unsigned> MinInliningBonus(
    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden,
    cl::desc("Reject specializations whose inlining bonus is less than this "
             "much percent of the original function size"));
static cl::opt<bool> SpecializeOnAddress(
    "funcspec-on-address", cl::init(false), cl::Hidden,
    cl::desc("Enable function specialization on the address of global values"));
// Disabled by default as it can significantly increase compilation times.
//
// https://llvm-compile-time-tracker.com
// https://github.com/nikic/llvm-compile-time-tracker
static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable specialization of functions that take a literal constant as an "
        "argument"));
bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                            DenseSet<BasicBlock *> &DeadBlocks) {
  unsigned I = 0;
  return all_of(predecessors(Succ),
                [&I, BB, Succ, &DeadBlocks](BasicBlock *Pred) {
                  return I++ < MaxBlockPredecessors &&
                         (Pred == BB || Pred == Succ ||
                          DeadBlocks.contains(Pred));
                });
}
// Estimates the codesize savings due to dead code after constant propagation.
// \p WorkList represents the basic blocks of a specialization which will
// eventually become dead once we replace instructions that are known to be
// constants. The successors of such blocks are added to the list as long as
// the \p Solver found they were executable prior to specialization, and only
// if all their predecessors are dead.
Cost InstCostVisitor::estimateBasicBlocks(
    SmallVectorImpl<BasicBlock *> &WorkList) {
  Cost CodeSize = 0;
  // Accumulate the instruction cost of each basic block weighted by frequency.
  while (!WorkList.empty()) {
    BasicBlock *BB = WorkList.pop_back_val();

    // These blocks are considered dead as far as the InstCostVisitor
    // is concerned. They haven't been proven dead yet by the Solver,
    // but may become dead if we propagate the specialization arguments.
    if (!DeadBlocks.insert(BB).second)
      continue;

    for (Instruction &I : *BB) {
      // Disregard SSA copies.
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::ssa_copy)
          continue;
      // If it's a known constant we have already accounted for it.
      if (KnownConstants.contains(&I))
        continue;

      Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);

      LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
                        << " for user " << I << "\n");
      CodeSize += C;
    }

    // Keep adding dead successors to the list as long as they are
    // executable and only reachable from dead blocks.
    for (BasicBlock *SuccBB : successors(BB))
      if (isBlockExecutable(SuccBB) &&
          canEliminateSuccessor(BB, SuccBB, DeadBlocks))
        WorkList.push_back(SuccBB);
  }
  return CodeSize;
}
static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
  if (auto *C = dyn_cast<Constant>(V))
    return C;
  return KnownConstants.lookup(V);
}
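
// Revisit the PHI nodes that could not be folded when first seen, typically
// because some of their incoming values were not yet known constants.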
Bonus InstCostVisitor::getBonusFromPendingPHIs() {
  Bonus B;
  while (!PendingPHIs.empty()) {
    Instruction *Phi = PendingPHIs.pop_back_val();
    // The pending PHIs could have been proven dead by now.
    if (isBlockExecutable(Phi->getParent()))
      B += getUserBonus(Phi);
  }
  return B;
}
/// Compute a bonus for replacing argument \p A with constant \p C.
Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
                    << C->getNameOrAsOperand() << "\n");
  Bonus B;
  for (auto *U : A->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, A, C);

  LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
                    << B.CodeSize << ", Latency = " << B.Latency
                    << "} for argument " << *A << "\n");
  return B;
}
Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use,
                                    Constant *C) {
  // We have already propagated a constant for this user.
  if (KnownConstants.contains(User))
    return {0, 0};

  // Cache the iterator before visiting.
  LastVisited = Use ? KnownConstants.insert({Use, C}).first
                    : KnownConstants.end();

  Cost CodeSize = 0;
  if (auto *I = dyn_cast<SwitchInst>(User)) {
    CodeSize = estimateSwitchInst(*I);
  } else if (auto *I = dyn_cast<BranchInst>(User)) {
    CodeSize = estimateBranchInst(*I);
  } else {
    C = visit(*User);
    if (!C)
      return {0, 0};
  }

  // Even though it doesn't make sense to bind switch and branch instructions
  // with a constant, unlike any other instruction type, it prevents estimating
  // their bonus multiple times.
  KnownConstants.insert({User, C});

  CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);

  uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
                    BFI.getEntryFreq();

  Cost Latency =
      Weight * TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);

  LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
                    << ", Latency = " << Latency << "} for user "
                    << *User << "\n");

  Bonus B(CodeSize, Latency);
  for (auto *U : User->users())
    if (auto *UI = dyn_cast<Instruction>(U))
      if (UI != User && isBlockExecutable(UI->getParent()))
        B += getUserBonus(UI, User, C);

  return B;
}
Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  auto *C = dyn_cast<ConstantInt>(LastVisited->second);
  if (!C)
    return 0;

  BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor();
  // Initialize the worklist with the dead basic blocks. These are the
  // destination labels which are different from the one corresponding
  // to \p C. They should be executable and have a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  for (const auto &Case : I.cases()) {
    BasicBlock *BB = Case.getCaseSuccessor();
    if (BB != Succ && isBlockExecutable(BB) &&
        canEliminateSuccessor(I.getParent(), BB, DeadBlocks))
      WorkList.push_back(BB);
  }

  return estimateBasicBlocks(WorkList);
}
Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return 0;

  BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue());
  // Initialize the worklist with the dead successor as long as
  // it is executable and has a unique predecessor.
  SmallVector<BasicBlock *> WorkList;
  if (isBlockExecutable(Succ) &&
      canEliminateSuccessor(I.getParent(), Succ, DeadBlocks))
    WorkList.push_back(Succ);

  return estimateBasicBlocks(WorkList);
}
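
// A PHI node folds to a constant only if all live incoming values (ignoring
// self-references and values flowing in from dead blocks) agree on the same
// constant. PHIs with inputs that are still unknown are deferred to
// PendingPHIs and revisited later.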
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
  if (I.getNumIncomingValues() > MaxIncomingPhiValues)
    return nullptr;

  bool Inserted = VisitedPHIs.insert(&I).second;
  Constant *Const = nullptr;

  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
    Value *V = I.getIncomingValue(Idx);
    if (auto *Inst = dyn_cast<Instruction>(V))
      if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
        continue;
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C) {
      if (Inserted)
        PendingPHIs.push_back(&I);
      return nullptr;
    }
    if (!Const)
      Const = C;
    else if (C != Const)
      return nullptr;
  }
  return Const;
}
Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isGuaranteedNotToBeUndefOrPoison(LastVisited->second))
    return LastVisited->second;
  return nullptr;
}
Constant *InstCostVisitor::visitCallBase(CallBase &I) {
  Function *F = I.getCalledFunction();
  if (!F || !canConstantFoldCallTo(&I, F))
    return nullptr;

  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldCall(&I, F, Ops);
}
Constant *InstCostVisitor::visitLoadInst(LoadInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (isa<ConstantPointerNull>(LastVisited->second))
    return nullptr;
  return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL);
}
Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  SmallVector<Constant *, 8> Operands;
  Operands.reserve(I.getNumOperands());

  for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) {
    Value *V = I.getOperand(Idx);
    Constant *C = findConstantFor(V, KnownConstants);
    if (!C)
      return nullptr;
    Operands.push_back(C);
  }

  auto Ops = ArrayRef(Operands.begin(), Operands.end());
  return ConstantFoldInstOperands(&I, Ops, DL);
}
Constant *InstCostVisitor::visitSelectInst(SelectInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  if (I.getCondition() != LastVisited->first)
    return nullptr;

  Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue()
                                                : I.getTrueValue();
  Constant *C = findConstantFor(V, KnownConstants);
  return C;
}
Constant *InstCostVisitor::visitCastInst(CastInst &I) {
  return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second,
                                 I.getType(), DL);
}
Constant *InstCostVisitor::visitCmpInst(CmpInst &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return Swap ?
        ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL)
      : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL);
}
Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL);
}
Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) {
  assert(LastVisited != KnownConstants.end() && "Invalid iterator!");

  bool Swap = I.getOperand(1) == LastVisited->first;
  Value *V = Swap ? I.getOperand(0) : I.getOperand(1);
  Constant *Other = findConstantFor(V, KnownConstants);
  if (!Other)
    return nullptr;

  Constant *Const = LastVisited->second;
  return dyn_cast_or_null<Constant>(Swap ?
        simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL))
      : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL)));
}
Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
                                                   CallInst *Call) {
  Value *StoreValue = nullptr;
  for (auto *User : Alloca->users()) {
    // We can't use llvm::isAllocaPromotable() as that would fail because of
    // the usage in the CallInst, which is what we check here.
    if (User == Call)
      continue;
    if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
      if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
        return nullptr;
      continue;
    }

    if (auto *Store = dyn_cast<StoreInst>(User)) {
      // This is a duplicate store, bail out.
      if (StoreValue || Store->isVolatile())
        return nullptr;
      StoreValue = Store->getValueOperand();
      continue;
    }
    // Bail if there is any other unknown usage.
    return nullptr;
  }

  if (!StoreValue)
    return nullptr;

  return getCandidateConstant(StoreValue);
}
// A constant stack value is an AllocaInst that has a single constant
// value stored to it. Return this constant if such an alloca stack value
// is a function argument.
Constant *FunctionSpecializer::getConstantStackValue(CallInst *Call,
                                                     Value *Val) {
  if (!Val)
    return nullptr;
  Val = Val->stripPointerCasts();
  if (auto *ConstVal = dyn_cast<ConstantInt>(Val))
    return ConstVal;
  auto *Alloca = dyn_cast<AllocaInst>(Val);
  if (!Alloca || !Alloca->getAllocatedType()->isIntegerTy())
    return nullptr;
  return getPromotableAlloca(Alloca, Call);
}
// To support specializing recursive functions, it is important to propagate
// constant arguments because after a first iteration of specialisation, a
// reduced example may look like this:
//
//     define internal void @RecursiveFn(i32* arg1) {
//       %temp = alloca i32, align 4
//       store i32 2, i32* %temp, align 4
//       call void @RecursiveFn.1(i32* nonnull %temp)
//       ret void
//     }
//
// Before the next iteration, we need to propagate the constant as shown
// below, which allows further specialization in subsequent iterations.
//
//     @funcspec.arg = internal constant i32 2
//
//     define internal void @someFunc(i32* arg1) {
//       call void @otherFunc(i32* nonnull @funcspec.arg)
//       ret void
//     }
// See if there are any new constant values for the callers of \p F via
// stack variables and promote them to global variables.
void FunctionSpecializer::promoteConstantStackValues(Function *F) {
  for (User *U : F->users()) {

    auto *Call = dyn_cast<CallInst>(U);
    if (!Call)
      continue;

    if (!Solver.isBlockExecutable(Call->getParent()))
      continue;

    for (const Use &U : Call->args()) {
      unsigned Idx = Call->getArgOperandNo(&U);
      Value *ArgOp = Call->getArgOperand(Idx);
      Type *ArgOpType = ArgOp->getType();

      if (!Call->onlyReadsMemory(Idx) || !ArgOpType->isPointerTy())
        continue;

      auto *ConstVal = getConstantStackValue(Call, ArgOp);
      if (!ConstVal)
        continue;

      Value *GV = new GlobalVariable(M, ConstVal->getType(), true,
                                     GlobalValue::InternalLinkage, ConstVal,
                                     "specialized.arg." + Twine(++NGlobals));
      if (ArgOpType != ConstVal->getType())
        GV = ConstantExpr::getBitCast(cast<Constant>(GV), ArgOpType);

      Call->setArgOperand(Idx, GV);
    }
  }
}
// ssa_copy intrinsics are introduced by the SCCP solver. These intrinsics
// interfere with the promoteConstantStackValues() optimization.
static void removeSSACopy(Function &F) {
  for (BasicBlock &BB : F) {
    for (Instruction &Inst : llvm::make_early_inc_range(BB)) {
      auto *II = dyn_cast<IntrinsicInst>(&Inst);
      if (!II)
        continue;
      if (II->getIntrinsicID() != Intrinsic::ssa_copy)
        continue;
      Inst.replaceAllUsesWith(II->getOperand(0));
      Inst.eraseFromParent();
    }
  }
}
/// Remove any ssa_copy intrinsics that may have been introduced.
void FunctionSpecializer::cleanUpSSA() {
  for (Function *F : Specializations)
    removeSSACopy(*F);
}
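
// DenseMap traits for specialisation signatures: ~0U and ~1U serve as the
// reserved empty and tombstone keys and must never compare equal to a real
// signature.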
template <> struct llvm::DenseMapInfo<SpecSig> {
  static inline SpecSig getEmptyKey() { return {~0U, {}}; }

  static inline SpecSig getTombstoneKey() { return {~1U, {}}; }

  static unsigned getHashValue(const SpecSig &S) {
    return static_cast<unsigned>(hash_value(S));
  }

  static bool isEqual(const SpecSig &LHS, const SpecSig &RHS) {
    return LHS == RHS;
  }
};
FunctionSpecializer::~FunctionSpecializer() {
  LLVM_DEBUG(
      if (NumSpecsCreated > 0)
        dbgs() << "FnSpecialization: Created " << NumSpecsCreated
               << " specializations in module " << M.getName() << "\n");
  // Eliminate dead code.
  removeDeadFunctions();
  cleanUpSSA();
}
/// Attempt to specialize functions in the module to enable constant
/// propagation across function boundaries.
///
/// \returns true if at least one function is specialized.
bool FunctionSpecializer::run() {
  // Find possible specializations for each function.
  SpecMap SM;
  SmallVector<Spec, 32> AllSpecs;
  unsigned NumCandidates = 0;
  for (Function &F : M) {
    if (!isCandidateFunction(&F))
      continue;

    auto [It, Inserted] = FunctionMetrics.try_emplace(&F);
    CodeMetrics &Metrics = It->second;
    // Analyze the function.
    if (Inserted) {
      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics::collectEphemeralValues(&F, &GetAC(F), EphValues);
      for (BasicBlock &BB : F)
        Metrics.analyzeBasicBlock(&BB, GetTTI(F), EphValues);
    }

    // If the code metrics reveal that we shouldn't duplicate the function,
    // or if the code size implies that this function is easy to get inlined,
    // then we shouldn't specialize it.
    if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
        (!ForceSpecialization && !F.hasFnAttribute(Attribute::NoInline) &&
         Metrics.NumInsts < MinFunctionSize))
      continue;

    // TODO: For now only consider recursive functions when running multiple
    // times. This should change if specialization on literal constants gets
    // enabled.
    if (!Inserted && !Metrics.isRecursive && !SpecializeLiteralConstant)
      continue;

    int64_t Sz = *Metrics.NumInsts.getValue();
    assert(Sz > 0 && "CodeSize should be positive");
    // It is safe to down cast from int64_t, NumInsts is always positive.
    unsigned FuncSize = static_cast<unsigned>(Sz);

    LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
                      << F.getName() << " is " << FuncSize << "\n");

    if (Inserted && Metrics.isRecursive)
      promoteConstantStackValues(&F);

    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
      LLVM_DEBUG(
          dbgs() << "FnSpecialization: No possible specializations found for "
                 << F.getName() << "\n");
      continue;
    }

    ++NumCandidates;
  }

  if (!NumCandidates) {
    LLVM_DEBUG(
        dbgs()
        << "FnSpecialization: No possible specializations found in module\n");
    return false;
  }
  // Choose the most profitable specialisations that fit in the module
  // specialization budget, which is derived from the maximum number of
  // specializations allowed per candidate function.
  auto CompareScore = [&AllSpecs](unsigned I, unsigned J) {
    return AllSpecs[I].Score > AllSpecs[J].Score;
  };
  const unsigned NSpecs =
      std::min(NumCandidates * MaxClones, unsigned(AllSpecs.size()));
  SmallVector<unsigned> BestSpecs(NSpecs + 1);
  std::iota(BestSpecs.begin(), BestSpecs.begin() + NSpecs, 0);
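  // The block below keeps the NSpecs highest-scoring candidates without fully
  // sorting AllSpecs: BestSpecs acts as a heap whose top is the lowest score
  // kept so far; each remaining candidate is pushed into the spare slot and
  // the current minimum popped back out.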
  if (AllSpecs.size() > NSpecs) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Number of candidates exceeds "
                      << "the maximum number of clones threshold.\n"
                      << "FnSpecialization: Specializing the "
                      << NSpecs
                      << " most profitable candidates.\n");
    std::make_heap(BestSpecs.begin(), BestSpecs.begin() + NSpecs, CompareScore);
    for (unsigned I = NSpecs, N = AllSpecs.size(); I < N; ++I) {
      BestSpecs[NSpecs] = I;
      std::push_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
      std::pop_heap(BestSpecs.begin(), BestSpecs.end(), CompareScore);
    }
  }

  LLVM_DEBUG(dbgs() << "FnSpecialization: List of specializations \n";
             for (unsigned I = 0; I < NSpecs; ++I) {
               const Spec &S = AllSpecs[BestSpecs[I]];
               dbgs() << "FnSpecialization: Function " << S.F->getName()
                      << " , score " << S.Score << "\n";
               for (const ArgInfo &Arg : S.Sig.Args)
                 dbgs() << "FnSpecialization: FormalArg = "
                        << Arg.Formal->getNameOrAsOperand()
                        << ", ActualArg = " << Arg.Actual->getNameOrAsOperand()
                        << "\n";
             });
  // Create the chosen specializations.
  SmallPtrSet<Function *, 8> OriginalFuncs;
  SmallVector<Function *> Clones;
  for (unsigned I = 0; I < NSpecs; ++I) {
    Spec &S = AllSpecs[BestSpecs[I]];
    S.Clone = createSpecialization(S.F, S.Sig);

    // Update the known call sites to call the clone.
    for (CallBase *Call : S.CallSites) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *Call
                        << " to call " << S.Clone->getName() << "\n");
      Call->setCalledFunction(S.Clone);
    }

    Clones.push_back(S.Clone);
    OriginalFuncs.insert(S.F);
  }
  Solver.solveWhileResolvedUndefsIn(Clones);

  // Update the rest of the call sites - these are the recursive calls, calls
  // to discarded specialisations, and calls that may match a specialisation
  // after the solver runs.
  for (Function *F : OriginalFuncs) {
    auto [Begin, End] = SM[F];
    updateCallSites(F, AllSpecs.begin() + Begin, AllSpecs.begin() + End);
  }

  for (Function *F : Clones) {
    if (F->getReturnType()->isVoidTy())
      continue;
    if (F->getReturnType()->isStructTy()) {
      auto *STy = cast<StructType>(F->getReturnType());
      if (!Solver.isStructLatticeConstant(F, STy))
        continue;
    } else {
      auto It = Solver.getTrackedRetVals().find(F);
      assert(It != Solver.getTrackedRetVals().end() &&
             "Return value ought to be tracked");
      if (SCCPSolver::isOverdefined(It->second))
        continue;
    }
    for (User *U : F->users()) {
      if (auto *CS = dyn_cast<CallBase>(U)) {
        // The user instruction does not call our function.
        if (CS->getCalledFunction() != F)
          continue;
        Solver.resetLatticeValueFor(CS);
      }
    }
  }

  // Rerun the solver to notify the users of the modified call sites.
  Solver.solveWhileResolvedUndefs();

  for (Function *F : OriginalFuncs)
    if (FunctionMetrics[F].isRecursive)
      promoteConstantStackValues(F);

  return true;
}
void FunctionSpecializer::removeDeadFunctions() {
  for (Function *F : FullySpecialized) {
    LLVM_DEBUG(dbgs() << "FnSpecialization: Removing dead function "
                      << F->getName() << "\n");
    if (FAM)
      FAM->clear(*F, F->getName());
    F->eraseFromParent();
  }
  FullySpecialized.clear();
}
/// Clone the function \p F and remove the ssa_copy intrinsics added by
/// the SCCPSolver in the cloned version.
static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) {
  ValueToValueMapTy Mappings;
  Function *Clone = CloneFunction(F, Mappings);
  Clone->setName(F->getName() + ".specialized." + Twine(NSpecs));
  removeSSACopy(*Clone);
  return Clone;
}
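
/// Scan the call sites of \p F, build a specialisation signature from the
/// constant arguments of each call, deduplicate the signatures, and append
/// the candidates deemed profitable to \p AllSpecs.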
bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                              SmallVectorImpl<Spec> &AllSpecs,
                                              SpecMap &SM) {
  // A mapping from a specialisation signature to the index of the respective
  // entry in the AllSpecs array. Used to ensure uniqueness of
  // specialisations.
  DenseMap<SpecSig, unsigned> UniqueSpecs;

  // Get a list of interesting arguments.
  SmallVector<Argument *> Args;
  for (Argument &Arg : F->args())
    if (isArgumentInteresting(&Arg))
      Args.push_back(&Arg);

  if (Args.empty())
    return false;

  for (User *U : F->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto &CS = *cast<CallBase>(U);

    // The user instruction does not call our function.
    if (CS.getCalledFunction() != F)
      continue;

    // If the call site has attribute minsize set, that call site won't be
    // specialized.
    if (CS.hasFnAttr(Attribute::MinSize))
      continue;

    // If the parent of the call site will never be executed, we don't need
    // to worry about the passed value.
    if (!Solver.isBlockExecutable(CS.getParent()))
      continue;

    // Examine arguments and create a specialisation candidate from the
    // constant operands of this call site.
    SpecSig S;
    for (Argument *A : Args) {
      Constant *C = getCandidateConstant(CS.getArgOperand(A->getArgNo()));
      if (!C)
        continue;
      LLVM_DEBUG(dbgs() << "FnSpecialization: Found interesting argument "
                        << A->getName() << " : " << C->getNameOrAsOperand()
                        << "\n");
      S.Args.push_back({A, C});
    }

    if (S.Args.empty())
      continue;

    // Check if we have encountered the same specialisation already.
    if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) {
      // Existing specialisation. Add the call to the list to rewrite, unless
      // it's a recursive call. A specialisation generated because of a
      // recursive call may end up as not the best specialisation for all
      // the cloned instances of this call, which result from specialising
      // functions. Hence we don't rewrite the call directly, but match it with
      // the best specialisation once all specialisations are known.
      if (CS.getFunction() == F)
        continue;
      const unsigned Index = It->second;
      AllSpecs[Index].CallSites.push_back(&CS);
    } else {
      // Calculate the specialisation gain.
      Bonus B;
      unsigned Score = 0;
      InstCostVisitor Visitor = getInstCostVisitorFor(F);
      for (ArgInfo &A : S.Args) {
        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
        Score += getInliningBonus(A.Formal, A.Actual);
      }
      B += Visitor.getBonusFromPendingPHIs();

      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                        << B.CodeSize << ", Latency = " << B.Latency
                        << ", Inlining = " << Score << "}\n");

      FunctionGrowth[F] += FuncSize - B.CodeSize;
      auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
                             unsigned FuncGrowth) -> bool {
        // No check required.
        if (ForceSpecialization)
          return true;
        // Minimum inlining bonus.
        if (Score > MinInliningBonus * FuncSize / 100)
          return true;
        // Minimum codesize savings.
        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
          return false;
        // Minimum latency savings.
        if (B.Latency < MinLatencySavings * FuncSize / 100)
          return false;
        // Maximum codesize growth.
        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
          return false;
        return true;
      };

      // Discard unprofitable specialisations.
      if (!IsProfitable(B, Score, FuncSize, FunctionGrowth[F]))
        continue;

      // Create a new specialisation entry.
      Score += std::max(B.CodeSize, B.Latency);
      auto &Spec = AllSpecs.emplace_back(F, S, Score);
      if (CS.getFunction() != F)
        Spec.CallSites.push_back(&CS);
      const unsigned Index = AllSpecs.size() - 1;
      UniqueSpecs[S] = Index;
      if (auto [It, Inserted] = SM.try_emplace(F, Index, Index + 1); !Inserted)
        It->second.second = Index + 1;
    }
  }

  return !UniqueSpecs.empty();
}
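
/// Apply cheap structural filters before any cost modelling: only defined,
/// reachable functions that take arguments and are not being optimised for
/// size are worth considering.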
bool FunctionSpecializer::isCandidateFunction(Function *F) {
  if (F->isDeclaration() || F->arg_empty())
    return false;

  if (F->hasFnAttribute(Attribute::NoDuplicate))
    return false;

  // Do not specialize the cloned function again.
  if (Specializations.contains(F))
    return false;

  // If we're optimizing the function for size, we shouldn't specialize it.
  if (F->hasOptSize() ||
      shouldOptimizeForSize(F, nullptr, nullptr, PGSOQueryType::IRPass))
    return false;

  // Exit if the function is not executable. There's no point in specializing
  // a dead function.
  if (!Solver.isBlockExecutable(&F->getEntryBlock()))
    return false;

  // It wastes time to specialize a function which would get inlined finally.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return false;

  LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName()
                    << "\n");
  return true;
}
Function *FunctionSpecializer::createSpecialization(Function *F,
                                                    const SpecSig &S) {
  Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1);

  // The original function does not necessarily have internal linkage, but the
  // clone must.
  Clone->setLinkage(GlobalValue::InternalLinkage);

  // Initialize the lattice state of the arguments of the function clone,
  // marking the argument on which we specialized the function constant
  // with the given value.
  Solver.setLatticeValueForSpecializationArguments(Clone, S.Args);
  Solver.markBlockExecutable(&Clone->front());
  Solver.addArgumentTrackedFunction(Clone);
  Solver.addTrackedFunction(Clone);

  // Mark all the specialized functions.
  Specializations.insert(Clone);
  ++NumSpecsCreated;
  return Clone;
}
/// Compute the inlining bonus for replacing argument \p A with constant \p C.
/// The below heuristic is only concerned with exposing inlining
/// opportunities via indirect call promotion. If the argument is not a
/// (potentially casted) function pointer, give up.
unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
  Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
  if (!CalledFunction)
    return 0;

  // Get TTI for the called function (used for the inline cost).
  auto &CalleeTTI = (GetTTI)(*CalledFunction);

  // Look at all the call sites whose called value is the argument.
  // Specializing the function on the argument would allow these indirect
  // calls to be promoted to direct calls. If the indirect call promotion
  // would likely enable the called function to be inlined, specializing is a
  // good idea.
  int InliningBonus = 0;
  for (User *U : A->users()) {
    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
      continue;
    auto *CS = cast<CallBase>(U);
    if (CS->getCalledOperand() != A)
      continue;
    if (CS->getFunctionType() != CalledFunction->getFunctionType())
      continue;

    // Get the cost of inlining the called function at this call site. Note
    // that this is only an estimate. The called function may eventually
    // change in a way that leads to it not being inlined here, even though
    // inlining looks profitable now. For example, one of its called
    // functions may be inlined into it, making the called function too large
    // to be inlined into this call site.
    //
    // We apply a boost for performing indirect call promotion by increasing
    // the default threshold by the threshold for indirect calls.
    auto Params = getInlineParams();
    Params.DefaultThreshold += InlineConstants::IndirectCallThreshold;
    InlineCost IC =
        getInlineCost(*CS, CalledFunction, Params, CalleeTTI, GetAC, GetTLI);

    // We clamp the bonus for this call to be between zero and the default
    // threshold.
    if (IC.isAlways())
      InliningBonus += Params.DefaultThreshold;
    else if (IC.isVariable() && IC.getCostDelta() > 0)
      InliningBonus += IC.getCostDelta();

    LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
                      << " for user " << *U << "\n");
  }

  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
}
/// Determine if it is possible to specialise the function for constant values
/// of the formal parameter \p A.
bool FunctionSpecializer::isArgumentInteresting(Argument *A) {
  // No point in specialization if the argument is unused.
  if (A->user_empty())
    return false;

  Type *Ty = A->getType();
  if (!Ty->isPointerTy() && (!SpecializeLiteralConstant ||
      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy() && !Ty->isStructTy())))
    return false;

  // SCCP solver does not record an argument that will be constructed on
  // stack.
  if (A->hasByValAttr() && !A->getParent()->onlyReadsMemory())
    return false;

  // For non-argument-tracked functions every argument is overdefined.
  if (!Solver.isArgumentTrackedFunction(A->getParent()))
    return true;

  // Check the lattice value and decide if we should attempt to specialize
  // based on this argument. No point in specialization if the lattice value
  // is already a constant.
  bool IsOverdefined = Ty->isStructTy()
    ? any_of(Solver.getStructLatticeValueFor(A), SCCPSolver::isOverdefined)
    : SCCPSolver::isOverdefined(Solver.getLatticeValueFor(A));

  LLVM_DEBUG(
    if (IsOverdefined)
      dbgs() << "FnSpecialization: Found interesting parameter "
             << A->getNameOrAsOperand() << "\n";
    else
      dbgs() << "FnSpecialization: Nothing to do, parameter "
             << A->getNameOrAsOperand() << " is already constant\n";
  );
  return IsOverdefined;
}
/// Check if the value \p V (an actual argument) is a constant or can only
/// have a constant value. Return that constant.
Constant *FunctionSpecializer::getCandidateConstant(Value *V) {
  if (isa<PoisonValue>(V))
    return nullptr;

  // Select for possible specialisation values that are constants or
  // are deduced to be constants or constant ranges with a single element.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    C = Solver.getConstantOrNull(V);

  // Don't specialize on (anything derived from) the address of a non-constant
  // global variable, unless explicitly enabled.
  if (C && C->getType()->isPointerTy() && !C->isNullValue())
    if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C));
        GV && !(GV->isConstant() || SpecializeOnAddress))
      return nullptr;

  return C;
}
void FunctionSpecializer::updateCallSites(Function *F, const Spec *Begin,
                                          const Spec *End) {
  // Collect the call sites that need updating.
  SmallVector<CallBase *> ToUpdate;
  for (User *U : F->users())
    if (auto *CS = dyn_cast<CallBase>(U);
        CS && CS->getCalledFunction() == F &&
        Solver.isBlockExecutable(CS->getParent()))
      ToUpdate.push_back(CS);

  unsigned NCallsLeft = ToUpdate.size();
  for (CallBase *CS : ToUpdate) {
    bool ShouldDecrementCount = CS->getFunction() == F;

    // Find the best matching specialisation.
    const Spec *BestSpec = nullptr;
    for (const Spec &S : make_range(Begin, End)) {
      if (!S.Clone || (BestSpec && S.Score <= BestSpec->Score))
        continue;

      if (any_of(S.Sig.Args, [CS, this](const ArgInfo &Arg) {
            unsigned ArgNo = Arg.Formal->getArgNo();
            return getCandidateConstant(CS->getArgOperand(ArgNo)) != Arg.Actual;
          }))
        continue;

      BestSpec = &S;
    }

    if (BestSpec) {
      LLVM_DEBUG(dbgs() << "FnSpecialization: Redirecting " << *CS
                        << " to call " << BestSpec->Clone->getName() << "\n");
      CS->setCalledFunction(BestSpec->Clone);
      ShouldDecrementCount = true;
    }

    if (ShouldDecrementCount)
      --NCallsLeft;
  }

  // If the function has been completely specialized, the original function
  // is no longer needed. Mark it unreachable.
  if (NCallsLeft == 0 && Solver.isArgumentTrackedFunction(F)) {
    Solver.markFunctionUnreachable(F);
    FullySpecialized.insert(F);
  }
}