1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
15 //===----------------------------------------------------------------------===//
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/TargetLibraryInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/InstrTypes.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LLVMContext.h"
36 #include "llvm/IR/PassManager.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/InitializePasses.h"
39 #include "llvm/Pass.h"
40 #include "llvm/PassRegistry.h"
41 #include "llvm/ProfileData/InstrProf.h"
42 #define INSTR_PROF_VALUE_PROF_MEMOP_API
43 #include "llvm/ProfileData/InstrProfData.inc"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/ErrorHandling.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Support/WithColor.h"
50 #include "llvm/Transforms/Instrumentation.h"
51 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
52 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
59 #define DEBUG_TYPE "pgo-memop-opt"
61 STATISTIC(NumOfPGOMemOPOpt
, "Number of memop intrinsics optimized.");
62 STATISTIC(NumOfPGOMemOPAnnotate
, "Number of memop intrinsics annotated.");
64 // The minimum call count to optimize memory intrinsic calls.
65 static cl::opt
<unsigned>
66 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden
, cl::ZeroOrMore
,
68 cl::desc("The minimum count to optimize memory "
71 // Command line option to disable memory intrinsic optimization. The default is
72 // false. This is for debug purpose.
73 static cl::opt
<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
74 cl::Hidden
, cl::desc("Disable optimize"));
76 // The percent threshold to optimize memory intrinsic calls.
77 static cl::opt
<unsigned>
78 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
79 cl::Hidden
, cl::ZeroOrMore
,
80 cl::desc("The percentage threshold for the "
81 "memory intrinsic calls optimization"));
83 // Maximum number of versions for optimizing memory intrinsic call.
84 static cl::opt
<unsigned>
85 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden
,
87 cl::desc("The max version for the optimized memory "
90 // Scale the counts from the annotation using the BB count value.
92 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden
,
93 cl::desc("Scale the memop size counts using the basic "
94 " block count value"));
97 MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
99 cl::desc("Size-specialize memcmp and bcmp calls"));
101 static cl::opt
<unsigned>
102 MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden
, cl::init(128),
103 cl::desc("Optimize the memop size <= this value"));
106 class PGOMemOPSizeOptLegacyPass
: public FunctionPass
{
110 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID
) {
111 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
114 StringRef
getPassName() const override
{ return "PGOMemOPSize"; }
117 bool runOnFunction(Function
&F
) override
;
118 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
119 AU
.addRequired
<BlockFrequencyInfoWrapperPass
>();
120 AU
.addRequired
<OptimizationRemarkEmitterWrapperPass
>();
121 AU
.addPreserved
<GlobalsAAWrapperPass
>();
122 AU
.addPreserved
<DominatorTreeWrapperPass
>();
123 AU
.addRequired
<TargetLibraryInfoWrapperPass
>();
126 } // end anonymous namespace
128 char PGOMemOPSizeOptLegacyPass::ID
= 0;
129 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass
, "pgo-memop-opt",
130 "Optimize memory intrinsic using its size value profile",
132 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass
)
133 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
134 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass
, "pgo-memop-opt",
135 "Optimize memory intrinsic using its size value profile",
138 FunctionPass
*llvm::createPGOMemOPSizeOptLegacyPass() {
139 return new PGOMemOPSizeOptLegacyPass();
144 static const char *getMIName(const MemIntrinsic
*MI
) {
145 switch (MI
->getIntrinsicID()) {
146 case Intrinsic::memcpy
:
148 case Intrinsic::memmove
:
150 case Intrinsic::memset
:
157 // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
160 MemOp(MemIntrinsic
*MI
) : I(MI
) {}
161 MemOp(CallInst
*CI
) : I(CI
) {}
162 MemIntrinsic
*asMI() { return dyn_cast
<MemIntrinsic
>(I
); }
163 CallInst
*asCI() { return cast
<CallInst
>(I
); }
165 if (auto MI
= asMI())
166 return MemOp(cast
<MemIntrinsic
>(MI
->clone()));
167 return MemOp(cast
<CallInst
>(asCI()->clone()));
170 if (auto MI
= asMI())
171 return MI
->getLength();
172 return asCI()->getArgOperand(2);
174 void setLength(Value
*Length
) {
175 if (auto MI
= asMI())
176 return MI
->setLength(Length
);
177 asCI()->setArgOperand(2, Length
);
179 StringRef
getFuncName() {
180 if (auto MI
= asMI())
181 return MI
->getCalledFunction()->getName();
182 return asCI()->getCalledFunction()->getName();
185 if (auto MI
= asMI())
186 if (MI
->getIntrinsicID() == Intrinsic::memmove
)
190 bool isMemcmp(TargetLibraryInfo
&TLI
) {
192 if (asMI() == nullptr && TLI
.getLibFunc(*asCI(), Func
) &&
193 Func
== LibFunc_memcmp
) {
198 bool isBcmp(TargetLibraryInfo
&TLI
) {
200 if (asMI() == nullptr && TLI
.getLibFunc(*asCI(), Func
) &&
201 Func
== LibFunc_bcmp
) {
206 const char *getName(TargetLibraryInfo
&TLI
) {
207 if (auto MI
= asMI())
208 return getMIName(MI
);
210 if (TLI
.getLibFunc(*asCI(), Func
)) {
211 if (Func
== LibFunc_memcmp
)
213 if (Func
== LibFunc_bcmp
)
216 llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
221 class MemOPSizeOpt
: public InstVisitor
<MemOPSizeOpt
> {
223 MemOPSizeOpt(Function
&Func
, BlockFrequencyInfo
&BFI
,
224 OptimizationRemarkEmitter
&ORE
, DominatorTree
*DT
,
225 TargetLibraryInfo
&TLI
)
226 : Func(Func
), BFI(BFI
), ORE(ORE
), DT(DT
), TLI(TLI
), Changed(false) {
228 std::make_unique
<InstrProfValueData
[]>(INSTR_PROF_NUM_BUCKETS
);
230 bool isChanged() const { return Changed
; }
235 for (auto &MO
: WorkList
) {
236 ++NumOfPGOMemOPAnnotate
;
240 LLVM_DEBUG(dbgs() << "MemOP call: " << MO
.getFuncName()
241 << "is Transformed.\n");
246 void visitMemIntrinsic(MemIntrinsic
&MI
) {
247 Value
*Length
= MI
.getLength();
248 // Not perform on constant length calls.
249 if (isa
<ConstantInt
>(Length
))
251 WorkList
.push_back(MemOp(&MI
));
254 void visitCallInst(CallInst
&CI
) {
256 if (TLI
.getLibFunc(CI
, Func
) &&
257 (Func
== LibFunc_memcmp
|| Func
== LibFunc_bcmp
) &&
258 !isa
<ConstantInt
>(CI
.getArgOperand(2))) {
259 WorkList
.push_back(MemOp(&CI
));
265 BlockFrequencyInfo
&BFI
;
266 OptimizationRemarkEmitter
&ORE
;
268 TargetLibraryInfo
&TLI
;
270 std::vector
<MemOp
> WorkList
;
271 // The space to read the profile annotation.
272 std::unique_ptr
<InstrProfValueData
[]> ValueDataArray
;
273 bool perform(MemOp MO
);
276 static bool isProfitable(uint64_t Count
, uint64_t TotalCount
) {
277 assert(Count
<= TotalCount
);
278 if (Count
< MemOPCountThreshold
)
280 if (Count
< TotalCount
* MemOPPercentThreshold
/ 100)
285 static inline uint64_t getScaledCount(uint64_t Count
, uint64_t Num
,
287 if (!MemOPScaleCount
)
290 uint64_t ScaleCount
= SaturatingMultiply(Count
, Num
, &Overflowed
);
291 return ScaleCount
/ Denom
;
294 bool MemOPSizeOpt::perform(MemOp MO
) {
298 if (!MemOPOptMemcmpBcmp
&& (MO
.isMemcmp(TLI
) || MO
.isBcmp(TLI
)))
301 uint32_t NumVals
, MaxNumVals
= INSTR_PROF_NUM_BUCKETS
;
303 if (!getValueProfDataFromInst(*MO
.I
, IPVK_MemOPSize
, MaxNumVals
,
304 ValueDataArray
.get(), NumVals
, TotalCount
))
307 uint64_t ActualCount
= TotalCount
;
308 uint64_t SavedTotalCount
= TotalCount
;
309 if (MemOPScaleCount
) {
310 auto BBEdgeCount
= BFI
.getBlockProfileCount(MO
.I
->getParent());
313 ActualCount
= *BBEdgeCount
;
316 ArrayRef
<InstrProfValueData
> VDs(ValueDataArray
.get(), NumVals
);
317 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
318 << ActualCount
<< "\n");
321 : VDs
) { dbgs() << " (" << VD
.Value
<< "," << VD
.Count
<< ")\n"; });
323 if (ActualCount
< MemOPCountThreshold
)
325 // Skip if the total value profiled count is 0, in which case we can't
326 // scale up the counts properly (and there is no profitable transformation).
330 TotalCount
= ActualCount
;
332 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
333 << " denominator = " << SavedTotalCount
<< "\n");
335 // Keeping track of the count of the default case:
336 uint64_t RemainCount
= TotalCount
;
337 uint64_t SavedRemainCount
= SavedTotalCount
;
338 SmallVector
<uint64_t, 16> SizeIds
;
339 SmallVector
<uint64_t, 16> CaseCounts
;
340 uint64_t MaxCount
= 0;
341 unsigned Version
= 0;
343 // Default case is in the front -- save the slot here.
344 CaseCounts
.push_back(0);
345 SmallVector
<InstrProfValueData
, 24> RemainingVDs
;
346 for (auto I
= VDs
.begin(), E
= VDs
.end(); I
!= E
; ++I
) {
348 int64_t V
= VD
.Value
;
349 uint64_t C
= VD
.Count
;
351 C
= getScaledCount(C
, ActualCount
, SavedTotalCount
);
353 if (!InstrProfIsSingleValRange(V
) || V
> MemOpMaxOptSize
) {
354 RemainingVDs
.push_back(VD
);
358 // ValueCounts are sorted on the count. Break at the first un-profitable
360 if (!isProfitable(C
, RemainCount
)) {
361 RemainingVDs
.insert(RemainingVDs
.end(), I
, E
);
366 LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func
.getName()
367 << ": Two consecutive, identical values in MemOp value"
374 SizeIds
.push_back(V
);
375 CaseCounts
.push_back(C
);
379 assert(RemainCount
>= C
);
381 assert(SavedRemainCount
>= VD
.Count
);
382 SavedRemainCount
-= VD
.Count
;
384 if (++Version
>= MemOPMaxVersion
&& MemOPMaxVersion
!= 0) {
385 RemainingVDs
.insert(RemainingVDs
.end(), I
+ 1, E
);
393 CaseCounts
[0] = RemainCount
;
394 if (RemainCount
> MaxCount
)
395 MaxCount
= RemainCount
;
397 uint64_t SumForOpt
= TotalCount
- RemainCount
;
399 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
400 << " Versions (covering " << SumForOpt
<< " out of "
401 << TotalCount
<< ")\n");
414 // mem_op(..., size);
419 BasicBlock
*BB
= MO
.I
->getParent();
420 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
421 LLVM_DEBUG(dbgs() << *BB
<< "\n");
422 auto OrigBBFreq
= BFI
.getBlockFreq(BB
);
424 BasicBlock
*DefaultBB
= SplitBlock(BB
, MO
.I
, DT
);
425 BasicBlock::iterator
It(*MO
.I
);
427 assert(It
!= DefaultBB
->end());
428 BasicBlock
*MergeBB
= SplitBlock(DefaultBB
, &(*It
), DT
);
429 MergeBB
->setName("MemOP.Merge");
430 BFI
.setBlockFreq(MergeBB
, OrigBBFreq
.getFrequency());
431 DefaultBB
->setName("MemOP.Default");
433 DomTreeUpdater
DTU(DT
, DomTreeUpdater::UpdateStrategy::Eager
);
434 auto &Ctx
= Func
.getContext();
436 BB
->getTerminator()->eraseFromParent();
437 Value
*SizeVar
= MO
.getLength();
438 SwitchInst
*SI
= IRB
.CreateSwitch(SizeVar
, DefaultBB
, SizeIds
.size());
439 Type
*MemOpTy
= MO
.I
->getType();
440 PHINode
*PHI
= nullptr;
441 if (!MemOpTy
->isVoidTy()) {
442 // Insert a phi for the return values at the merge block.
443 IRBuilder
<> IRBM(MergeBB
->getFirstNonPHI());
444 PHI
= IRBM
.CreatePHI(MemOpTy
, SizeIds
.size() + 1, "MemOP.RVMerge");
445 MO
.I
->replaceAllUsesWith(PHI
);
446 PHI
->addIncoming(MO
.I
, DefaultBB
);
449 // Clear the value profile data.
450 MO
.I
->setMetadata(LLVMContext::MD_prof
, nullptr);
451 // If all promoted, we don't need the MD.prof metadata.
452 if (SavedRemainCount
> 0 || Version
!= NumVals
) {
453 // Otherwise we need update with the un-promoted records back.
454 ArrayRef
<InstrProfValueData
> RemVDs(RemainingVDs
);
455 annotateValueSite(*Func
.getParent(), *MO
.I
, RemVDs
, SavedRemainCount
,
456 IPVK_MemOPSize
, NumVals
);
459 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
461 std::vector
<DominatorTree::UpdateType
> Updates
;
463 Updates
.reserve(2 * SizeIds
.size());
465 for (uint64_t SizeId
: SizeIds
) {
466 BasicBlock
*CaseBB
= BasicBlock::Create(
467 Ctx
, Twine("MemOP.Case.") + Twine(SizeId
), &Func
, DefaultBB
);
468 MemOp NewMO
= MO
.clone();
470 auto *SizeType
= dyn_cast
<IntegerType
>(NewMO
.getLength()->getType());
471 assert(SizeType
&& "Expected integer type size argument.");
472 ConstantInt
*CaseSizeId
= ConstantInt::get(SizeType
, SizeId
);
473 NewMO
.setLength(CaseSizeId
);
474 CaseBB
->getInstList().push_back(NewMO
.I
);
475 IRBuilder
<> IRBCase(CaseBB
);
476 IRBCase
.CreateBr(MergeBB
);
477 SI
->addCase(CaseSizeId
, CaseBB
);
478 if (!MemOpTy
->isVoidTy())
479 PHI
->addIncoming(NewMO
.I
, CaseBB
);
481 Updates
.push_back({DominatorTree::Insert
, CaseBB
, MergeBB
});
482 Updates
.push_back({DominatorTree::Insert
, BB
, CaseBB
});
484 LLVM_DEBUG(dbgs() << *CaseBB
<< "\n");
486 DTU
.applyUpdates(Updates
);
489 setProfMetadata(Func
.getParent(), SI
, CaseCounts
, MaxCount
);
491 LLVM_DEBUG(dbgs() << *BB
<< "\n");
492 LLVM_DEBUG(dbgs() << *DefaultBB
<< "\n");
493 LLVM_DEBUG(dbgs() << *MergeBB
<< "\n");
497 return OptimizationRemark(DEBUG_TYPE
, "memopt-opt", MO
.I
)
498 << "optimized " << NV("Memop", MO
.getName(TLI
)) << " with count "
499 << NV("Count", SumForOpt
) << " out of " << NV("Total", TotalCount
)
500 << " for " << NV("Versions", Version
) << " versions";
507 static bool PGOMemOPSizeOptImpl(Function
&F
, BlockFrequencyInfo
&BFI
,
508 OptimizationRemarkEmitter
&ORE
,
509 DominatorTree
*DT
, TargetLibraryInfo
&TLI
) {
513 if (F
.hasFnAttribute(Attribute::OptimizeForSize
))
515 MemOPSizeOpt
MemOPSizeOpt(F
, BFI
, ORE
, DT
, TLI
);
516 MemOPSizeOpt
.perform();
517 return MemOPSizeOpt
.isChanged();
520 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function
&F
) {
521 BlockFrequencyInfo
&BFI
=
522 getAnalysis
<BlockFrequencyInfoWrapperPass
>().getBFI();
523 auto &ORE
= getAnalysis
<OptimizationRemarkEmitterWrapperPass
>().getORE();
524 auto *DTWP
= getAnalysisIfAvailable
<DominatorTreeWrapperPass
>();
525 DominatorTree
*DT
= DTWP
? &DTWP
->getDomTree() : nullptr;
526 TargetLibraryInfo
&TLI
=
527 getAnalysis
<TargetLibraryInfoWrapperPass
>().getTLI(F
);
528 return PGOMemOPSizeOptImpl(F
, BFI
, ORE
, DT
, TLI
);
532 char &PGOMemOPSizeOptID
= PGOMemOPSizeOptLegacyPass::ID
;
534 PreservedAnalyses
PGOMemOPSizeOpt::run(Function
&F
,
535 FunctionAnalysisManager
&FAM
) {
536 auto &BFI
= FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
537 auto &ORE
= FAM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
538 auto *DT
= FAM
.getCachedResult
<DominatorTreeAnalysis
>(F
);
539 auto &TLI
= FAM
.getResult
<TargetLibraryAnalysis
>(F
);
540 bool Changed
= PGOMemOPSizeOptImpl(F
, BFI
, ORE
, DT
, TLI
);
542 return PreservedAnalyses::all();
543 auto PA
= PreservedAnalyses();
544 PA
.preserve
<DominatorTreeAnalysis
>();