1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
15 //===----------------------------------------------------------------------===//
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/CallSite.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/InstrTypes.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LLVMContext.h"
36 #include "llvm/IR/PassManager.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/Pass.h"
39 #include "llvm/PassRegistry.h"
40 #include "llvm/PassSupport.h"
41 #include "llvm/ProfileData/InstrProf.h"
42 #include "llvm/Support/Casting.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/ErrorHandling.h"
46 #include "llvm/Support/MathExtras.h"
47 #include "llvm/Transforms/Instrumentation.h"
48 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
49 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
56 #define DEBUG_TYPE "pgo-memop-opt"
58 STATISTIC(NumOfPGOMemOPOpt
, "Number of memop intrinsics optimized.");
59 STATISTIC(NumOfPGOMemOPAnnotate
, "Number of memop intrinsics annotated.");
61 // The minimum call count to optimize memory intrinsic calls.
62 static cl::opt
<unsigned>
63 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden
, cl::ZeroOrMore
,
65 cl::desc("The minimum count to optimize memory "
68 // Command line option to disable memory intrinsic optimization. The default is
69 // false. This is for debug purpose.
70 static cl::opt
<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
71 cl::Hidden
, cl::desc("Disable optimize"));
73 // The percent threshold to optimize memory intrinsic calls.
74 static cl::opt
<unsigned>
75 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
76 cl::Hidden
, cl::ZeroOrMore
,
77 cl::desc("The percentage threshold for the "
78 "memory intrinsic calls optimization"));
80 // Maximum number of versions for optimizing memory intrinsic call.
81 static cl::opt
<unsigned>
82 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden
,
84 cl::desc("The max version for the optimized memory "
87 // Scale the counts from the annotation using the BB count value.
89 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden
,
90 cl::desc("Scale the memop size counts using the basic "
91 " block count value"));
93 // This option sets the rangge of precise profile memop sizes.
94 extern cl::opt
<std::string
> MemOPSizeRange
;
96 // This option sets the value that groups large memop sizes
97 extern cl::opt
<unsigned> MemOPSizeLarge
;
100 class PGOMemOPSizeOptLegacyPass
: public FunctionPass
{
104 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID
) {
105 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
108 StringRef
getPassName() const override
{ return "PGOMemOPSize"; }
111 bool runOnFunction(Function
&F
) override
;
112 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
113 AU
.addRequired
<BlockFrequencyInfoWrapperPass
>();
114 AU
.addRequired
<OptimizationRemarkEmitterWrapperPass
>();
115 AU
.addPreserved
<GlobalsAAWrapperPass
>();
116 AU
.addPreserved
<DominatorTreeWrapperPass
>();
119 } // end anonymous namespace
121 char PGOMemOPSizeOptLegacyPass::ID
= 0;
122 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass
, "pgo-memop-opt",
123 "Optimize memory intrinsic using its size value profile",
125 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass
)
126 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass
, "pgo-memop-opt",
127 "Optimize memory intrinsic using its size value profile",
130 FunctionPass
*llvm::createPGOMemOPSizeOptLegacyPass() {
131 return new PGOMemOPSizeOptLegacyPass();
135 class MemOPSizeOpt
: public InstVisitor
<MemOPSizeOpt
> {
137 MemOPSizeOpt(Function
&Func
, BlockFrequencyInfo
&BFI
,
138 OptimizationRemarkEmitter
&ORE
, DominatorTree
*DT
)
139 : Func(Func
), BFI(BFI
), ORE(ORE
), DT(DT
), Changed(false) {
141 std::make_unique
<InstrProfValueData
[]>(MemOPMaxVersion
+ 2);
142 // Get the MemOPSize range information from option MemOPSizeRange,
143 getMemOPSizeRangeFromOption(MemOPSizeRange
, PreciseRangeStart
,
146 bool isChanged() const { return Changed
; }
151 for (auto &MI
: WorkList
) {
152 ++NumOfPGOMemOPAnnotate
;
156 LLVM_DEBUG(dbgs() << "MemOP call: "
157 << MI
->getCalledFunction()->getName()
158 << "is Transformed.\n");
163 void visitMemIntrinsic(MemIntrinsic
&MI
) {
164 Value
*Length
= MI
.getLength();
165 // Not perform on constant length calls.
166 if (dyn_cast
<ConstantInt
>(Length
))
168 WorkList
.push_back(&MI
);
173 BlockFrequencyInfo
&BFI
;
174 OptimizationRemarkEmitter
&ORE
;
177 std::vector
<MemIntrinsic
*> WorkList
;
178 // Start of the previse range.
179 int64_t PreciseRangeStart
;
180 // Last value of the previse range.
181 int64_t PreciseRangeLast
;
182 // The space to read the profile annotation.
183 std::unique_ptr
<InstrProfValueData
[]> ValueDataArray
;
184 bool perform(MemIntrinsic
*MI
);
186 // This kind shows which group the value falls in. For PreciseValue, we have
187 // the profile count for that value. LargeGroup groups the values that are in
188 // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
189 enum MemOPSizeKind
{ PreciseValue
, NonLargeGroup
, LargeGroup
};
191 MemOPSizeKind
getMemOPSizeKind(int64_t Value
) const {
192 if (Value
== MemOPSizeLarge
&& MemOPSizeLarge
!= 0)
194 if (Value
== PreciseRangeLast
+ 1)
195 return NonLargeGroup
;
200 static const char *getMIName(const MemIntrinsic
*MI
) {
201 switch (MI
->getIntrinsicID()) {
202 case Intrinsic::memcpy
:
204 case Intrinsic::memmove
:
206 case Intrinsic::memset
:
213 static bool isProfitable(uint64_t Count
, uint64_t TotalCount
) {
214 assert(Count
<= TotalCount
);
215 if (Count
< MemOPCountThreshold
)
217 if (Count
< TotalCount
* MemOPPercentThreshold
/ 100)
222 static inline uint64_t getScaledCount(uint64_t Count
, uint64_t Num
,
224 if (!MemOPScaleCount
)
227 uint64_t ScaleCount
= SaturatingMultiply(Count
, Num
, &Overflowed
);
228 return ScaleCount
/ Denom
;
231 bool MemOPSizeOpt::perform(MemIntrinsic
*MI
) {
233 if (MI
->getIntrinsicID() == Intrinsic::memmove
)
236 uint32_t NumVals
, MaxNumPromotions
= MemOPMaxVersion
+ 2;
238 if (!getValueProfDataFromInst(*MI
, IPVK_MemOPSize
, MaxNumPromotions
,
239 ValueDataArray
.get(), NumVals
, TotalCount
))
242 uint64_t ActualCount
= TotalCount
;
243 uint64_t SavedTotalCount
= TotalCount
;
244 if (MemOPScaleCount
) {
245 auto BBEdgeCount
= BFI
.getBlockProfileCount(MI
->getParent());
248 ActualCount
= *BBEdgeCount
;
251 ArrayRef
<InstrProfValueData
> VDs(ValueDataArray
.get(), NumVals
);
252 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
253 << ActualCount
<< "\n");
256 : VDs
) { dbgs() << " (" << VD
.Value
<< "," << VD
.Count
<< ")\n"; });
258 if (ActualCount
< MemOPCountThreshold
)
260 // Skip if the total value profiled count is 0, in which case we can't
261 // scale up the counts properly (and there is no profitable transformation).
265 TotalCount
= ActualCount
;
267 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
268 << " denominator = " << SavedTotalCount
<< "\n");
270 // Keeping track of the count of the default case:
271 uint64_t RemainCount
= TotalCount
;
272 uint64_t SavedRemainCount
= SavedTotalCount
;
273 SmallVector
<uint64_t, 16> SizeIds
;
274 SmallVector
<uint64_t, 16> CaseCounts
;
275 uint64_t MaxCount
= 0;
276 unsigned Version
= 0;
277 // Default case is in the front -- save the slot here.
278 CaseCounts
.push_back(0);
279 for (auto &VD
: VDs
) {
280 int64_t V
= VD
.Value
;
281 uint64_t C
= VD
.Count
;
283 C
= getScaledCount(C
, ActualCount
, SavedTotalCount
);
285 // Only care precise value here.
286 if (getMemOPSizeKind(V
) != PreciseValue
)
289 // ValueCounts are sorted on the count. Break at the first un-profitable
291 if (!isProfitable(C
, RemainCount
))
294 SizeIds
.push_back(V
);
295 CaseCounts
.push_back(C
);
299 assert(RemainCount
>= C
);
301 assert(SavedRemainCount
>= VD
.Count
);
302 SavedRemainCount
-= VD
.Count
;
304 if (++Version
> MemOPMaxVersion
&& MemOPMaxVersion
!= 0)
311 CaseCounts
[0] = RemainCount
;
312 if (RemainCount
> MaxCount
)
313 MaxCount
= RemainCount
;
315 uint64_t SumForOpt
= TotalCount
- RemainCount
;
317 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
318 << " Versions (covering " << SumForOpt
<< " out of "
319 << TotalCount
<< ")\n");
332 // mem_op(..., size);
337 BasicBlock
*BB
= MI
->getParent();
338 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
339 LLVM_DEBUG(dbgs() << *BB
<< "\n");
340 auto OrigBBFreq
= BFI
.getBlockFreq(BB
);
342 BasicBlock
*DefaultBB
= SplitBlock(BB
, MI
, DT
);
343 BasicBlock::iterator
It(*MI
);
345 assert(It
!= DefaultBB
->end());
346 BasicBlock
*MergeBB
= SplitBlock(DefaultBB
, &(*It
), DT
);
347 MergeBB
->setName("MemOP.Merge");
348 BFI
.setBlockFreq(MergeBB
, OrigBBFreq
.getFrequency());
349 DefaultBB
->setName("MemOP.Default");
351 DomTreeUpdater
DTU(DT
, DomTreeUpdater::UpdateStrategy::Eager
);
352 auto &Ctx
= Func
.getContext();
354 BB
->getTerminator()->eraseFromParent();
355 Value
*SizeVar
= MI
->getLength();
356 SwitchInst
*SI
= IRB
.CreateSwitch(SizeVar
, DefaultBB
, SizeIds
.size());
358 // Clear the value profile data.
359 MI
->setMetadata(LLVMContext::MD_prof
, nullptr);
360 // If all promoted, we don't need the MD.prof metadata.
361 if (SavedRemainCount
> 0 || Version
!= NumVals
)
362 // Otherwise we need update with the un-promoted records back.
363 annotateValueSite(*Func
.getParent(), *MI
, VDs
.slice(Version
),
364 SavedRemainCount
, IPVK_MemOPSize
, NumVals
);
366 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
368 std::vector
<DominatorTree::UpdateType
> Updates
;
370 Updates
.reserve(2 * SizeIds
.size());
372 for (uint64_t SizeId
: SizeIds
) {
373 BasicBlock
*CaseBB
= BasicBlock::Create(
374 Ctx
, Twine("MemOP.Case.") + Twine(SizeId
), &Func
, DefaultBB
);
375 Instruction
*NewInst
= MI
->clone();
377 auto *MemI
= cast
<MemIntrinsic
>(NewInst
);
378 auto *SizeType
= dyn_cast
<IntegerType
>(MemI
->getLength()->getType());
379 assert(SizeType
&& "Expected integer type size argument.");
380 ConstantInt
*CaseSizeId
= ConstantInt::get(SizeType
, SizeId
);
381 MemI
->setLength(CaseSizeId
);
382 CaseBB
->getInstList().push_back(NewInst
);
383 IRBuilder
<> IRBCase(CaseBB
);
384 IRBCase
.CreateBr(MergeBB
);
385 SI
->addCase(CaseSizeId
, CaseBB
);
387 Updates
.push_back({DominatorTree::Insert
, CaseBB
, MergeBB
});
388 Updates
.push_back({DominatorTree::Insert
, BB
, CaseBB
});
390 LLVM_DEBUG(dbgs() << *CaseBB
<< "\n");
392 DTU
.applyUpdates(Updates
);
395 setProfMetadata(Func
.getParent(), SI
, CaseCounts
, MaxCount
);
397 LLVM_DEBUG(dbgs() << *BB
<< "\n");
398 LLVM_DEBUG(dbgs() << *DefaultBB
<< "\n");
399 LLVM_DEBUG(dbgs() << *MergeBB
<< "\n");
403 return OptimizationRemark(DEBUG_TYPE
, "memopt-opt", MI
)
404 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI
)))
405 << " with count " << NV("Count", SumForOpt
) << " out of "
406 << NV("Total", TotalCount
) << " for " << NV("Versions", Version
)
414 static bool PGOMemOPSizeOptImpl(Function
&F
, BlockFrequencyInfo
&BFI
,
415 OptimizationRemarkEmitter
&ORE
,
420 if (F
.hasFnAttribute(Attribute::OptimizeForSize
))
422 MemOPSizeOpt
MemOPSizeOpt(F
, BFI
, ORE
, DT
);
423 MemOPSizeOpt
.perform();
424 return MemOPSizeOpt
.isChanged();
427 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function
&F
) {
428 BlockFrequencyInfo
&BFI
=
429 getAnalysis
<BlockFrequencyInfoWrapperPass
>().getBFI();
430 auto &ORE
= getAnalysis
<OptimizationRemarkEmitterWrapperPass
>().getORE();
431 auto *DTWP
= getAnalysisIfAvailable
<DominatorTreeWrapperPass
>();
432 DominatorTree
*DT
= DTWP
? &DTWP
->getDomTree() : nullptr;
433 return PGOMemOPSizeOptImpl(F
, BFI
, ORE
, DT
);
437 char &PGOMemOPSizeOptID
= PGOMemOPSizeOptLegacyPass::ID
;
439 PreservedAnalyses
PGOMemOPSizeOpt::run(Function
&F
,
440 FunctionAnalysisManager
&FAM
) {
441 auto &BFI
= FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
442 auto &ORE
= FAM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
443 auto *DT
= FAM
.getCachedResult
<DominatorTreeAnalysis
>(F
);
444 bool Changed
= PGOMemOPSizeOptImpl(F
, BFI
, ORE
, DT
);
446 return PreservedAnalyses::all();
447 auto PA
= PreservedAnalyses();
448 PA
.preserve
<GlobalsAA
>();
449 PA
.preserve
<DominatorTreeAnalysis
>();