[ARM] Rejig MVE load store tests. NFC
[llvm-core.git] / lib / Transforms / Instrumentation / PGOMemOPSizeOpt.cpp
blob188f95b4676bb738b0b15177571499f6ac4257d3
1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the transformation that optimizes memory intrinsics
10 // such as memcpy using the size value profile. When memory intrinsic size
11 // value profile metadata is available, a single memory intrinsic is expanded
12 // to a sequence of guarded specialized versions that are called with the
13 // hottest size(s), for later expansion into more optimal inline sequences.
15 //===----------------------------------------------------------------------===//
17 #include "llvm/ADT/ArrayRef.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Twine.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/DomTreeUpdater.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/CallSite.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstVisitor.h"
32 #include "llvm/IR/InstrTypes.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LLVMContext.h"
36 #include "llvm/IR/PassManager.h"
37 #include "llvm/IR/Type.h"
38 #include "llvm/Pass.h"
39 #include "llvm/PassRegistry.h"
40 #include "llvm/PassSupport.h"
41 #include "llvm/ProfileData/InstrProf.h"
42 #include "llvm/Support/Casting.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/ErrorHandling.h"
46 #include "llvm/Support/MathExtras.h"
47 #include "llvm/Transforms/Instrumentation.h"
48 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
49 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
50 #include <cassert>
51 #include <cstdint>
52 #include <vector>
54 using namespace llvm;
56 #define DEBUG_TYPE "pgo-memop-opt"
58 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
59 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61 // The minimum call count to optimize memory intrinsic calls.
62 static cl::opt<unsigned>
63 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
64 cl::init(1000),
65 cl::desc("The minimum count to optimize memory "
66 "intrinsic calls"));
68 // Command line option to disable memory intrinsic optimization. The default is
69 // false. This is for debug purpose.
70 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
71 cl::Hidden, cl::desc("Disable optimize"));
73 // The percent threshold to optimize memory intrinsic calls.
74 static cl::opt<unsigned>
75 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
76 cl::Hidden, cl::ZeroOrMore,
77 cl::desc("The percentage threshold for the "
78 "memory intrinsic calls optimization"));
80 // Maximum number of versions for optimizing memory intrinsic call.
81 static cl::opt<unsigned>
82 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
83 cl::ZeroOrMore,
84 cl::desc("The max version for the optimized memory "
85 " intrinsic calls"));
87 // Scale the counts from the annotation using the BB count value.
88 static cl::opt<bool>
89 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
90 cl::desc("Scale the memop size counts using the basic "
91 " block count value"));
93 // This option sets the rangge of precise profile memop sizes.
94 extern cl::opt<std::string> MemOPSizeRange;
96 // This option sets the value that groups large memop sizes
97 extern cl::opt<unsigned> MemOPSizeLarge;
99 namespace {
100 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
101 public:
102 static char ID;
104 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
105 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
108 StringRef getPassName() const override { return "PGOMemOPSize"; }
110 private:
111 bool runOnFunction(Function &F) override;
112 void getAnalysisUsage(AnalysisUsage &AU) const override {
113 AU.addRequired<BlockFrequencyInfoWrapperPass>();
114 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
115 AU.addPreserved<GlobalsAAWrapperPass>();
116 AU.addPreserved<DominatorTreeWrapperPass>();
119 } // end anonymous namespace
121 char PGOMemOPSizeOptLegacyPass::ID = 0;
122 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
123 "Optimize memory intrinsic using its size value profile",
124 false, false)
125 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
126 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
127 "Optimize memory intrinsic using its size value profile",
128 false, false)
130 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
131 return new PGOMemOPSizeOptLegacyPass();
134 namespace {
135 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
136 public:
137 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
138 OptimizationRemarkEmitter &ORE, DominatorTree *DT)
139 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
140 ValueDataArray =
141 llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
142 // Get the MemOPSize range information from option MemOPSizeRange,
143 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
144 PreciseRangeLast);
146 bool isChanged() const { return Changed; }
147 void perform() {
148 WorkList.clear();
149 visit(Func);
151 for (auto &MI : WorkList) {
152 ++NumOfPGOMemOPAnnotate;
153 if (perform(MI)) {
154 Changed = true;
155 ++NumOfPGOMemOPOpt;
156 LLVM_DEBUG(dbgs() << "MemOP call: "
157 << MI->getCalledFunction()->getName()
158 << "is Transformed.\n");
163 void visitMemIntrinsic(MemIntrinsic &MI) {
164 Value *Length = MI.getLength();
165 // Not perform on constant length calls.
166 if (dyn_cast<ConstantInt>(Length))
167 return;
168 WorkList.push_back(&MI);
171 private:
172 Function &Func;
173 BlockFrequencyInfo &BFI;
174 OptimizationRemarkEmitter &ORE;
175 DominatorTree *DT;
176 bool Changed;
177 std::vector<MemIntrinsic *> WorkList;
178 // Start of the previse range.
179 int64_t PreciseRangeStart;
180 // Last value of the previse range.
181 int64_t PreciseRangeLast;
182 // The space to read the profile annotation.
183 std::unique_ptr<InstrProfValueData[]> ValueDataArray;
184 bool perform(MemIntrinsic *MI);
186 // This kind shows which group the value falls in. For PreciseValue, we have
187 // the profile count for that value. LargeGroup groups the values that are in
188 // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
189 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
191 MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
192 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
193 return LargeGroup;
194 if (Value == PreciseRangeLast + 1)
195 return NonLargeGroup;
196 return PreciseValue;
200 static const char *getMIName(const MemIntrinsic *MI) {
201 switch (MI->getIntrinsicID()) {
202 case Intrinsic::memcpy:
203 return "memcpy";
204 case Intrinsic::memmove:
205 return "memmove";
206 case Intrinsic::memset:
207 return "memset";
208 default:
209 return "unknown";
213 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
214 assert(Count <= TotalCount);
215 if (Count < MemOPCountThreshold)
216 return false;
217 if (Count < TotalCount * MemOPPercentThreshold / 100)
218 return false;
219 return true;
222 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
223 uint64_t Denom) {
224 if (!MemOPScaleCount)
225 return Count;
226 bool Overflowed;
227 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
228 return ScaleCount / Denom;
231 bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
232 assert(MI);
233 if (MI->getIntrinsicID() == Intrinsic::memmove)
234 return false;
236 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
237 uint64_t TotalCount;
238 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
239 ValueDataArray.get(), NumVals, TotalCount))
240 return false;
242 uint64_t ActualCount = TotalCount;
243 uint64_t SavedTotalCount = TotalCount;
244 if (MemOPScaleCount) {
245 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
246 if (!BBEdgeCount)
247 return false;
248 ActualCount = *BBEdgeCount;
251 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
252 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
253 << ActualCount << "\n");
254 LLVM_DEBUG(
255 for (auto &VD
256 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
258 if (ActualCount < MemOPCountThreshold)
259 return false;
260 // Skip if the total value profiled count is 0, in which case we can't
261 // scale up the counts properly (and there is no profitable transformation).
262 if (TotalCount == 0)
263 return false;
265 TotalCount = ActualCount;
266 if (MemOPScaleCount)
267 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
268 << " denominator = " << SavedTotalCount << "\n");
270 // Keeping track of the count of the default case:
271 uint64_t RemainCount = TotalCount;
272 uint64_t SavedRemainCount = SavedTotalCount;
273 SmallVector<uint64_t, 16> SizeIds;
274 SmallVector<uint64_t, 16> CaseCounts;
275 uint64_t MaxCount = 0;
276 unsigned Version = 0;
277 // Default case is in the front -- save the slot here.
278 CaseCounts.push_back(0);
279 for (auto &VD : VDs) {
280 int64_t V = VD.Value;
281 uint64_t C = VD.Count;
282 if (MemOPScaleCount)
283 C = getScaledCount(C, ActualCount, SavedTotalCount);
285 // Only care precise value here.
286 if (getMemOPSizeKind(V) != PreciseValue)
287 continue;
289 // ValueCounts are sorted on the count. Break at the first un-profitable
290 // value.
291 if (!isProfitable(C, RemainCount))
292 break;
294 SizeIds.push_back(V);
295 CaseCounts.push_back(C);
296 if (C > MaxCount)
297 MaxCount = C;
299 assert(RemainCount >= C);
300 RemainCount -= C;
301 assert(SavedRemainCount >= VD.Count);
302 SavedRemainCount -= VD.Count;
304 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
305 break;
308 if (Version == 0)
309 return false;
311 CaseCounts[0] = RemainCount;
312 if (RemainCount > MaxCount)
313 MaxCount = RemainCount;
315 uint64_t SumForOpt = TotalCount - RemainCount;
317 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
318 << " Versions (covering " << SumForOpt << " out of "
319 << TotalCount << ")\n");
321 // mem_op(..., size)
322 // ==>
323 // switch (size) {
324 // case s1:
325 // mem_op(..., s1);
326 // goto merge_bb;
327 // case s2:
328 // mem_op(..., s2);
329 // goto merge_bb;
330 // ...
331 // default:
332 // mem_op(..., size);
333 // goto merge_bb;
334 // }
335 // merge_bb:
337 BasicBlock *BB = MI->getParent();
338 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
339 LLVM_DEBUG(dbgs() << *BB << "\n");
340 auto OrigBBFreq = BFI.getBlockFreq(BB);
342 BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
343 BasicBlock::iterator It(*MI);
344 ++It;
345 assert(It != DefaultBB->end());
346 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
347 MergeBB->setName("MemOP.Merge");
348 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
349 DefaultBB->setName("MemOP.Default");
351 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
352 auto &Ctx = Func.getContext();
353 IRBuilder<> IRB(BB);
354 BB->getTerminator()->eraseFromParent();
355 Value *SizeVar = MI->getLength();
356 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
358 // Clear the value profile data.
359 MI->setMetadata(LLVMContext::MD_prof, nullptr);
360 // If all promoted, we don't need the MD.prof metadata.
361 if (SavedRemainCount > 0 || Version != NumVals)
362 // Otherwise we need update with the un-promoted records back.
363 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
364 SavedRemainCount, IPVK_MemOPSize, NumVals);
366 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
368 std::vector<DominatorTree::UpdateType> Updates;
369 if (DT)
370 Updates.reserve(2 * SizeIds.size());
372 for (uint64_t SizeId : SizeIds) {
373 BasicBlock *CaseBB = BasicBlock::Create(
374 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
375 Instruction *NewInst = MI->clone();
376 // Fix the argument.
377 MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst);
378 IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
379 assert(SizeType && "Expected integer type size argument.");
380 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
381 MemI->setLength(CaseSizeId);
382 CaseBB->getInstList().push_back(NewInst);
383 IRBuilder<> IRBCase(CaseBB);
384 IRBCase.CreateBr(MergeBB);
385 SI->addCase(CaseSizeId, CaseBB);
386 if (DT) {
387 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
388 Updates.push_back({DominatorTree::Insert, BB, CaseBB});
390 LLVM_DEBUG(dbgs() << *CaseBB << "\n");
392 DTU.applyUpdates(Updates);
393 Updates.clear();
395 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
397 LLVM_DEBUG(dbgs() << *BB << "\n");
398 LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
399 LLVM_DEBUG(dbgs() << *MergeBB << "\n");
401 ORE.emit([&]() {
402 using namespace ore;
403 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
404 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
405 << " with count " << NV("Count", SumForOpt) << " out of "
406 << NV("Total", TotalCount) << " for " << NV("Versions", Version)
407 << " versions";
410 return true;
412 } // namespace
414 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
415 OptimizationRemarkEmitter &ORE,
416 DominatorTree *DT) {
417 if (DisableMemOPOPT)
418 return false;
420 if (F.hasFnAttribute(Attribute::OptimizeForSize))
421 return false;
422 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
423 MemOPSizeOpt.perform();
424 return MemOPSizeOpt.isChanged();
427 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
428 BlockFrequencyInfo &BFI =
429 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
430 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
431 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
432 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
433 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
436 namespace llvm {
437 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
439 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
440 FunctionAnalysisManager &FAM) {
441 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
442 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
443 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
444 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
445 if (!Changed)
446 return PreservedAnalyses::all();
447 auto PA = PreservedAnalyses();
448 PA.preserve<GlobalsAA>();
449 PA.preserve<DominatorTreeAnalysis>();
450 return PA;
452 } // namespace llvm