//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the transformation that optimizes memory intrinsics
// such as memcpy using the size value profile. When memory intrinsic size
// value profile metadata is available, a single memory intrinsic is expanded
// to a sequence of guarded specialized versions that are called with the
// hottest size(s), for later expansion into more optimal inline sequences.
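//
// As an illustrative sketch (the names below are hypothetical): if the size
// profile shows that most dynamic executions of
//   memcpy(Dst, Src, N)
// use N == 32, the call is conceptually rewritten to
//   if (N == 32) memcpy(Dst, Src, 32); else memcpy(Dst, Src, N);
// The actual lowering emits a switch over the hottest sizes; see the diagram
// in MemOPSizeOpt::perform below.
//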
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/ProfileData/InstrProf.h"
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <cstdint>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pgo-memop-opt"

STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");

// The minimum call count to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
                        cl::init(1000),
                        cl::desc("The minimum count to optimize memory "
                                 "intrinsic calls"));

// Command line option to disable memory intrinsic optimization. The default is
// false. This is for debugging purposes.
static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
                                     cl::Hidden, cl::desc("Disable optimize"));

// The percent threshold to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
                          cl::Hidden, cl::ZeroOrMore,
                          cl::desc("The percentage threshold for the "
                                   "memory intrinsic calls optimization"));

// Maximum number of versions for optimizing memory intrinsic call.
static cl::opt<unsigned>
    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
                    cl::ZeroOrMore,
                    cl::desc("The max version for the optimized memory "
                             "intrinsic calls"));

// Scale the counts from the annotation using the BB count value.
static cl::opt<bool>
    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
                    cl::desc("Scale the memop size counts using the basic "
                             "block count value"));

cl::opt<bool>
    MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
                       cl::Hidden,
                       cl::desc("Size-specialize memcmp and bcmp calls"));

static cl::opt<unsigned>
    MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
                    cl::desc("Optimize the memop size <= this value"));
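
// All of the above are internal (cl::Hidden) command-line knobs; as an
// illustrative example, they can be overridden when invoking a tool that
// parses LLVM cl options, e.g.:
//   opt -pgo-memop-count-threshold=500 -pgo-memop-max-version=4 ...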
namespace {
class PGOMemOPSizeOptLegacyPass : public FunctionPass {
public:
  static char ID;

  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "PGOMemOPSize"; }

private:
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }
};
} // end anonymous namespace

char PGOMemOPSizeOptLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                      "Optimize memory intrinsic using its size value profile",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                    "Optimize memory intrinsic using its size value profile",
                    false, false)

FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
  return new PGOMemOPSizeOptLegacyPass();
}

namespace {

static const char *getMIName(const MemIntrinsic *MI) {
  switch (MI->getIntrinsicID()) {
  case Intrinsic::memcpy:
    return "memcpy";
  case Intrinsic::memmove:
    return "memmove";
  case Intrinsic::memset:
    return "memset";
  default:
    return "unknown";
  }
}

// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
struct MemOp {
  Instruction *I;
  MemOp(MemIntrinsic *MI) : I(MI) {}
  MemOp(CallInst *CI) : I(CI) {}
  MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
  CallInst *asCI() { return cast<CallInst>(I); }
  MemOp clone() {
    if (auto MI = asMI())
      return MemOp(cast<MemIntrinsic>(MI->clone()));
    return MemOp(cast<CallInst>(asCI()->clone()));
  }
  Value *getLength() {
    if (auto MI = asMI())
      return MI->getLength();
    return asCI()->getArgOperand(2);
  }
  void setLength(Value *Length) {
    if (auto MI = asMI())
      return MI->setLength(Length);
    asCI()->setArgOperand(2, Length);
  }
  StringRef getFuncName() {
    if (auto MI = asMI())
      return MI->getCalledFunction()->getName();
    return asCI()->getCalledFunction()->getName();
  }
  bool isMemmove() {
    if (auto MI = asMI())
      if (MI->getIntrinsicID() == Intrinsic::memmove)
        return true;
    return false;
  }
  bool isMemcmp(TargetLibraryInfo &TLI) {
    LibFunc Func;
    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
        Func == LibFunc_memcmp) {
      return true;
    }
    return false;
  }
  bool isBcmp(TargetLibraryInfo &TLI) {
    LibFunc Func;
    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
        Func == LibFunc_bcmp) {
      return true;
    }
    return false;
  }
  const char *getName(TargetLibraryInfo &TLI) {
    if (auto MI = asMI())
      return getMIName(MI);
    LibFunc Func;
    if (TLI.getLibFunc(*asCI(), Func)) {
      if (Func == LibFunc_memcmp)
        return "memcmp";
      if (Func == LibFunc_bcmp)
        return "bcmp";
    }
    llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
    return nullptr;
  }
};

class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
public:
  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
               OptimizationRemarkEmitter &ORE, DominatorTree *DT,
               TargetLibraryInfo &TLI)
      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
    ValueDataArray =
        std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
  }
  bool isChanged() const { return Changed; }
  void perform() {
    WorkList.clear();
    visit(Func);

    for (auto &MO : WorkList) {
      ++NumOfPGOMemOPAnnotate;
      if (perform(MO)) {
        Changed = true;
        ++NumOfPGOMemOPOpt;
        LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
                          << " is Transformed.\n");
      }
    }
  }

  void visitMemIntrinsic(MemIntrinsic &MI) {
    Value *Length = MI.getLength();
    // Do not perform the optimization on constant-length calls.
    if (isa<ConstantInt>(Length))
      return;
    WorkList.push_back(MemOp(&MI));
  }

  void visitCallInst(CallInst &CI) {
    LibFunc Func;
    if (TLI.getLibFunc(CI, Func) &&
        (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
        !isa<ConstantInt>(CI.getArgOperand(2))) {
      WorkList.push_back(MemOp(&CI));
    }
  }

private:
  Function &Func;
  BlockFrequencyInfo &BFI;
  OptimizationRemarkEmitter &ORE;
  DominatorTree *DT;
  TargetLibraryInfo &TLI;
  bool Changed;
  std::vector<MemOp> WorkList;
  // The space to read the profile annotation.
  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
  bool perform(MemOp MO);
};
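
// A size value is only worth specializing when its count is large enough both
// in absolute terms (MemOPCountThreshold) and relative to the count that has
// not been promoted yet (MemOPPercentThreshold percent of TotalCount).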
static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
  assert(Count <= TotalCount);
  if (Count < MemOPCountThreshold)
    return false;
  if (Count < TotalCount * MemOPPercentThreshold / 100)
    return false;
  return true;
}
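
// Rescale a size count by Num/Denom. The multiplication saturates on overflow
// so that very hot counts stay well-defined before the division.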
static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
                                      uint64_t Denom) {
  if (!MemOPScaleCount)
    return Count;
  bool Overflowed;
  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
  return ScaleCount / Denom;
}

bool MemOPSizeOpt::perform(MemOp MO) {
  assert(MO.I);
  if (MO.isMemmove())
    return false;
  if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
    return false;

  uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
  uint64_t TotalCount;
  if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
                                ValueDataArray.get(), NumVals, TotalCount))
    return false;

  uint64_t ActualCount = TotalCount;
  uint64_t SavedTotalCount = TotalCount;
  if (MemOPScaleCount) {
    auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
    if (!BBEdgeCount)
      return false;
    ActualCount = *BBEdgeCount;
  }

  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
                    << ActualCount << "\n");
  LLVM_DEBUG(
      for (auto &VD
           : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });

  if (ActualCount < MemOPCountThreshold)
    return false;
  // Skip if the total value profiled count is 0, in which case we can't
  // scale up the counts properly (and there is no profitable transformation).
  if (TotalCount == 0)
    return false;

  TotalCount = ActualCount;
  if (MemOPScaleCount)
    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
                      << " denominator = " << SavedTotalCount << "\n");

  // Keep track of the count of the default case:
  uint64_t RemainCount = TotalCount;
  uint64_t SavedRemainCount = SavedTotalCount;
  SmallVector<uint64_t, 16> SizeIds;
  SmallVector<uint64_t, 16> CaseCounts;
  uint64_t MaxCount = 0;
  unsigned Version = 0;
  int64_t LastV = -1;
  // Default case is in the front -- save the slot here.
  CaseCounts.push_back(0);
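  // Entries that are not promoted below are collected in RemainingVDs so that
  // the leftover value profile can be re-annotated on the fall-back call.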
  SmallVector<InstrProfValueData, 24> RemainingVDs;
  for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
    auto &VD = *I;
    int64_t V = VD.Value;
    uint64_t C = VD.Count;
    if (MemOPScaleCount)
      C = getScaledCount(C, ActualCount, SavedTotalCount);

    if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
      RemainingVDs.push_back(VD);
      continue;
    }

    // ValueCounts are sorted on the count. Break at the first un-profitable
    // value.
    if (!isProfitable(C, RemainCount)) {
      RemainingVDs.insert(RemainingVDs.end(), I, E);
      break;
    }

    if (V == LastV) {
      LLVM_DEBUG(dbgs() << "Invalid Profile Data in Function " << Func.getName()
                        << ": Two consecutive, identical values in MemOp value "
                           "counts.\n");
      return false;
    }

    LastV = V;

    SizeIds.push_back(V);
    CaseCounts.push_back(C);
    if (C > MaxCount)
      MaxCount = C;

    assert(RemainCount >= C);
    RemainCount -= C;
    assert(SavedRemainCount >= VD.Count);
    SavedRemainCount -= VD.Count;

    if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
      RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
      break;
    }
  }

  if (Version == 0)
    return false;

  CaseCounts[0] = RemainCount;
  if (RemainCount > MaxCount)
    MaxCount = RemainCount;

  uint64_t SumForOpt = TotalCount - RemainCount;

  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
                    << " Versions (covering " << SumForOpt << " out of "
                    << TotalCount << ")\n");

  // mem_op(..., size)
  // ==>
  // switch (size) {
  // case s1:
  //    mem_op(..., s1);
  //    goto merge_bb;
  // case s2:
  //    mem_op(..., s2);
  //    goto merge_bb;
  // ...
  // default:
  //    mem_op(..., size);
  //    goto merge_bb;
  // }
  // merge_bb:

  BasicBlock *BB = MO.I->getParent();
  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
  LLVM_DEBUG(dbgs() << *BB << "\n");
  auto OrigBBFreq = BFI.getBlockFreq(BB);

  BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
  BasicBlock::iterator It(*MO.I);
  ++It;
  assert(It != DefaultBB->end());
  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
  MergeBB->setName("MemOP.Merge");
  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
  DefaultBB->setName("MemOP.Default");

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  auto &Ctx = Func.getContext();
  IRBuilder<> IRB(BB);
  BB->getTerminator()->eraseFromParent();
  Value *SizeVar = MO.getLength();
  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
  Type *MemOpTy = MO.I->getType();
  PHINode *PHI = nullptr;
  if (!MemOpTy->isVoidTy()) {
    // Insert a phi for the return values at the merge block.
    IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
    PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
    MO.I->replaceAllUsesWith(PHI);
    PHI->addIncoming(MO.I, DefaultBB);
  }

  // Clear the value profile data.
  MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
  // If all values were promoted, we do not need the value profile metadata.
  if (SavedRemainCount > 0 || Version != NumVals) {
    // Otherwise, re-annotate the call with the records that were not promoted.
    ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
    annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
                      IPVK_MemOPSize, NumVals);
  }

  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");

  std::vector<DominatorTree::UpdateType> Updates;
  if (DT)
    Updates.reserve(2 * SizeIds.size());

  for (uint64_t SizeId : SizeIds) {
    BasicBlock *CaseBB = BasicBlock::Create(
        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
    MemOp NewMO = MO.clone();
    // Fix the argument.
    auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
    assert(SizeType && "Expected integer type size argument.");
    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
    NewMO.setLength(CaseSizeId);
    CaseBB->getInstList().push_back(NewMO.I);
    IRBuilder<> IRBCase(CaseBB);
    IRBCase.CreateBr(MergeBB);
    SI->addCase(CaseSizeId, CaseBB);
    if (!MemOpTy->isVoidTy())
      PHI->addIncoming(NewMO.I, CaseBB);
    if (DT) {
      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
    }
    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
  }
  DTU.applyUpdates(Updates);
  Updates.clear();

  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);

  LLVM_DEBUG(dbgs() << *BB << "\n");
  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
  LLVM_DEBUG(dbgs() << *MergeBB << "\n");

  ORE.emit([&]() {
    using namespace ore;
    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
           << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
           << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
           << " for " << NV("Versions", Version) << " versions";
  });

  return true;
}
} // namespace

static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
                                OptimizationRemarkEmitter &ORE,
                                DominatorTree *DT, TargetLibraryInfo &TLI) {
  if (DisableMemOPOPT)
    return false;

  if (F.hasFnAttribute(Attribute::OptimizeForSize))
    return false;
  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
  MemOPSizeOpt.perform();
  return MemOPSizeOpt.isChanged();
}

bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
  BlockFrequencyInfo &BFI =
      getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  TargetLibraryInfo &TLI =
      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
}

namespace llvm {
char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;

PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
                                       FunctionAnalysisManager &FAM) {
  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
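  // The dominator tree is optional here: only use it if it has already been
  // computed, so this pass does not force the analysis to run.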
  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
  if (!Changed)
    return PreservedAnalyses::all();
  auto PA = PreservedAnalyses();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}
} // namespace llvm