//===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the FrameOptimizerPass class.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/FrameOptimizer.h"
#include "bolt/Core/BinaryFunctionCallGraph.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/ShrinkWrapping.h"
#include "bolt/Passes/StackAvailableExpressions.h"
#include "bolt/Passes/StackReachingUses.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/Timer.h"
#include <deque>

#define DEBUG_TYPE "fop"

using namespace llvm;

namespace opts {
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> TimeOpts;
extern cl::OptionCategory BoltOptCategory;

using namespace bolt;

cl::opt<FrameOptimizationType>
FrameOptimization("frame-opt",
  cl::init(FOP_NONE),
  cl::desc("optimize stack frame accesses"),
  cl::values(
    clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
    clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
    clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

cl::opt<bool> RemoveStores(
    "frame-opt-rm-stores", cl::init(false),
    cl::desc("apply additional analysis to remove stores (experimental)"),
    cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {
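
// removeUnnecessaryLoads: forward pass over a single function.
// StackAvailableExpressions computes, at each program point, the set of simple
// stack stores whose stored value is still available in a register or as an
// immediate. A simple stack load that matches one of those stores (same
// offset and size) is rewritten into a register move or an immediate move.
// Illustrative transformation (hypothetical x86-64 AT&T snippet, not taken
// from a real binary):
//
//   mov %r12, -0x18(%rsp)    # spill
//   ...                      # neither %r12 nor the slot is clobbered
//   mov -0x18(%rsp), %rax    # rewritten to: mov %r12, %rax
//
// If the rewritten move ends up with identical source and destination
// registers, the instruction is deleted outright.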
void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
                                                const FrameAnalysis &FA,
                                                BinaryFunction &BF) {
  StackAvailableExpressions SAE(RA, FA, BF);
  SAE.run();

  LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
  std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
  bool Changed = false;
  const auto ExprEnd = SAE.expr_end();
  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : BF) {
    LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
    const MCInst *Prev = nullptr;
    for (MCInst &Inst : BB) {
      LLVM_DEBUG({
        dbgs() << "\t\tNow at ";
        Inst.dump();
        for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
             I != ExprEnd; ++I) {
          dbgs() << "\t\t\tReached by: ";
          (*I)->dump();
        }
      });
      // If Inst is a load from the stack and the current available expressions
      // show that this value is available in a register or as an immediate,
      // replace the load with a move from that register or with the immediate.
      ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
      if (!FIEX) {
        Prev = &Inst;
        continue;
      }
      // FIXME: Change to remove IsSimple == 0. We're being conservative here,
      // but once replaceMemOperandWithReg is ready, we should feed it with all
      // sorts of complex instructions.
      if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
          FIEX->StackOffset >= 0) {
        Prev = &Inst;
        continue;
      }

      for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
           I != ExprEnd; ++I) {
        const MCInst *AvailableInst = *I;
        ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
        if (!FIEY)
          continue;
        assert(FIEY->IsStore && FIEY->IsSimple);
        if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
          continue;
        // TODO: Change push/pops to stack adjustment instruction
        if (MIB->isPop(Inst))
          continue;

        ++NumRedundantLoads;
        FreqRedundantLoads += BB.getKnownExecutionCount();
        Changed = true;
        LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
        LLVM_DEBUG(Inst.dump());
        LLVM_DEBUG(dbgs() << "Related store instruction: ");
        LLVM_DEBUG(AvailableInst->dump());
        LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
        // Replace load
        if (FIEY->IsStoreFromReg) {
          if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
            LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
            break;
          }
          FreqLoadsChangedToReg += BB.getKnownExecutionCount();
          MIB->removeAnnotation(Inst, "FrameAccessEntry");
          LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
          if (MIB->isRedundantMove(Inst)) {
            ++NumLoadsDeleted;
            FreqLoadsDeleted += BB.getKnownExecutionCount();
            LLVM_DEBUG(dbgs() << "Created a redundant move\n");
            // Delete it!
            ToErase.push_front(std::make_pair(&BB, &Inst));
          }
        } else {
          char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
          support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
          LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
          if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
            LLVM_DEBUG(dbgs() << "FAILED\n");
          } else {
            FreqLoadsChangedToImm += BB.getKnownExecutionCount();
            MIB->removeAnnotation(Inst, "FrameAccessEntry");
            LLVM_DEBUG(dbgs() << "Ok\n");
          }
        }
        LLVM_DEBUG(dbgs() << "Changed to: ");
        LLVM_DEBUG(Inst.dump());
        break;
      }
      Prev = &Inst;
    }
  }
  if (Changed)
    LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");

  // TODO: Implement an interface of eraseInstruction that works out the
  // complete list of elements to remove.
  for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
    I.first->eraseInstruction(I.first->findInstruction(I.second));
}
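
// removeUnusedStores: backward pass over a single function.
// StackReachingUses reports, at each program point, which stack positions may
// still be read later on. A simple store to a negative stack offset that is
// not reached by any such use (and is not a push) is dead and gets erased.
// Hypothetical x86-64 example (for illustration only):
//
//   mov %rbx, -0x20(%rsp)    # no later access of -0x20(%rsp) reaches here
//   ...
//   ret                      # -> the store above is removed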
void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
                                            BinaryFunction &BF) {
  StackReachingUses SRU(FA, BF);
  SRU.run();

  LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
  std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
  bool Changed = false;
  for (BinaryBasicBlock &BB : BF) {
    LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
    const MCInst *Prev = nullptr;
    for (MCInst &Inst : llvm::reverse(BB)) {
      LLVM_DEBUG({
        dbgs() << "\t\tNow at ";
        Inst.dump();
        for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
             I != SRU.expr_end(); ++I) {
          dbgs() << "\t\t\tReached by: ";
          (*I)->dump();
        }
      });
      ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
      if (!FIEX) {
        Prev = &Inst;
        continue;
      }
      if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
        Prev = &Inst;
        continue;
      }

      if (SRU.isStoreUsed(*FIEX,
                          Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
        Prev = &Inst;
        continue;
      }
      // TODO: Change push/pops to stack adjustment instruction
      if (BF.getBinaryContext().MIB->isPush(Inst))
        continue;

      ++NumRedundantStores;
      FreqRedundantStores += BB.getKnownExecutionCount();
      Changed = true;
      LLVM_DEBUG(dbgs() << "Unused store instruction: ");
      LLVM_DEBUG(Inst.dump());
      LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
      LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
                        << " size = " << (int)FIEX->Size << "\n");
      // Delete it!
      ToErase.emplace_back(&BB, &Inst);
      Prev = &Inst;
    }
  }

  for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
    I.first->eraseInstruction(I.first->findInstruction(I.second));

  if (Changed)
    LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
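
// Pass driver. Builds the call graph, FrameAnalysis and RegAnalysis once, then
// visits every function that has frame information: redundant loads (and, with
// -frame-opt-rm-stores, unused stores) are removed for functions without stack
// arithmetic, and shrink wrapping runs afterwards over the whole binary. With
// -frame-opt=hot, only functions above the hotness threshold are considered.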
Error FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
  if (opts::FrameOptimization == FOP_NONE)
    return Error::success();

  std::unique_ptr<BinaryFunctionCallGraph> CG;
  std::unique_ptr<FrameAnalysis> FA;
  std::unique_ptr<RegAnalysis> RA;

  {
    NamedRegionTimer T1("callgraph", "create call graph", "FOP",
                        "FOP breakdown", opts::TimeOpts);
    CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
  }

  {
    NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
                        "FOP breakdown", opts::TimeOpts);
    FA = std::make_unique<FrameAnalysis>(BC, *CG);
  }

  {
    NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
                        opts::TimeOpts);
    RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
  }

  // Perform caller-saved register optimizations, then callee-saved register
  // optimizations (shrink wrapping).
  for (auto &I : BC.getBinaryFunctions()) {
    if (!FA->hasFrameInfo(I.second))
      continue;
    // Restrict pass execution if the user asked to only run on hot functions.
    if (opts::FrameOptimization == FOP_HOT) {
      if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
        continue;
      LLVM_DEBUG(
          dbgs() << "Considering " << I.second.getPrintName()
                 << " for frame optimizations because its execution count ( "
                 << I.second.getKnownExecutionCount()
                 << " ) exceeds our hotness threshold ( "
                 << BC.getHotThreshold() << " )\n");
    }

    {
      NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
                          opts::TimeOpts);
      if (!FA->hasStackArithmetic(I.second))
        removeUnnecessaryLoads(*RA, *FA, I.second);
    }

    if (opts::RemoveStores) {
      NamedRegionTimer T1("removestores", "remove stores", "FOP",
                          "FOP breakdown", opts::TimeOpts);
      if (!FA->hasStackArithmetic(I.second))
        removeUnusedStores(*FA, I.second);
    }
    // Don't even start shrink wrapping if no profiling info is available.
    if (I.second.getKnownExecutionCount() == 0)
      continue;
  }

  {
    NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
                        "FOP breakdown", opts::TimeOpts);
    if (Error E = performShrinkWrapping(*RA, *FA, BC))
      return Error(std::move(E));
  }

  BC.outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
            << " redundant load(s) and " << NumRedundantStores
            << " unused store(s)\n";
  BC.outs() << "BOLT-INFO: Frequency of redundant loads is "
            << FreqRedundantLoads << " and frequency of unused stores is "
            << FreqRedundantStores << "\n";
  BC.outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
            << FreqLoadsChangedToReg
            << " and frequency of loads changed to use an immediate is "
            << FreqLoadsChangedToImm << "\n";
  BC.outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
            << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
            << NumRedundantStores << " store(s)\n";
  FA->printStats();
  ShrinkWrapping::printStats(BC);
  return Error::success();
}
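
// Shrink wrapping runs in parallel, one worker per function with its own
// annotation allocator. Every MCPlus annotation index used by the dataflow
// analyses is registered up front so that concurrent workers only perform
// read-only lookups on the shared annotation name table.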
Error FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
                                                const FrameAnalysis &FA,
                                                BinaryContext &BC) {
  // Initialize necessary annotations to allow safe parallel accesses to
  // annotation index in MIB
  BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
  BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
  BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
  BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
  BC.MIB->getOrCreateAnnotationIndex(
      StackLayoutModifier::getOffsetCFIRegTagName());
  BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
  BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
  BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
  BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
  BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
  BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
  BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
  BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
  BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
  BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
  BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
  BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
  BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
  BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
  BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
  BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
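
  // Debug-only bookkeeping: track the ten hottest functions changed by shrink
  // wrapping, kept sorted by execution count in descending order. LogFunc is
  // only called under LLVM_DEBUG, hence the (void) cast that silences the
  // unused-variable warning in release builds.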
  std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
  auto LogFunc = [&](BinaryFunction &BF) {
    auto Lower = llvm::lower_bound(
        Top10Funcs, BF.getKnownExecutionCount(),
        [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
           uint64_t Value) { return Elmt.first > Value; });
    if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
      return;
    Top10Funcs.insert(Lower,
                      std::make_pair<>(BF.getKnownExecutionCount(), &BF));
    if (Top10Funcs.size() > 10)
      Top10Funcs.resize(10);
  };
  (void)LogFunc;

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    if (BF.getFunctionScore() == 0)
      return true;

    return false;
  };

  const bool HotOnly = opts::FrameOptimization == FOP_HOT;

  Error SWError = Error::success();
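
  // Per-function worker: build the dataflow analyses and run shrink wrapping
  // with the per-thread annotation allocator. The accumulated error and the
  // set of changed functions are both updated under FuncsChangedMutex because
  // workers run concurrently.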
  ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
      [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
        DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
        ShrinkWrapping SW(FA, BF, Info, AllocatorId);

        auto ChangedOrErr = SW.perform(HotOnly);
        if (auto E = ChangedOrErr.takeError()) {
          std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
          SWError = joinErrors(std::move(SWError), Error(std::move(E)));
          return;
        }
        const bool Changed = *ChangedOrErr;
        if (Changed) {
          std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
          FuncsChanged.insert(&BF);
          LLVM_DEBUG(LogFunc(BF));
        }
      };

  ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
      SkipPredicate, "shrink-wrapping");

  if (!Top10Funcs.empty()) {
    BC.outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
    for (const auto &Elmt : Top10Funcs)
      BC.outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
  }
  return SWError;
}

} // namespace bolt
} // namespace llvm