Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / bolt / lib / Passes / FrameOptimizer.cpp
blob6f6dea08950a7dd1ba425e3824b9bc36646f447c
1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the FrameOptimizerPass class.
11 //===----------------------------------------------------------------------===//
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/BinaryFunctionCallGraph.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "bolt/Utils/CommandLineOpts.h"
21 #include "llvm/Support/Timer.h"
22 #include <deque>
23 #include <unordered_map>
25 #define DEBUG_TYPE "fop"
27 using namespace llvm;
29 namespace opts {
30 extern cl::opt<unsigned> Verbosity;
31 extern cl::opt<bool> TimeOpts;
32 extern cl::OptionCategory BoltOptCategory;
34 using namespace bolt;
36 cl::opt<FrameOptimizationType>
37 FrameOptimization("frame-opt",
38 cl::init(FOP_NONE),
39 cl::desc("optimize stack frame accesses"),
40 cl::values(
41 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
42 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
43 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
44 cl::ZeroOrMore,
45 cl::cat(BoltOptCategory));
47 cl::opt<bool> RemoveStores(
48 "frame-opt-rm-stores", cl::init(FOP_NONE),
49 cl::desc("apply additional analysis to remove stores (experimental)"),
50 cl::cat(BoltOptCategory));
52 } // namespace opts
54 namespace llvm {
55 namespace bolt {
57 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
58 const FrameAnalysis &FA,
59 BinaryFunction &BF) {
60 StackAvailableExpressions SAE(RA, FA, BF);
61 SAE.run();
63 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
64 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
65 bool Changed = false;
66 const auto ExprEnd = SAE.expr_end();
67 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
68 for (BinaryBasicBlock &BB : BF) {
69 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
70 const MCInst *Prev = nullptr;
71 for (MCInst &Inst : BB) {
72 LLVM_DEBUG({
73 dbgs() << "\t\tNow at ";
74 Inst.dump();
75 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
76 I != ExprEnd; ++I) {
77 dbgs() << "\t\t\tReached by: ";
78 (*I)->dump();
80 });
81 // if Inst is a load from stack and the current available expressions show
82 // this value is available in a register or immediate, replace this load
83 // with move from register or from immediate.
84 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
85 if (!FIEX) {
86 Prev = &Inst;
87 continue;
89 // FIXME: Change to remove IsSimple == 0. We're being conservative here,
90 // but once replaceMemOperandWithReg is ready, we should feed it with all
91 // sorts of complex instructions.
92 if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
93 FIEX->StackOffset >= 0) {
94 Prev = &Inst;
95 continue;
98 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
99 I != ExprEnd; ++I) {
100 const MCInst *AvailableInst = *I;
101 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(*AvailableInst);
102 if (!FIEY)
103 continue;
104 assert(FIEY->IsStore && FIEY->IsSimple);
105 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
106 continue;
107 // TODO: Change push/pops to stack adjustment instruction
108 if (MIB->isPop(Inst))
109 continue;
111 ++NumRedundantLoads;
112 FreqRedundantLoads += BB.getKnownExecutionCount();
113 Changed = true;
114 LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
115 LLVM_DEBUG(Inst.dump());
116 LLVM_DEBUG(dbgs() << "Related store instruction: ");
117 LLVM_DEBUG(AvailableInst->dump());
118 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
119 // Replace load
120 if (FIEY->IsStoreFromReg) {
121 if (!MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
122 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
123 break;
125 FreqLoadsChangedToReg += BB.getKnownExecutionCount();
126 MIB->removeAnnotation(Inst, "FrameAccessEntry");
127 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
128 if (MIB->isRedundantMove(Inst)) {
129 ++NumLoadsDeleted;
130 FreqLoadsDeleted += BB.getKnownExecutionCount();
131 LLVM_DEBUG(dbgs() << "Created a redundant move\n");
132 // Delete it!
133 ToErase.push_front(std::make_pair(&BB, &Inst));
135 } else {
136 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
137 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
139 if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
140 LLVM_DEBUG(dbgs() << "FAILED\n");
141 } else {
142 FreqLoadsChangedToImm += BB.getKnownExecutionCount();
143 MIB->removeAnnotation(Inst, "FrameAccessEntry");
144 LLVM_DEBUG(dbgs() << "Ok\n");
147 LLVM_DEBUG(dbgs() << "Changed to: ");
148 LLVM_DEBUG(Inst.dump());
149 break;
151 Prev = &Inst;
154 if (Changed)
155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
157 // TODO: Implement an interface of eraseInstruction that works out the
158 // complete list of elements to remove.
159 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
160 I.first->eraseInstruction(I.first->findInstruction(I.second));
163 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
164 BinaryFunction &BF) {
165 StackReachingUses SRU(FA, BF);
166 SRU.run();
168 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
169 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
170 bool Changed = false;
171 for (BinaryBasicBlock &BB : BF) {
172 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
173 const MCInst *Prev = nullptr;
174 for (MCInst &Inst : llvm::reverse(BB)) {
175 LLVM_DEBUG({
176 dbgs() << "\t\tNow at ";
177 Inst.dump();
178 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
179 I != SRU.expr_end(); ++I) {
180 dbgs() << "\t\t\tReached by: ";
181 (*I)->dump();
184 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
185 if (!FIEX) {
186 Prev = &Inst;
187 continue;
189 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
190 Prev = &Inst;
191 continue;
194 if (SRU.isStoreUsed(*FIEX,
195 Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
196 Prev = &Inst;
197 continue;
199 // TODO: Change push/pops to stack adjustment instruction
200 if (BF.getBinaryContext().MIB->isPush(Inst))
201 continue;
203 ++NumRedundantStores;
204 FreqRedundantStores += BB.getKnownExecutionCount();
205 Changed = true;
206 LLVM_DEBUG(dbgs() << "Unused store instruction: ");
207 LLVM_DEBUG(Inst.dump());
208 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
209 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
210 << " size = " << (int)FIEX->Size << "\n");
211 // Delete it!
212 ToErase.emplace_back(&BB, &Inst);
213 Prev = &Inst;
217 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
218 I.first->eraseInstruction(I.first->findInstruction(I.second));
220 if (Changed)
221 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
224 void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
225 if (opts::FrameOptimization == FOP_NONE)
226 return;
228 std::unique_ptr<BinaryFunctionCallGraph> CG;
229 std::unique_ptr<FrameAnalysis> FA;
230 std::unique_ptr<RegAnalysis> RA;
233 NamedRegionTimer T1("callgraph", "create call graph", "FOP",
234 "FOP breakdown", opts::TimeOpts);
235 CG = std::make_unique<BinaryFunctionCallGraph>(buildCallGraph(BC));
239 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
240 "FOP breakdown", opts::TimeOpts);
241 FA = std::make_unique<FrameAnalysis>(BC, *CG);
245 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
246 opts::TimeOpts);
247 RA = std::make_unique<RegAnalysis>(BC, &BC.getBinaryFunctions(), CG.get());
250 // Perform caller-saved register optimizations, then callee-saved register
251 // optimizations (shrink wrapping)
252 for (auto &I : BC.getBinaryFunctions()) {
253 if (!FA->hasFrameInfo(I.second))
254 continue;
255 // Restrict pass execution if user asked to only run on hot functions
256 if (opts::FrameOptimization == FOP_HOT) {
257 if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
258 continue;
259 LLVM_DEBUG(
260 dbgs() << "Considering " << I.second.getPrintName()
261 << " for frame optimizations because its execution count ( "
262 << I.second.getKnownExecutionCount()
263 << " ) exceeds our hotness threshold ( "
264 << BC.getHotThreshold() << " )\n");
268 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
269 opts::TimeOpts);
270 if (!FA->hasStackArithmetic(I.second))
271 removeUnnecessaryLoads(*RA, *FA, I.second);
274 if (opts::RemoveStores) {
275 NamedRegionTimer T1("removestores", "remove stores", "FOP",
276 "FOP breakdown", opts::TimeOpts);
277 if (!FA->hasStackArithmetic(I.second))
278 removeUnusedStores(*FA, I.second);
280 // Don't even start shrink wrapping if no profiling info is available
281 if (I.second.getKnownExecutionCount() == 0)
282 continue;
286 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
287 "FOP breakdown", opts::TimeOpts);
288 performShrinkWrapping(*RA, *FA, BC);
291 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292 << " redundant load(s) and " << NumRedundantStores
293 << " unused store(s)\n";
294 outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
295 << " and frequency of unused stores is " << FreqRedundantStores
296 << "\n";
297 outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
298 << FreqLoadsChangedToReg
299 << " and frequency of loads changed to use an immediate is "
300 << FreqLoadsChangedToImm << "\n";
301 outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
302 << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
303 << NumRedundantStores << " store(s)\n";
304 FA->printStats();
305 ShrinkWrapping::printStats();
308 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
309 const FrameAnalysis &FA,
310 BinaryContext &BC) {
311 // Initialize necessary annotations to allow safe parallel accesses to
312 // annotation index in MIB
313 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
314 BC.MIB->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
315 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
316 BC.MIB->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
317 BC.MIB->getOrCreateAnnotationIndex(
318 StackLayoutModifier::getOffsetCFIRegTagName());
319 BC.MIB->getOrCreateAnnotationIndex("ReachingDefs");
320 BC.MIB->getOrCreateAnnotationIndex("ReachingUses");
321 BC.MIB->getOrCreateAnnotationIndex("LivenessAnalysis");
322 BC.MIB->getOrCreateAnnotationIndex("StackReachingUses");
323 BC.MIB->getOrCreateAnnotationIndex("PostDominatorAnalysis");
324 BC.MIB->getOrCreateAnnotationIndex("DominatorAnalysis");
325 BC.MIB->getOrCreateAnnotationIndex("StackPointerTracking");
326 BC.MIB->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
327 BC.MIB->getOrCreateAnnotationIndex("StackAvailableExpressions");
328 BC.MIB->getOrCreateAnnotationIndex("StackAllocationAnalysis");
329 BC.MIB->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
330 BC.MIB->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
331 BC.MIB->getOrCreateAnnotationIndex("ReachingInsnsBackward");
332 BC.MIB->getOrCreateAnnotationIndex("ReachingInsns");
333 BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos");
334 BC.MIB->getOrCreateAnnotationIndex("DeleteMe");
336 std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
337 auto LogFunc = [&](BinaryFunction &BF) {
338 auto Lower = llvm::lower_bound(
339 Top10Funcs, BF.getKnownExecutionCount(),
340 [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
341 uint64_t Value) { return Elmt.first > Value; });
342 if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
343 return;
344 Top10Funcs.insert(Lower,
345 std::make_pair<>(BF.getKnownExecutionCount(), &BF));
346 if (Top10Funcs.size() > 10)
347 Top10Funcs.resize(10);
349 (void)LogFunc;
351 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
352 if (BF.getFunctionScore() == 0)
353 return true;
355 return false;
358 const bool HotOnly = opts::FrameOptimization == FOP_HOT;
360 ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
361 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
362 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
363 ShrinkWrapping SW(FA, BF, Info, AllocatorId);
365 if (SW.perform(HotOnly)) {
366 std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
367 FuncsChanged.insert(&BF);
368 LLVM_DEBUG(LogFunc(BF));
372 ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
373 BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
374 SkipPredicate, "shrink-wrapping");
376 if (!Top10Funcs.empty()) {
377 outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
378 for (const auto &Elmt : Top10Funcs)
379 outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
383 } // namespace bolt
384 } // namespace llvm