1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the FrameOptimizerPass class.
11 //===----------------------------------------------------------------------===//
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 #include "bolt/Passes/BinaryFunctionCallGraph.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "bolt/Utils/CommandLineOpts.h"
21 #include "llvm/Support/Timer.h"
23 #include <unordered_map>
25 #define DEBUG_TYPE "fop"
30 extern cl::opt
<unsigned> Verbosity
;
31 extern cl::opt
<bool> TimeOpts
;
32 extern cl::OptionCategory BoltOptCategory
;
36 cl::opt
<FrameOptimizationType
>
37 FrameOptimization("frame-opt",
39 cl::desc("optimize stack frame accesses"),
41 clEnumValN(FOP_NONE
, "none", "do not perform frame optimization"),
42 clEnumValN(FOP_HOT
, "hot", "perform FOP on hot functions"),
43 clEnumValN(FOP_ALL
, "all", "perform FOP on all functions")),
45 cl::cat(BoltOptCategory
));
47 cl::opt
<bool> RemoveStores(
48 "frame-opt-rm-stores", cl::init(FOP_NONE
),
49 cl::desc("apply additional analysis to remove stores (experimental)"),
50 cl::cat(BoltOptCategory
));
57 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis
&RA
,
58 const FrameAnalysis
&FA
,
60 StackAvailableExpressions
SAE(RA
, FA
, BF
);
63 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
64 std::deque
<std::pair
<BinaryBasicBlock
*, MCInst
*>> ToErase
;
66 const auto ExprEnd
= SAE
.expr_end();
67 MCPlusBuilder
*MIB
= BF
.getBinaryContext().MIB
.get();
68 for (BinaryBasicBlock
&BB
: BF
) {
69 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB
.getName() << "\n");
70 const MCInst
*Prev
= nullptr;
71 for (MCInst
&Inst
: BB
) {
73 dbgs() << "\t\tNow at ";
75 for (auto I
= Prev
? SAE
.expr_begin(*Prev
) : SAE
.expr_begin(BB
);
77 dbgs() << "\t\t\tReached by: ";
81 // if Inst is a load from stack and the current available expressions show
82 // this value is available in a register or immediate, replace this load
83 // with move from register or from immediate.
84 ErrorOr
<const FrameIndexEntry
&> FIEX
= FA
.getFIEFor(Inst
);
89 // FIXME: Change to remove IsSimple == 0. We're being conservative here,
90 // but once replaceMemOperandWithReg is ready, we should feed it with all
91 // sorts of complex instructions.
92 if (FIEX
->IsLoad
== false || FIEX
->IsSimple
== false ||
93 FIEX
->StackOffset
>= 0) {
98 for (auto I
= Prev
? SAE
.expr_begin(*Prev
) : SAE
.expr_begin(BB
);
100 const MCInst
*AvailableInst
= *I
;
101 ErrorOr
<const FrameIndexEntry
&> FIEY
= FA
.getFIEFor(*AvailableInst
);
104 assert(FIEY
->IsStore
&& FIEY
->IsSimple
);
105 if (FIEX
->StackOffset
!= FIEY
->StackOffset
|| FIEX
->Size
!= FIEY
->Size
)
107 // TODO: Change push/pops to stack adjustment instruction
108 if (MIB
->isPop(Inst
))
112 FreqRedundantLoads
+= BB
.getKnownExecutionCount();
114 LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
115 LLVM_DEBUG(Inst
.dump());
116 LLVM_DEBUG(dbgs() << "Related store instruction: ");
117 LLVM_DEBUG(AvailableInst
->dump());
118 LLVM_DEBUG(dbgs() << "@BB: " << BB
.getName() << "\n");
120 if (FIEY
->IsStoreFromReg
) {
121 if (!MIB
->replaceMemOperandWithReg(Inst
, FIEY
->RegOrImm
)) {
122 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
125 FreqLoadsChangedToReg
+= BB
.getKnownExecutionCount();
126 MIB
->removeAnnotation(Inst
, "FrameAccessEntry");
127 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
128 if (MIB
->isRedundantMove(Inst
)) {
130 FreqLoadsDeleted
+= BB
.getKnownExecutionCount();
131 LLVM_DEBUG(dbgs() << "Created a redundant move\n");
133 ToErase
.push_front(std::make_pair(&BB
, &Inst
));
136 char Buf
[8] = {0, 0, 0, 0, 0, 0, 0, 0};
137 support::ulittle64_t::ref(Buf
+ 0) = FIEY
->RegOrImm
;
138 LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
139 if (!MIB
->replaceMemOperandWithImm(Inst
, StringRef(Buf
, 8), 0)) {
140 LLVM_DEBUG(dbgs() << "FAILED\n");
142 FreqLoadsChangedToImm
+= BB
.getKnownExecutionCount();
143 MIB
->removeAnnotation(Inst
, "FrameAccessEntry");
144 LLVM_DEBUG(dbgs() << "Ok\n");
147 LLVM_DEBUG(dbgs() << "Changed to: ");
148 LLVM_DEBUG(Inst
.dump());
155 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF
.getPrintName() << "\"\n");
157 // TODO: Implement an interface of eraseInstruction that works out the
158 // complete list of elements to remove.
159 for (std::pair
<BinaryBasicBlock
*, MCInst
*> I
: ToErase
)
160 I
.first
->eraseInstruction(I
.first
->findInstruction(I
.second
));
163 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis
&FA
,
164 BinaryFunction
&BF
) {
165 StackReachingUses
SRU(FA
, BF
);
168 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
169 std::vector
<std::pair
<BinaryBasicBlock
*, MCInst
*>> ToErase
;
170 bool Changed
= false;
171 for (BinaryBasicBlock
&BB
: BF
) {
172 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB
.getName() << "\n");
173 const MCInst
*Prev
= nullptr;
174 for (MCInst
&Inst
: llvm::reverse(BB
)) {
176 dbgs() << "\t\tNow at ";
178 for (auto I
= Prev
? SRU
.expr_begin(*Prev
) : SRU
.expr_begin(BB
);
179 I
!= SRU
.expr_end(); ++I
) {
180 dbgs() << "\t\t\tReached by: ";
184 ErrorOr
<const FrameIndexEntry
&> FIEX
= FA
.getFIEFor(Inst
);
189 if (FIEX
->IsLoad
|| !FIEX
->IsSimple
|| FIEX
->StackOffset
>= 0) {
194 if (SRU
.isStoreUsed(*FIEX
,
195 Prev
? SRU
.expr_begin(*Prev
) : SRU
.expr_begin(BB
))) {
199 // TODO: Change push/pops to stack adjustment instruction
200 if (BF
.getBinaryContext().MIB
->isPush(Inst
))
203 ++NumRedundantStores
;
204 FreqRedundantStores
+= BB
.getKnownExecutionCount();
206 LLVM_DEBUG(dbgs() << "Unused store instruction: ");
207 LLVM_DEBUG(Inst
.dump());
208 LLVM_DEBUG(dbgs() << "@BB: " << BB
.getName() << "\n");
209 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX
->StackOffset
210 << " size = " << (int)FIEX
->Size
<< "\n");
212 ToErase
.emplace_back(&BB
, &Inst
);
217 for (std::pair
<BinaryBasicBlock
*, MCInst
*> I
: ToErase
)
218 I
.first
->eraseInstruction(I
.first
->findInstruction(I
.second
));
221 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF
.getPrintName() << "\"\n");
224 void FrameOptimizerPass::runOnFunctions(BinaryContext
&BC
) {
225 if (opts::FrameOptimization
== FOP_NONE
)
228 std::unique_ptr
<BinaryFunctionCallGraph
> CG
;
229 std::unique_ptr
<FrameAnalysis
> FA
;
230 std::unique_ptr
<RegAnalysis
> RA
;
233 NamedRegionTimer
T1("callgraph", "create call graph", "FOP",
234 "FOP breakdown", opts::TimeOpts
);
235 CG
= std::make_unique
<BinaryFunctionCallGraph
>(buildCallGraph(BC
));
239 NamedRegionTimer
T1("frameanalysis", "frame analysis", "FOP",
240 "FOP breakdown", opts::TimeOpts
);
241 FA
= std::make_unique
<FrameAnalysis
>(BC
, *CG
);
245 NamedRegionTimer
T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
247 RA
= std::make_unique
<RegAnalysis
>(BC
, &BC
.getBinaryFunctions(), CG
.get());
250 // Perform caller-saved register optimizations, then callee-saved register
251 // optimizations (shrink wrapping)
252 for (auto &I
: BC
.getBinaryFunctions()) {
253 if (!FA
->hasFrameInfo(I
.second
))
255 // Restrict pass execution if user asked to only run on hot functions
256 if (opts::FrameOptimization
== FOP_HOT
) {
257 if (I
.second
.getKnownExecutionCount() < BC
.getHotThreshold())
260 dbgs() << "Considering " << I
.second
.getPrintName()
261 << " for frame optimizations because its execution count ( "
262 << I
.second
.getKnownExecutionCount()
263 << " ) exceeds our hotness threshold ( "
264 << BC
.getHotThreshold() << " )\n");
268 NamedRegionTimer
T1("removeloads", "remove loads", "FOP", "FOP breakdown",
270 if (!FA
->hasStackArithmetic(I
.second
))
271 removeUnnecessaryLoads(*RA
, *FA
, I
.second
);
274 if (opts::RemoveStores
) {
275 NamedRegionTimer
T1("removestores", "remove stores", "FOP",
276 "FOP breakdown", opts::TimeOpts
);
277 if (!FA
->hasStackArithmetic(I
.second
))
278 removeUnusedStores(*FA
, I
.second
);
280 // Don't even start shrink wrapping if no profiling info is available
281 if (I
.second
.getKnownExecutionCount() == 0)
286 NamedRegionTimer
T1("shrinkwrapping", "shrink wrapping", "FOP",
287 "FOP breakdown", opts::TimeOpts
);
288 performShrinkWrapping(*RA
, *FA
, BC
);
291 outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292 << " redundant load(s) and " << NumRedundantStores
293 << " unused store(s)\n";
294 outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads
295 << " and frequency of unused stores is " << FreqRedundantStores
297 outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
298 << FreqLoadsChangedToReg
299 << " and frequency of loads changed to use an immediate is "
300 << FreqLoadsChangedToImm
<< "\n";
301 outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
302 << " load(s) (dyn count: " << FreqLoadsDeleted
<< ") and "
303 << NumRedundantStores
<< " store(s)\n";
305 ShrinkWrapping::printStats();
308 void FrameOptimizerPass::performShrinkWrapping(const RegAnalysis
&RA
,
309 const FrameAnalysis
&FA
,
311 // Initialize necessary annotations to allow safe parallel accesses to
312 // annotation index in MIB
313 BC
.MIB
->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
314 BC
.MIB
->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
315 BC
.MIB
->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
316 BC
.MIB
->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
317 BC
.MIB
->getOrCreateAnnotationIndex(
318 StackLayoutModifier::getOffsetCFIRegTagName());
319 BC
.MIB
->getOrCreateAnnotationIndex("ReachingDefs");
320 BC
.MIB
->getOrCreateAnnotationIndex("ReachingUses");
321 BC
.MIB
->getOrCreateAnnotationIndex("LivenessAnalysis");
322 BC
.MIB
->getOrCreateAnnotationIndex("StackReachingUses");
323 BC
.MIB
->getOrCreateAnnotationIndex("PostDominatorAnalysis");
324 BC
.MIB
->getOrCreateAnnotationIndex("DominatorAnalysis");
325 BC
.MIB
->getOrCreateAnnotationIndex("StackPointerTracking");
326 BC
.MIB
->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
327 BC
.MIB
->getOrCreateAnnotationIndex("StackAvailableExpressions");
328 BC
.MIB
->getOrCreateAnnotationIndex("StackAllocationAnalysis");
329 BC
.MIB
->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
330 BC
.MIB
->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
331 BC
.MIB
->getOrCreateAnnotationIndex("ReachingInsnsBackward");
332 BC
.MIB
->getOrCreateAnnotationIndex("ReachingInsns");
333 BC
.MIB
->getOrCreateAnnotationIndex("AccessesDeletedPos");
334 BC
.MIB
->getOrCreateAnnotationIndex("DeleteMe");
336 std::vector
<std::pair
<uint64_t, const BinaryFunction
*>> Top10Funcs
;
337 auto LogFunc
= [&](BinaryFunction
&BF
) {
338 auto Lower
= llvm::lower_bound(
339 Top10Funcs
, BF
.getKnownExecutionCount(),
340 [](const std::pair
<uint64_t, const BinaryFunction
*> &Elmt
,
341 uint64_t Value
) { return Elmt
.first
> Value
; });
342 if (Lower
== Top10Funcs
.end() && Top10Funcs
.size() >= 10)
344 Top10Funcs
.insert(Lower
,
345 std::make_pair
<>(BF
.getKnownExecutionCount(), &BF
));
346 if (Top10Funcs
.size() > 10)
347 Top10Funcs
.resize(10);
351 ParallelUtilities::PredicateTy SkipPredicate
= [&](const BinaryFunction
&BF
) {
352 if (BF
.getFunctionScore() == 0)
358 const bool HotOnly
= opts::FrameOptimization
== FOP_HOT
;
360 ParallelUtilities::WorkFuncWithAllocTy WorkFunction
=
361 [&](BinaryFunction
&BF
, MCPlusBuilder::AllocatorIdTy AllocatorId
) {
362 DataflowInfoManager
Info(BF
, &RA
, &FA
, AllocatorId
);
363 ShrinkWrapping
SW(FA
, BF
, Info
, AllocatorId
);
365 if (SW
.perform(HotOnly
)) {
366 std::lock_guard
<std::mutex
> Lock(FuncsChangedMutex
);
367 FuncsChanged
.insert(&BF
);
368 LLVM_DEBUG(LogFunc(BF
));
372 ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
373 BC
, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC
, WorkFunction
,
374 SkipPredicate
, "shrink-wrapping");
376 if (!Top10Funcs
.empty()) {
377 outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
378 for (const auto &Elmt
: Top10Funcs
)
379 outs() << Elmt
.first
<< " : " << Elmt
.second
->getPrintName() << "\n";