1 //===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the FrameOptimizerPass class.
11 //===----------------------------------------------------------------------===//
13 #include "bolt/Passes/FrameOptimizer.h"
14 #include "bolt/Core/BinaryFunctionCallGraph.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/DataflowInfoManager.h"
17 #include "bolt/Passes/ShrinkWrapping.h"
18 #include "bolt/Passes/StackAvailableExpressions.h"
19 #include "bolt/Passes/StackReachingUses.h"
20 #include "bolt/Utils/CommandLineOpts.h"
21 #include "llvm/Support/Timer.h"
24 #define DEBUG_TYPE "fop"
29 extern cl::opt
<unsigned> Verbosity
;
30 extern cl::opt
<bool> TimeOpts
;
31 extern cl::OptionCategory BoltOptCategory
;
35 cl::opt
<FrameOptimizationType
>
36 FrameOptimization("frame-opt",
38 cl::desc("optimize stack frame accesses"),
40 clEnumValN(FOP_NONE
, "none", "do not perform frame optimization"),
41 clEnumValN(FOP_HOT
, "hot", "perform FOP on hot functions"),
42 clEnumValN(FOP_ALL
, "all", "perform FOP on all functions")),
44 cl::cat(BoltOptCategory
));
46 cl::opt
<bool> RemoveStores(
47 "frame-opt-rm-stores", cl::init(FOP_NONE
),
48 cl::desc("apply additional analysis to remove stores (experimental)"),
49 cl::cat(BoltOptCategory
));
56 void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis
&RA
,
57 const FrameAnalysis
&FA
,
59 StackAvailableExpressions
SAE(RA
, FA
, BF
);
62 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
63 std::deque
<std::pair
<BinaryBasicBlock
*, MCInst
*>> ToErase
;
65 const auto ExprEnd
= SAE
.expr_end();
66 MCPlusBuilder
*MIB
= BF
.getBinaryContext().MIB
.get();
67 for (BinaryBasicBlock
&BB
: BF
) {
68 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB
.getName() << "\n");
69 const MCInst
*Prev
= nullptr;
70 for (MCInst
&Inst
: BB
) {
72 dbgs() << "\t\tNow at ";
74 for (auto I
= Prev
? SAE
.expr_begin(*Prev
) : SAE
.expr_begin(BB
);
76 dbgs() << "\t\t\tReached by: ";
80 // if Inst is a load from stack and the current available expressions show
81 // this value is available in a register or immediate, replace this load
82 // with move from register or from immediate.
83 ErrorOr
<const FrameIndexEntry
&> FIEX
= FA
.getFIEFor(Inst
);
88 // FIXME: Change to remove IsSimple == 0. We're being conservative here,
89 // but once replaceMemOperandWithReg is ready, we should feed it with all
90 // sorts of complex instructions.
91 if (FIEX
->IsLoad
== false || FIEX
->IsSimple
== false ||
92 FIEX
->StackOffset
>= 0) {
97 for (auto I
= Prev
? SAE
.expr_begin(*Prev
) : SAE
.expr_begin(BB
);
99 const MCInst
*AvailableInst
= *I
;
100 ErrorOr
<const FrameIndexEntry
&> FIEY
= FA
.getFIEFor(*AvailableInst
);
103 assert(FIEY
->IsStore
&& FIEY
->IsSimple
);
104 if (FIEX
->StackOffset
!= FIEY
->StackOffset
|| FIEX
->Size
!= FIEY
->Size
)
106 // TODO: Change push/pops to stack adjustment instruction
107 if (MIB
->isPop(Inst
))
111 FreqRedundantLoads
+= BB
.getKnownExecutionCount();
113 LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
114 LLVM_DEBUG(Inst
.dump());
115 LLVM_DEBUG(dbgs() << "Related store instruction: ");
116 LLVM_DEBUG(AvailableInst
->dump());
117 LLVM_DEBUG(dbgs() << "@BB: " << BB
.getName() << "\n");
119 if (FIEY
->IsStoreFromReg
) {
120 if (!MIB
->replaceMemOperandWithReg(Inst
, FIEY
->RegOrImm
)) {
121 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
124 FreqLoadsChangedToReg
+= BB
.getKnownExecutionCount();
125 MIB
->removeAnnotation(Inst
, "FrameAccessEntry");
126 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
127 if (MIB
->isRedundantMove(Inst
)) {
129 FreqLoadsDeleted
+= BB
.getKnownExecutionCount();
130 LLVM_DEBUG(dbgs() << "Created a redundant move\n");
132 ToErase
.push_front(std::make_pair(&BB
, &Inst
));
135 char Buf
[8] = {0, 0, 0, 0, 0, 0, 0, 0};
136 support::ulittle64_t::ref(Buf
+ 0) = FIEY
->RegOrImm
;
137 LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
138 if (!MIB
->replaceMemOperandWithImm(Inst
, StringRef(Buf
, 8), 0)) {
139 LLVM_DEBUG(dbgs() << "FAILED\n");
141 FreqLoadsChangedToImm
+= BB
.getKnownExecutionCount();
142 MIB
->removeAnnotation(Inst
, "FrameAccessEntry");
143 LLVM_DEBUG(dbgs() << "Ok\n");
146 LLVM_DEBUG(dbgs() << "Changed to: ");
147 LLVM_DEBUG(Inst
.dump());
154 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF
.getPrintName() << "\"\n");
156 // TODO: Implement an interface of eraseInstruction that works out the
157 // complete list of elements to remove.
158 for (std::pair
<BinaryBasicBlock
*, MCInst
*> I
: ToErase
)
159 I
.first
->eraseInstruction(I
.first
->findInstruction(I
.second
));
162 void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis
&FA
,
163 BinaryFunction
&BF
) {
164 StackReachingUses
SRU(FA
, BF
);
167 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
168 std::vector
<std::pair
<BinaryBasicBlock
*, MCInst
*>> ToErase
;
169 bool Changed
= false;
170 for (BinaryBasicBlock
&BB
: BF
) {
171 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB
.getName() << "\n");
172 const MCInst
*Prev
= nullptr;
173 for (MCInst
&Inst
: llvm::reverse(BB
)) {
175 dbgs() << "\t\tNow at ";
177 for (auto I
= Prev
? SRU
.expr_begin(*Prev
) : SRU
.expr_begin(BB
);
178 I
!= SRU
.expr_end(); ++I
) {
179 dbgs() << "\t\t\tReached by: ";
183 ErrorOr
<const FrameIndexEntry
&> FIEX
= FA
.getFIEFor(Inst
);
188 if (FIEX
->IsLoad
|| !FIEX
->IsSimple
|| FIEX
->StackOffset
>= 0) {
193 if (SRU
.isStoreUsed(*FIEX
,
194 Prev
? SRU
.expr_begin(*Prev
) : SRU
.expr_begin(BB
))) {
198 // TODO: Change push/pops to stack adjustment instruction
199 if (BF
.getBinaryContext().MIB
->isPush(Inst
))
202 ++NumRedundantStores
;
203 FreqRedundantStores
+= BB
.getKnownExecutionCount();
205 LLVM_DEBUG(dbgs() << "Unused store instruction: ");
206 LLVM_DEBUG(Inst
.dump());
207 LLVM_DEBUG(dbgs() << "@BB: " << BB
.getName() << "\n");
208 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX
->StackOffset
209 << " size = " << (int)FIEX
->Size
<< "\n");
211 ToErase
.emplace_back(&BB
, &Inst
);
216 for (std::pair
<BinaryBasicBlock
*, MCInst
*> I
: ToErase
)
217 I
.first
->eraseInstruction(I
.first
->findInstruction(I
.second
));
220 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF
.getPrintName() << "\"\n");
223 Error
FrameOptimizerPass::runOnFunctions(BinaryContext
&BC
) {
224 if (opts::FrameOptimization
== FOP_NONE
)
225 return Error::success();
227 std::unique_ptr
<BinaryFunctionCallGraph
> CG
;
228 std::unique_ptr
<FrameAnalysis
> FA
;
229 std::unique_ptr
<RegAnalysis
> RA
;
232 NamedRegionTimer
T1("callgraph", "create call graph", "FOP",
233 "FOP breakdown", opts::TimeOpts
);
234 CG
= std::make_unique
<BinaryFunctionCallGraph
>(buildCallGraph(BC
));
238 NamedRegionTimer
T1("frameanalysis", "frame analysis", "FOP",
239 "FOP breakdown", opts::TimeOpts
);
240 FA
= std::make_unique
<FrameAnalysis
>(BC
, *CG
);
244 NamedRegionTimer
T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
246 RA
= std::make_unique
<RegAnalysis
>(BC
, &BC
.getBinaryFunctions(), CG
.get());
249 // Perform caller-saved register optimizations, then callee-saved register
250 // optimizations (shrink wrapping)
251 for (auto &I
: BC
.getBinaryFunctions()) {
252 if (!FA
->hasFrameInfo(I
.second
))
254 // Restrict pass execution if user asked to only run on hot functions
255 if (opts::FrameOptimization
== FOP_HOT
) {
256 if (I
.second
.getKnownExecutionCount() < BC
.getHotThreshold())
259 dbgs() << "Considering " << I
.second
.getPrintName()
260 << " for frame optimizations because its execution count ( "
261 << I
.second
.getKnownExecutionCount()
262 << " ) exceeds our hotness threshold ( "
263 << BC
.getHotThreshold() << " )\n");
267 NamedRegionTimer
T1("removeloads", "remove loads", "FOP", "FOP breakdown",
269 if (!FA
->hasStackArithmetic(I
.second
))
270 removeUnnecessaryLoads(*RA
, *FA
, I
.second
);
273 if (opts::RemoveStores
) {
274 NamedRegionTimer
T1("removestores", "remove stores", "FOP",
275 "FOP breakdown", opts::TimeOpts
);
276 if (!FA
->hasStackArithmetic(I
.second
))
277 removeUnusedStores(*FA
, I
.second
);
279 // Don't even start shrink wrapping if no profiling info is available
280 if (I
.second
.getKnownExecutionCount() == 0)
285 NamedRegionTimer
T1("shrinkwrapping", "shrink wrapping", "FOP",
286 "FOP breakdown", opts::TimeOpts
);
287 if (Error E
= performShrinkWrapping(*RA
, *FA
, BC
))
288 return Error(std::move(E
));
291 BC
.outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292 << " redundant load(s) and " << NumRedundantStores
293 << " unused store(s)\n";
294 BC
.outs() << "BOLT-INFO: Frequency of redundant loads is "
295 << FreqRedundantLoads
<< " and frequency of unused stores is "
296 << FreqRedundantStores
<< "\n";
297 BC
.outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
298 << FreqLoadsChangedToReg
299 << " and frequency of loads changed to use an immediate is "
300 << FreqLoadsChangedToImm
<< "\n";
301 BC
.outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
302 << " load(s) (dyn count: " << FreqLoadsDeleted
<< ") and "
303 << NumRedundantStores
<< " store(s)\n";
305 ShrinkWrapping::printStats(BC
);
306 return Error::success();
309 Error
FrameOptimizerPass::performShrinkWrapping(const RegAnalysis
&RA
,
310 const FrameAnalysis
&FA
,
312 // Initialize necessary annotations to allow safe parallel accesses to
313 // annotation index in MIB
314 BC
.MIB
->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getSaveTagName());
315 BC
.MIB
->getOrCreateAnnotationIndex(CalleeSavedAnalysis::getRestoreTagName());
316 BC
.MIB
->getOrCreateAnnotationIndex(StackLayoutModifier::getTodoTagName());
317 BC
.MIB
->getOrCreateAnnotationIndex(StackLayoutModifier::getSlotTagName());
318 BC
.MIB
->getOrCreateAnnotationIndex(
319 StackLayoutModifier::getOffsetCFIRegTagName());
320 BC
.MIB
->getOrCreateAnnotationIndex("ReachingDefs");
321 BC
.MIB
->getOrCreateAnnotationIndex("ReachingUses");
322 BC
.MIB
->getOrCreateAnnotationIndex("LivenessAnalysis");
323 BC
.MIB
->getOrCreateAnnotationIndex("StackReachingUses");
324 BC
.MIB
->getOrCreateAnnotationIndex("PostDominatorAnalysis");
325 BC
.MIB
->getOrCreateAnnotationIndex("DominatorAnalysis");
326 BC
.MIB
->getOrCreateAnnotationIndex("StackPointerTracking");
327 BC
.MIB
->getOrCreateAnnotationIndex("StackPointerTrackingForInternalCalls");
328 BC
.MIB
->getOrCreateAnnotationIndex("StackAvailableExpressions");
329 BC
.MIB
->getOrCreateAnnotationIndex("StackAllocationAnalysis");
330 BC
.MIB
->getOrCreateAnnotationIndex("ShrinkWrap-Todo");
331 BC
.MIB
->getOrCreateAnnotationIndex("PredictiveStackPointerTracking");
332 BC
.MIB
->getOrCreateAnnotationIndex("ReachingInsnsBackward");
333 BC
.MIB
->getOrCreateAnnotationIndex("ReachingInsns");
334 BC
.MIB
->getOrCreateAnnotationIndex("AccessesDeletedPos");
335 BC
.MIB
->getOrCreateAnnotationIndex("DeleteMe");
337 std::vector
<std::pair
<uint64_t, const BinaryFunction
*>> Top10Funcs
;
338 auto LogFunc
= [&](BinaryFunction
&BF
) {
339 auto Lower
= llvm::lower_bound(
340 Top10Funcs
, BF
.getKnownExecutionCount(),
341 [](const std::pair
<uint64_t, const BinaryFunction
*> &Elmt
,
342 uint64_t Value
) { return Elmt
.first
> Value
; });
343 if (Lower
== Top10Funcs
.end() && Top10Funcs
.size() >= 10)
345 Top10Funcs
.insert(Lower
,
346 std::make_pair
<>(BF
.getKnownExecutionCount(), &BF
));
347 if (Top10Funcs
.size() > 10)
348 Top10Funcs
.resize(10);
352 ParallelUtilities::PredicateTy SkipPredicate
= [&](const BinaryFunction
&BF
) {
353 if (BF
.getFunctionScore() == 0)
359 const bool HotOnly
= opts::FrameOptimization
== FOP_HOT
;
361 Error SWError
= Error::success();
363 ParallelUtilities::WorkFuncWithAllocTy WorkFunction
=
364 [&](BinaryFunction
&BF
, MCPlusBuilder::AllocatorIdTy AllocatorId
) {
365 DataflowInfoManager
Info(BF
, &RA
, &FA
, AllocatorId
);
366 ShrinkWrapping
SW(FA
, BF
, Info
, AllocatorId
);
368 auto ChangedOrErr
= SW
.perform(HotOnly
);
369 if (auto E
= ChangedOrErr
.takeError()) {
370 std::lock_guard
<std::mutex
> Lock(FuncsChangedMutex
);
371 SWError
= joinErrors(std::move(SWError
), Error(std::move(E
)));
374 const bool Changed
= *ChangedOrErr
;
376 std::lock_guard
<std::mutex
> Lock(FuncsChangedMutex
);
377 FuncsChanged
.insert(&BF
);
378 LLVM_DEBUG(LogFunc(BF
));
382 ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
383 BC
, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC
, WorkFunction
,
384 SkipPredicate
, "shrink-wrapping");
386 if (!Top10Funcs
.empty()) {
387 BC
.outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
388 for (const auto &Elmt
: Top10Funcs
)
389 BC
.outs() << Elmt
.first
<< " : " << Elmt
.second
->getPrintName() << "\n";