//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file transforms calls of the current function (self recursion) followed
// by a return instruction with a branch to the entry of the function, creating
// a loop. This pass also implements the following extensions to the basic
// algorithm:
//
//  1. Trivial instructions between the call and return do not prevent the
//     transformation from taking place, though currently the analysis cannot
//     support moving any really useful instructions (only dead ones).
//  2. This pass transforms functions that are prevented from being tail
//     recursive by an associative and commutative expression to use an
//     accumulator variable, thus compiling the typical naive factorial or
//     'fib' implementation into efficient code (see the sketch below this
//     header).
//  3. TRE is performed if the function returns void, if the return
//     returns the result returned by the call, or if the function returns a
//     run-time constant on all exits from the function. It is possible, though
//     unlikely, that the return returns something else (like constant 0), and
//     can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
//     the function return the exact same value.
//  4. If it can prove that callees do not access their caller stack frame,
//     they are marked as eligible for tail call elimination (by the code
//     generator).
//
// There are several improvements that could be made:
//
//  1. If the function has any alloca instructions, these instructions will be
//     moved out of the entry block of the function, causing them to be
//     evaluated each time through the tail recursion. Safely keeping allocas
//     in the entry block requires analysis to prove that the tail-called
//     function does not read or write the stack object.
//  2. Tail recursion is only performed if the call immediately precedes the
//     return instruction. It's possible that there could be a jump between
//     the call and the return.
//  3. There can be intervening operations between the call and the return that
//     prevent the TRE from occurring. For example, there could be GEP's and
//     stores to memory that will not be read or written by the call. This
//     requires some substantial analysis (such as with DSA) to prove safe to
//     move ahead of the call, but doing so could allow many more TREs to be
//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
//  4. The algorithm we use to detect if callees access their caller stack
//     frames is very primitive.
//
//===----------------------------------------------------------------------===//
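//
// As an illustrative sketch of extension 2 above (a hand-written analogy, not
// code used by this pass): a naive factorial such as
//
//   unsigned fact(unsigned N) { return N <= 1 ? 1 : N * fact(N - 1); }
//
// is not tail recursive because the multiply happens after the call, but
// since 'mul' is associative and commutative, the pass introduces an
// accumulator, conceptually producing
//
//   unsigned fact(unsigned N) {
//     unsigned Acc = 1;                // identity of the accumulating 'mul'
//     while (N > 1) { Acc *= N; --N; } // the recursion becomes a loop
//     return Acc;
//   }
//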
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

#define DEBUG_TYPE "tailcallelim"

STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of returns duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");
/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
  // TODO: We don't do TRE if dynamic allocas are used.
  // Dynamic allocas allocate stack space which should be
  // deallocated before a new iteration is started. That is
  // currently not implemented.
  return llvm::all_of(instructions(F), [](Instruction &I) {
    auto *AI = dyn_cast<AllocaInst>(&I);
    return !AI || AI->isStaticAlloca();
  });
}
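
// As an illustration (a hand-written analogy, not code from this pass): in
//
//   void f(unsigned N) {
//     char *Buf = (char *)alloca(N); // dynamic alloca: size varies per call
//     use(Buf);
//     if (N) f(N - 1);
//   }
//
// each recursive frame owns a distinct, dynamically sized stack object.
// Turning the recursion into a loop without deallocating that object between
// iterations would grow the frame unboundedly, so canTRE() conservatively
// rejects any function containing a non-static alloca.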
namespace {
struct AllocaDerivedValueTracker {
  // Start at a root value and walk its use-def chain to mark calls that use the
  // value or a derived value in AllocaUsers, and places where it may escape in
  // EscapePoints.
  void walk(Value *Root) {
    SmallVector<Use *, 32> Worklist;
    SmallPtrSet<Use *, 32> Visited;

    auto AddUsesToWorklist = [&](Value *V) {
      for (auto &U : V->uses()) {
        if (!Visited.insert(&U).second)
          continue;
        Worklist.push_back(&U);
      }
    };

    AddUsesToWorklist(Root);

    while (!Worklist.empty()) {
      Use *U = Worklist.pop_back_val();
      Instruction *I = cast<Instruction>(U->getUser());

      switch (I->getOpcode()) {
      case Instruction::Call:
      case Instruction::Invoke: {
        auto &CB = cast<CallBase>(*I);
        // If the alloca-derived argument is passed byval it is not an escape
        // point, or a use of an alloca. Calling with byval copies the contents
        // of the alloca into argument registers or stack slots, which exist
        // beyond the lifetime of the current frame.
        if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
          continue;

        bool IsNocapture =
            CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
        callUsesLocalStack(CB, IsNocapture);
        if (IsNocapture) {
          // If the alloca-derived argument is passed in as nocapture, then it
          // can't propagate to the call's return. That would be capturing.
          continue;
        }
        break;
      }
      case Instruction::Load: {
        // The result of a load is not alloca-derived (unless an alloca has
        // otherwise escaped, but this is a local analysis).
        continue;
      }
      case Instruction::Store: {
        if (U->getOperandNo() == 0)
          EscapePoints.insert(I);
        continue; // Stores have no users to analyze.
      }
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
      case Instruction::PHI:
      case Instruction::Select:
      case Instruction::AddrSpaceCast:
        break;
      default:
        EscapePoints.insert(I);
        break;
      }

      AddUsesToWorklist(I);
    }
  }

  void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
    // Add it to the list of alloca users.
    AllocaUsers.insert(&CB);

    // If it's nocapture then it can't capture this alloca.
    if (IsNocapture)
      return;

    // If it can write to memory, it can leak the alloca value.
    if (!CB.onlyReadsMemory())
      EscapePoints.insert(&CB);
  }

  SmallPtrSet<Instruction *, 32> AllocaUsers;
  SmallPtrSet<Instruction *, 32> EscapePoints;
};
} // end anonymous namespace
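
// A small illustrative example (an IR sketch written for this comment, not
// taken from a test): given
//
//   %a = alloca i32
//   %p = getelementptr i32, ptr %a, i32 0 ; derived value, the walk continues
//   store ptr %p, ptr @g                  ; operand 0 is the value: escape
//   call void @f(ptr nocapture %p)        ; alloca user, but not an escape
//
// the tracker records the store in EscapePoints and the call in AllocaUsers.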
static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
  if (F.callsFunctionThatReturnsTwice())
    return false;

  // The local stack holds all alloca instructions and all byval arguments.
  AllocaDerivedValueTracker Tracker;
  for (Argument &Arg : F.args()) {
    if (Arg.hasByValAttr())
      Tracker.walk(&Arg);
  }
  for (auto &BB : F) {
    for (auto &I : BB)
      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
        Tracker.walk(AI);
  }

  bool Modified = false;

  // Track whether a block is reachable after an alloca has escaped. Blocks that
  // contain the escaping instruction will be marked as being visited without an
  // escaped alloca, since that is how the block began.
  enum VisitType {
    UNVISITED,
    UNESCAPED,
    ESCAPED
  };
  DenseMap<BasicBlock *, VisitType> Visited;

  // We propagate the fact that an alloca has escaped from block to successor.
  // Visit the blocks that are propagating the escapedness first. To do this, we
  // maintain two worklists.
  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;

  // We may enter a block and visit it thinking that no alloca has escaped yet,
  // then see an escape point and go back around a loop edge and come back to
  // the same block twice. Because of this, we defer setting tail on calls when
  // we first encounter them in a block. Every entry in this list does not
  // statically use an alloca via use-def chain analysis, but may find an alloca
  // through other means if the block turns out to be reachable after an escape
  // point.
  SmallVector<CallInst *, 32> DeferredTails;

  BasicBlock *BB = &F.getEntryBlock();
  VisitType Escaped = UNESCAPED;
  do {
    for (auto &I : *BB) {
      if (Tracker.EscapePoints.count(&I))
        Escaped = ESCAPED;

      CallInst *CI = dyn_cast<CallInst>(&I);
      // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
      // considered accessing memory and will be marked as a tail call if we
      // don't bail out here.
      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
          isa<PseudoProbeInst>(&I))
        continue;

      // Special-case operand bundles "clang.arc.attachedcall", "ptrauth", and
      // "kcfi".
      bool IsNoTail = CI->isNoTailCall() ||
                      CI->hasOperandBundlesOtherThan(
                          {LLVMContext::OB_clang_arc_attachedcall,
                           LLVMContext::OB_ptrauth, LLVMContext::OB_kcfi});

      if (!IsNoTail && CI->doesNotAccessMemory()) {
        // A call to a readnone function whose arguments are all things computed
        // outside this function can be marked tail. Even if you stored the
        // alloca address into a global, a readnone function can't load the
        // global anyhow.
        //
        // Note that this runs whether we know an alloca has escaped or not. If
        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
        bool SafeToTail = true;
        for (auto &Arg : CI->args()) {
          if (isa<Constant>(Arg.getUser()))
            continue;
          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
            if (!A->hasByValAttr())
              continue;
          SafeToTail = false;
          break;
        }
        if (SafeToTail) {
          using namespace ore;
          ORE->emit([&]() {
            return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
                   << "marked as tail call candidate (readnone)";
          });
          CI->setTailCall();
          Modified = true;
          continue;
        }
      }

      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
        DeferredTails.push_back(CI);
    }

    for (auto *SuccBB : successors(BB)) {
      auto &State = Visited[SuccBB];
      if (State < Escaped) {
        State = Escaped;
        if (State == ESCAPED)
          WorklistEscaped.push_back(SuccBB);
        else
          WorklistUnescaped.push_back(SuccBB);
      }
    }

    if (!WorklistEscaped.empty()) {
      BB = WorklistEscaped.pop_back_val();
      Escaped = ESCAPED;
    } else {
      BB = nullptr;
      while (!WorklistUnescaped.empty()) {
        auto *NextBB = WorklistUnescaped.pop_back_val();
        if (Visited[NextBB] == UNESCAPED) {
          BB = NextBB;
          Escaped = UNESCAPED;
          break;
        }
      }
    }
  } while (BB);

  for (CallInst *CI : DeferredTails) {
    if (Visited[CI->getParent()] != ESCAPED) {
      // If the escape point was part way through the block, calls after the
      // escape point wouldn't have been put into DeferredTails.
      LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
      CI->setTailCall();
      Modified = true;
    }
  }

  return Modified;
}
/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
  if (isa<DbgInfoIntrinsic>(I))
    return true;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
        llvm::findAllocaForValue(II->getArgOperand(1)))
      return true;

  // FIXME: We can move load/store/call/free instructions above the call if the
  // call does not mod/ref the memory location being processed.
  if (I->mayHaveSideEffects()) // This also handles volatile loads.
    return false;

  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
    // Loads may always be moved above calls without side effects.
    if (CI->mayHaveSideEffects()) {
      // Non-volatile loads may be moved above a call with side effects if it
      // does not write to memory and the load provably won't trap.
      // Writes to memory only matter if they may alias the pointer
      // being loaded from.
      const DataLayout &DL = L->getModule()->getDataLayout();
      if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
          !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
                                       L->getAlign(), DL, L))
        return false;
    }
  }

  // Otherwise, if this is a side-effect free instruction, check to make sure
  // that it does not use the return value of the call. If it doesn't use the
  // return value of the call, it must only use things that are defined before
  // the call, or movable instructions between the call and the instruction
  // itself.
  return !is_contained(I->operands(), CI);
}
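
// For instance (a sketch for illustration): in
//
//   %r = call i32 @self(i32 %n)
//   %x = add i32 %m, 1 ; side-effect free and does not use %r: movable
//   ret i32 %r
//
// the 'add' can be hoisted above the call, leaving the call adjacent to the
// return, whereas '%x = add i32 %r, 1' could not be moved.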
static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
  if (!I->isAssociative() || !I->isCommutative())
    return false;

  assert(I->getNumOperands() >= 2 &&
         "Associative/commutative operations should have at least 2 args!");

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // Accumulators must have an identity.
    if (!ConstantExpr::getIntrinsicIdentity(II->getIntrinsicID(), I->getType()))
      return false;
  }

  // Exactly one operand should be the result of the call instruction.
  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
      (I->getOperand(0) != CI && I->getOperand(1) != CI))
    return false;

  // The only user of this instruction we allow is a single return instruction.
  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
    return false;

  return true;
}

static Instruction *firstNonDbg(BasicBlock::iterator I) {
  while (isa<DbgInfoIntrinsic>(I))
    ++I;
  return &*I;
}
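
// For example (illustrative): in '%m = mul i32 %n, %r', where %r is the
// recursive call's result and 'ret i32 %m' is the mul's only user, the 'mul'
// qualifies: it is associative and commutative and exactly one operand is the
// call. A 'sub' would be rejected (not commutative), as would
// 'mul i32 %r, %r' (both operands are the call).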
namespace {
class TailRecursionEliminator {
  Function &F;
  const TargetTransformInfo *TTI;
  AliasAnalysis *AA;
  OptimizationRemarkEmitter *ORE;
  DomTreeUpdater &DTU;

  // The below are shared state we want to have available when eliminating any
  // calls in the function. These values should be populated by
  // createTailRecurseLoopHeader the first time we find a call we can eliminate.
  BasicBlock *HeaderBB = nullptr;
  SmallVector<PHINode *, 8> ArgumentPHIs;

  // PHI node to store our return value.
  PHINode *RetPN = nullptr;

  // i1 PHI node to track if we have a valid return value stored in RetPN.
  PHINode *RetKnownPN = nullptr;

  // Vector of select instructions we inserted. These selects use RetKnownPN
  // to either propagate RetPN or select a new return value.
  SmallVector<SelectInst *, 8> RetSelects;

  // The below are shared state needed when performing accumulator recursion.
  // These values should be populated by insertAccumulator the first time we
  // find an elimination that requires an accumulator.

  // PHI node to store our current accumulated value.
  PHINode *AccPN = nullptr;

  // The instruction doing the accumulating.
  Instruction *AccumulatorRecursionInstr = nullptr;

  TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                          AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                          DomTreeUpdater &DTU)
      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}

  CallInst *findTRECandidate(BasicBlock *BB);

  void createTailRecurseLoopHeader(CallInst *CI);

  void insertAccumulator(Instruction *AccRecInstr);

  bool eliminateCall(CallInst *CI);

  void cleanupAndFinalize();

  bool processBlock(BasicBlock &BB);

  void copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx);

  void copyLocalTempOfByValueOperandIntoArguments(CallInst *CI, int OpndIdx);

public:
  static bool eliminate(Function &F, const TargetTransformInfo *TTI,
                        AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                        DomTreeUpdater &DTU);
};
} // end anonymous namespace
CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
  Instruction *TI = BB->getTerminator();

  if (&BB->front() == TI) // Make sure there is something before the terminator.
    return nullptr;

  // Scan backwards from the return, checking to see if there is a tail call in
  // this block. If so, set CI to it.
  CallInst *CI = nullptr;
  BasicBlock::iterator BBI(TI);
  while (true) {
    CI = dyn_cast<CallInst>(BBI);
    if (CI && CI->getCalledFunction() == &F)
      break;

    if (BBI == BB->begin())
      return nullptr; // Didn't find a potential tail call.
    --BBI;
  }

  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
         "Incompatible call site attributes (Tail, NoTail)");
  if (!CI->isTailCall())
    return nullptr;

  // As a special case, detect code like this:
  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
  // and disable this xform in this case, because the code generator will
  // lower the call to fabs into inline code.
  if (BB == &F.getEntryBlock() &&
      firstNonDbg(BB->front().getIterator()) == CI &&
      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
      !TTI->isLoweredToCall(CI->getCalledFunction())) {
    // A single-block function with just a call and a return. Check that
    // the arguments match.
    auto I = CI->arg_begin(), E = CI->arg_end();
    Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
    for (; I != E && FI != FE; ++I, ++FI)
      if (*I != &*FI) break;
    if (I == E && FI == FE)
      return nullptr;
  }

  return CI;
}
void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
  HeaderBB = &F.getEntryBlock();
  BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
  NewEntry->takeName(HeaderBB);
  HeaderBB->setName("tailrecurse");
  BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
  BI->setDebugLoc(CI->getDebugLoc());

  // Move all fixed sized allocas from HeaderBB to NewEntry.
  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
                            NEBI = NewEntry->begin();
       OEBI != E;)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
      if (isa<ConstantInt>(AI->getArraySize()))
        AI->moveBefore(&*NEBI);

  // Now that we have created a new block, which jumps to the entry
  // block, insert a PHI node for each argument of the function.
  // For now, we initialize each PHI to only have the real arguments
  // which are passed in.
  BasicBlock::iterator InsertPos = HeaderBB->begin();
  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
    PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr");
    PN->insertBefore(InsertPos);
    I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
    PN->addIncoming(&*I, NewEntry);
    ArgumentPHIs.push_back(PN);
  }

  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
  // nodes to track our return value. We initialize RetPN with poison and
  // RetKnownPN with false since we can't know our return value at function
  // entry.
  Type *RetType = F.getReturnType();
  if (!RetType->isVoidTy()) {
    Type *BoolType = Type::getInt1Ty(F.getContext());
    RetPN = PHINode::Create(RetType, 2, "ret.tr");
    RetPN->insertBefore(InsertPos);
    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr");
    RetKnownPN->insertBefore(InsertPos);

    RetPN->addIncoming(PoisonValue::get(RetType), NewEntry);
    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
  }

  // The entry block was changed from HeaderBB to NewEntry.
  // The forward DominatorTree needs to be recalculated when the EntryBB is
  // changed. In this corner-case we recalculate the entire tree.
  DTU.recalculate(*NewEntry->getParent());
}
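
// Resulting CFG shape (a sketch using the names created above; the incoming
// values on the right are added later by eliminateCall):
//
//   entry:                         ; NewEntry: static allocas live here
//     br label %tailrecurse
//   tailrecurse:                   ; HeaderBB, the new loop header
//     %a.tr = phi i32 [ %a, %entry ], ...          ; one PHI per argument
//     %ret.tr = phi i32 [ poison, %entry ], ...
//     %ret.known.tr = phi i1 [ false, %entry ], ...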
void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
  assert(!AccPN && "Trying to insert multiple accumulators");

  AccumulatorRecursionInstr = AccRecInstr;

  // Start by inserting a new PHI node for the accumulator.
  pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
  AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
                          "accumulator.tr");
  AccPN->insertBefore(HeaderBB->begin());

  // Loop over all of the predecessors of the tail recursion block. For the
  // real entry into the function we seed the PHI with the identity constant for
  // the accumulation operation. For any other existing branches to this block
  // (due to other tail recursions eliminated) the accumulator is not modified.
  // Because we haven't added the branch in the current block to HeaderBB yet,
  // it will not show up as a predecessor.
  for (pred_iterator PI = PB; PI != PE; ++PI) {
    BasicBlock *P = *PI;
    if (P == &F.getEntryBlock()) {
      Constant *Identity =
          ConstantExpr::getIdentity(AccRecInstr, AccRecInstr->getType());
      AccPN->addIncoming(Identity, P);
    } else {
      AccPN->addIncoming(AccPN, P);
    }
  }

  ++NumAccumAdded;
}
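
// E.g. (illustrative) if the accumulating instruction is an i32 'mul',
// ConstantExpr::getIdentity yields 1, so the PHI begins as
//   %accumulator.tr = phi i32 [ 1, %entry ], ...
// and the first loop iteration leaves the accumulated value unchanged.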
// Creates a copy of the contents of the ByValue operand of the specified
// call instruction into a newly created temporary variable.
void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
                                                              int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  // Create an alloca for the temporary byval operand.
  // Put the alloca into the entry block.
  Value *NewAlloca = new AllocaInst(
      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
      CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the byvalue operand into the temporary variable.
  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
  CI->setArgOperand(OpndIdx, NewAlloca);
}
// Creates a copy from the temporary variable (keeping the value of the ByVal
// argument) into the corresponding function argument location.
void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
    CallInst *CI, int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the temporary variable into the corresponding
  // function argument location.
  Builder.CreateMemCpy(F.getArg(OpndIdx), /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
}
bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
  ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());

  // Ok, we found a potential tail call. We can currently only transform the
  // tail call if all of the instructions between the call and the return are
  // movable to above the call itself, leaving the call next to the return.
  // Check that this is the case now.
  Instruction *AccRecInstr = nullptr;
  BasicBlock::iterator BBI(CI);
  for (++BBI; &*BBI != Ret; ++BBI) {
    if (canMoveAboveCall(&*BBI, CI, AA))
      continue;

    // If we can't move the instruction above the call, it might be because it
    // is an associative and commutative operation that could be transformed
    // using accumulator recursion elimination. Check to see if this is the
    // case, and if so, remember which instruction accumulates for later.
    if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
      return false; // We cannot eliminate the tail recursion!

    // Yes, this is accumulator recursion. Remember which instruction
    // accumulates.
    AccRecInstr = &*BBI;
  }

  BasicBlock *BB = Ret->getParent();

  using namespace ore;
  ORE->emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
           << "transforming tail recursion into loop";
  });

  // OK! We can transform this tail call. If this is the first one found,
  // create the new entry block, allowing us to branch back to the old entry.
  if (!HeaderBB)
    createTailRecurseLoopHeader(CI);

  // Copy values of ByVal operands into local temporary variables.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I))
      copyByValueOperandIntoLocalTemp(CI, I);
  }

  // Ok, now that we know we have a pseudo-entry block WITH all of the
  // required PHI nodes, add entries into the PHI node for the actual
  // parameters passed into the tail-recursive call.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I)) {
      copyLocalTempOfByValueOperandIntoArguments(CI, I);
      // When eliminating a tail call, we modify the values of the arguments.
      // Therefore, if the byval parameter has a readonly attribute, we have to
      // remove it. It is safe because, from the perspective of a caller, the
      // byval parameter is always treated as "readonly," even if the readonly
      // attribute is removed.
      F.removeParamAttr(I, Attribute::ReadOnly);
      ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
    } else
      ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB);
  }

  if (AccRecInstr) {
    insertAccumulator(AccRecInstr);

    // Rewrite the accumulator recursion instruction so that it does not use
    // the result of the call anymore, instead, use the PHI node we just
    // inserted.
    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
  }

  // Update our return value tracking
  if (RetPN) {
    if (Ret->getReturnValue() == CI || AccRecInstr) {
      // Defer selecting a return value
      RetPN->addIncoming(RetPN, BB);
      RetKnownPN->addIncoming(RetKnownPN, BB);
    } else {
      // We found a return value we want to use, insert a select instruction to
      // select it if we don't already know what our return value will be and
      // store the result in our return value PHI node.
      SelectInst *SI = SelectInst::Create(
          RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
      RetSelects.push_back(SI);

      RetPN->addIncoming(SI, BB);
      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
    }
  }

  if (AccPN)
    AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);

  // Now that all of the PHI nodes are in place, remove the call and
  // ret instructions, replacing them with an unconditional branch.
  BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
  NewBI->setDebugLoc(CI->getDebugLoc());

  Ret->eraseFromParent(); // Remove return.
  CI->eraseFromParent();  // Remove call.
  DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
  ++NumEliminated;
  return true;
}
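
// Net effect of one elimination (sketch): the 'call' and 'ret' disappear, the
// block ends in 'br label %tailrecurse', and every argument PHI (plus AccPN,
// RetPN, and RetKnownPN when present) gains an incoming value from this block
// carrying what would have been the call's operands.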
void TailRecursionEliminator::cleanupAndFinalize() {
  // If we eliminated any tail recursions, it's possible that we inserted some
  // silly PHI nodes which just merge an initial value (the incoming operand)
  // with themselves. Check to see if we did and clean up our mess if so. This
  // occurs when a function passes an argument straight through to its tail
  // call.
  for (PHINode *PN : ArgumentPHIs) {
    // If the PHI Node is a dynamic constant, replace it with the value it is.
    if (Value *PNV = simplifyInstruction(PN, F.getParent()->getDataLayout())) {
      PN->replaceAllUsesWith(PNV);
      PN->eraseFromParent();
    }
  }

  if (RetPN) {
    if (RetSelects.empty()) {
      // If we didn't insert any select instructions, then we know we didn't
      // store a return value and we can remove the PHI nodes we inserted.
      RetPN->dropAllReferences();
      RetPN->eraseFromParent();

      RetKnownPN->dropAllReferences();
      RetKnownPN->eraseFromParent();

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // return in the function, and return its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (BasicBlock &BB : F) {
          ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
          if (!RI)
            continue;

          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     RI->getOperand(0));
          AccRecInstrNew->insertBefore(RI);
          RI->setOperand(0, AccRecInstrNew);
        }
      }
    } else {
      // We need to insert a select instruction before any return left in the
      // function to select our stored return value if we have one.
      for (BasicBlock &BB : F) {
        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
        if (!RI)
          continue;

        SelectInst *SI = SelectInst::Create(
            RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
        RetSelects.push_back(SI);
        RI->setOperand(0, SI);
      }

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // of the selects we inserted, and select its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (SelectInst *SI : RetSelects) {
          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     SI->getFalseValue());
          AccRecInstrNew->insertBefore(SI);
          SI->setFalseValue(AccRecInstrNew);
        }
      }
    }
  }
}
bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
  Instruction *TI = BB.getTerminator();

  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
    if (BI->isConditional())
      return false;

    BasicBlock *Succ = BI->getSuccessor(0);
    ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));

    if (!Ret)
      return false;

    CallInst *CI = findTRECandidate(&BB);

    if (!CI)
      return false;

    LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
                      << "INTO UNCOND BRANCH PRED: " << BB);
    FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
    ++NumRetDuped;

    // If all predecessors of Succ have been eliminated by
    // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
    // because the ret instruction in there is still using a value which
    // eliminateCall will attempt to remove. This block can only contain
    // instructions that can't have uses, therefore it is safe to remove.
    if (pred_empty(Succ))
      DTU.deleteBB(Succ);

    eliminateCall(CI);
    return true;
  } else if (isa<ReturnInst>(TI)) {
    CallInst *CI = findTRECandidate(&BB);

    if (CI)
      return eliminateCall(CI);
  }

  return false;
}
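
// Illustrative shape of the branch-to-return case handled above (a sketch,
// not from a test):
//
//   bb:                         ; before
//     %r = call i32 @self(i32 %x)
//     br label %retblock
//   retblock:
//     ret i32 %r
//
// FoldReturnIntoUncondBranch rewrites 'bb' to end in 'ret i32 %r' itself, at
// which point the call is immediately followed by the return and
// eliminateCall() can do its work.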
bool TailRecursionEliminator::eliminate(Function &F,
                                        const TargetTransformInfo *TTI,
                                        AliasAnalysis *AA,
                                        OptimizationRemarkEmitter *ORE,
                                        DomTreeUpdater &DTU) {
  if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
    return false;

  bool MadeChange = false;
  MadeChange |= markTails(F, ORE);

  // If this function is a varargs function, we won't be able to PHI the args
  // right, so don't even try to convert it...
  if (F.getFunctionType()->isVarArg())
    return MadeChange;

  if (!canTRE(F))
    return MadeChange;

  // Change any tail recursive calls to loops.
  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);

  for (BasicBlock &BB : F)
    MadeChange |= TRE.processBlock(BB);

  TRE.cleanupAndFinalize();

  return MadeChange;
}
namespace {
struct TailCallElim : public FunctionPass {
  static char ID; // Pass identification, replacement for typeid
  TailCallElim() : FunctionPass(ID) {
    initializeTailCallElimPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
    // There is no noticeable performance difference here between Lazy and
    // Eager UpdateStrategy based on some test results. It is feasible to
    // switch the UpdateStrategy to Lazy if we find it profitable later.
    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);

    return TailRecursionEliminator::eliminate(
        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
  }
};
} // end anonymous namespace
char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                    false, false)

// Public interface to the TailCallElimination pass
FunctionPass *llvm::createTailCallEliminationPass() {
  return new TailCallElim();
}
PreservedAnalyses TailCallElimPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
  // There is no noticeable performance difference here between Lazy and Eager
  // UpdateStrategy based on some test results. It is feasible to switch the
  // UpdateStrategy to Lazy if we find it profitable later.
  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<PostDominatorTreeAnalysis>();
  return PA;
}