//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file transforms calls of the current function (self recursion) followed
// by a return instruction with a branch to the entry of the function, creating
// a loop. This pass also implements the following extensions to the basic
// algorithm:
//
//  1. Trivial instructions between the call and return do not prevent the
//     transformation from taking place, though currently the analysis cannot
//     support moving any really useful instructions (only dead ones).
//  2. This pass transforms functions that are prevented from being tail
//     recursive by an associative and commutative expression to use an
//     accumulator variable, thus compiling the typical naive factorial or
//     'fib' implementation into efficient code.
//  3. TRE is performed if the function returns void, if the return
//     returns the result returned by the call, or if the function returns a
//     run-time constant on all exits from the function. It is possible, though
//     unlikely, that the return returns something else (like constant 0), and
//     can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
//     the function return the exact same value.
//  4. If it can prove that callees do not access their caller stack frame,
//     they are marked as eligible for tail call elimination (by the code
//     generator).
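//
// For illustration, a source-level sketch (not the exact IR the pass sees):
// the accumulator transformation in point 2 turns code like
//
//   int factorial(int n) { return n <= 1 ? 1 : n * factorial(n - 1); }
//
// into, in effect:
//
//   int factorial(int n) {
//     int acc = 1;                       // identity constant for '*'
//     while (n > 1) { acc *= n; --n; }
//     return acc;
//   }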
//
// There are several improvements that could be made:
//
//  1. If the function has any alloca instructions, these instructions will be
//     moved out of the entry block of the function, causing them to be
//     evaluated each time through the tail recursion. Safely keeping allocas
//     in the entry block requires analysis to prove that the tail-called
//     function does not read or write the stack object.
//  2. Tail recursion is only performed if the call immediately precedes the
//     return instruction. It's possible that there could be a jump between
//     the call and the return.
//  3. There can be intervening operations between the call and the return that
//     prevent the TRE from occurring. For example, there could be GEP's and
//     stores to memory that will not be read or written by the call. This
//     requires some substantial analysis (such as with DSA) to prove safe to
//     move ahead of the call, but doing so could allow many more TREs to be
//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
//  4. The algorithm we use to detect if callees access their caller stack
//     frames is very primitive.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

#define DEBUG_TYPE "tailcallelim"

STATISTIC(NumEliminated, "Number of tail calls removed");
STATISTIC(NumRetDuped, "Number of returns duplicated");
STATISTIC(NumAccumAdded, "Number of accumulators introduced");

/// Scan the specified function for alloca instructions.
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
  // TODO: We don't do TRE if dynamic allocas are used.
  // Dynamic allocas allocate stack space which should be
  // deallocated before a new iteration starts. That is
  // currently not implemented.
  return llvm::all_of(instructions(F), [](Instruction &I) {
    auto *AI = dyn_cast<AllocaInst>(&I);
    return !AI || AI->isStaticAlloca();
  });
}
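
// For illustration, an informal source-level sketch (the 'use' helper is
// hypothetical): a variable-length array lowers to a dynamic alloca, so
//
//   void f(unsigned n) { char buf[n]; use(buf); f(n - 1); }
//
// is rejected by canTRE, while fixed-size locals become static allocas in the
// entry block and do not block the transformation.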

namespace {
struct AllocaDerivedValueTracker {
  // Start at a root value and walk its use-def chain to mark calls that use the
  // value or a derived value in AllocaUsers, and places where it may escape in
  // EscapePoints.
  void walk(Value *Root) {
    SmallVector<Use *, 32> Worklist;
    SmallPtrSet<Use *, 32> Visited;

    auto AddUsesToWorklist = [&](Value *V) {
      for (auto &U : V->uses()) {
        if (!Visited.insert(&U).second)
          continue;
        Worklist.push_back(&U);
      }
    };

    AddUsesToWorklist(Root);

    while (!Worklist.empty()) {
      Use *U = Worklist.pop_back_val();
      Instruction *I = cast<Instruction>(U->getUser());

      switch (I->getOpcode()) {
      case Instruction::Call:
      case Instruction::Invoke: {
        auto &CB = cast<CallBase>(*I);
        // If the alloca-derived argument is passed byval it is not an escape
        // point, or a use of an alloca. Calling with byval copies the contents
        // of the alloca into argument registers or stack slots, which exist
        // beyond the lifetime of the current frame.
        if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
          continue;
        bool IsNocapture =
            CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
        callUsesLocalStack(CB, IsNocapture);
        if (IsNocapture) {
          // If the alloca-derived argument is passed in as nocapture, then it
          // can't propagate to the call's return. That would be capturing.
          continue;
        }
        break;
      }
      case Instruction::Load: {
        // The result of a load is not alloca-derived (unless an alloca has
        // otherwise escaped, but this is a local analysis).
        continue;
      }
      case Instruction::Store: {
        if (U->getOperandNo() == 0)
          EscapePoints.insert(I);
        continue; // Stores have no users to analyze.
      }
      case Instruction::BitCast:
      case Instruction::GetElementPtr:
      case Instruction::PHI:
      case Instruction::Select:
      case Instruction::AddrSpaceCast:
        break;
      default:
        EscapePoints.insert(I);
        break;
      }

      AddUsesToWorklist(I);
    }
  }

  void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
    // Add it to the list of alloca users.
    AllocaUsers.insert(&CB);

    // If it's nocapture then it can't capture this alloca.
    if (IsNocapture)
      return;

    // If it can write to memory, it can leak the alloca value.
    if (!CB.onlyReadsMemory())
      EscapePoints.insert(&CB);
  }

  SmallPtrSet<Instruction *, 32> AllocaUsers;
  SmallPtrSet<Instruction *, 32> EscapePoints;
};
} // namespace
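
// For illustration (an informal sketch): given "%a = alloca i32", a call
// "call void @foo(ptr %a)" is recorded in AllocaUsers, and also in
// EscapePoints unless the operand is nocapture and the callee only reads
// memory; "store ptr %a, ptr @g" is an escape point because the alloca's
// address leaks into memory.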

static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
  if (F.callsFunctionThatReturnsTwice())
    return false;

  // The local stack holds all alloca instructions and all byval arguments.
  AllocaDerivedValueTracker Tracker;
  for (Argument &Arg : F.args()) {
    if (Arg.hasByValAttr())
      Tracker.walk(&Arg);
  }
  for (auto &BB : F) {
    for (auto &I : BB)
      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
        Tracker.walk(AI);
  }

  bool Modified = false;

  // Track whether a block is reachable after an alloca has escaped. Blocks that
  // contain the escaping instruction will be marked as being visited without an
  // escaped alloca, since that is how the block began.
  enum VisitType {
    UNVISITED,
    UNESCAPED,
    ESCAPED
  };
  DenseMap<BasicBlock *, VisitType> Visited;

  // We propagate the fact that an alloca has escaped from block to successor.
  // Visit the blocks that are propagating the escapedness first. To do this, we
  // maintain two worklists.
  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;

  // We may enter a block and visit it thinking that no alloca has escaped yet,
  // then see an escape point and go back around a loop edge and come back to
  // the same block twice. Because of this, we defer setting tail on calls when
  // we first encounter them in a block. Every entry in this list does not
  // statically use an alloca via use-def chain analysis, but may find an alloca
  // through other means if the block turns out to be reachable after an escape
  // point.
  SmallVector<CallInst *, 32> DeferredTails;

  BasicBlock *BB = &F.getEntryBlock();
  VisitType Escaped = UNESCAPED;
  do {
    for (auto &I : *BB) {
      if (Tracker.EscapePoints.count(&I))
        Escaped = ESCAPED;

      CallInst *CI = dyn_cast<CallInst>(&I);
      // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
      // considered accessing memory and will be marked as a tail call if we
      // don't bail out here.
      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
          isa<PseudoProbeInst>(&I))
        continue;

      // Bail out for intrinsic stackrestore call because it can modify
      // unescaped allocas.
      if (auto *II = dyn_cast<IntrinsicInst>(CI))
        if (II->getIntrinsicID() == Intrinsic::stackrestore)
          continue;

      // Special-case operand bundles "clang.arc.attachedcall", "ptrauth", and
      // "kcfi".
      bool IsNoTail = CI->isNoTailCall() ||
                      CI->hasOperandBundlesOtherThan(
                          {LLVMContext::OB_clang_arc_attachedcall,
                           LLVMContext::OB_ptrauth, LLVMContext::OB_kcfi});

      if (!IsNoTail && CI->doesNotAccessMemory()) {
        // A call to a readnone function whose arguments are all things computed
        // outside this function can be marked tail. Even if you stored the
        // alloca address into a global, a readnone function can't load the
        // global anyhow.
        //
        // Note that this runs whether we know an alloca has escaped or not. If
        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
        bool SafeToTail = true;
        for (auto &Arg : CI->args()) {
          if (isa<Constant>(Arg.getUser()))
            continue;
          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
            if (!A->hasByValAttr())
              continue;
          SafeToTail = false;
          break;
        }
        if (SafeToTail) {
          ORE->emit([&]() {
            return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
                   << "marked as tail call candidate (readnone)";
          });
          CI->setTailCall();
          Modified = true;
          continue;
        }
      }

      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
        DeferredTails.push_back(CI);
    }

    for (auto *SuccBB : successors(BB)) {
      auto &State = Visited[SuccBB];
      if (State < Escaped) {
        State = Escaped;
        if (State == ESCAPED)
          WorklistEscaped.push_back(SuccBB);
        else
          WorklistUnescaped.push_back(SuccBB);
      }
    }

    if (!WorklistEscaped.empty()) {
      BB = WorklistEscaped.pop_back_val();
      Escaped = ESCAPED;
    } else {
      BB = nullptr;
      while (!WorklistUnescaped.empty()) {
        auto *NextBB = WorklistUnescaped.pop_back_val();
        if (Visited[NextBB] == UNESCAPED) {
          BB = NextBB;
          Escaped = UNESCAPED;
          break;
        }
      }
    }
  } while (BB);

  for (CallInst *CI : DeferredTails) {
    if (Visited[CI->getParent()] != ESCAPED) {
      // If the escape point was part way through the block, calls after the
      // escape point wouldn't have been put into DeferredTails.
      LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
      CI->setTailCall();
      Modified = true;
    }
  }

  return Modified;
}

/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
  if (isa<DbgInfoIntrinsic>(I))
    return true;

  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
        llvm::findAllocaForValue(II->getArgOperand(1)))
      return true;

  // FIXME: We can move load/store/call/free instructions above the call if the
  // call does not mod/ref the memory location being processed.
  if (I->mayHaveSideEffects()) // This also handles volatile loads.
    return false;

  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
    // Loads may always be moved above calls without side effects.
    if (CI->mayHaveSideEffects()) {
      // Non-volatile loads may be moved above a call with side effects if it
      // does not write to memory and the load provably won't trap.
      // Writes to memory only matter if they may alias the pointer
      // being loaded from.
      const DataLayout &DL = L->getDataLayout();
      if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
          !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
                                       L->getAlign(), DL, L))
        return false;
    }
  }

  // Otherwise, if this is a side-effect free instruction, check to make sure
  // that it does not use the return value of the call. If it doesn't use the
  // return value of the call, it must only use things that are defined before
  // the call, or movable instructions between the call and the instruction
  // itself.
  return !is_contained(I->operands(), CI);
}
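
// For illustration (an informal IR sketch): in
//   %r = call i32 @f(...)
//   %x = load i32, ptr %p
//   ret i32 %r
// the load can be hoisted above the recursive call when the call does not
// write memory that may alias %p and the load provably cannot trap, leaving
// the call immediately before the return.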

static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
  if (!I->isAssociative() || !I->isCommutative())
    return false;

  assert(I->getNumOperands() >= 2 &&
         "Associative/commutative operations should have at least 2 args!");

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    // Accumulators must have an identity.
    if (!ConstantExpr::getIntrinsicIdentity(II->getIntrinsicID(), I->getType()))
      return false;
  }

  // Exactly one operand should be the result of the call instruction.
  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
      (I->getOperand(0) != CI && I->getOperand(1) != CI))
    return false;

  // The only user of this instruction we allow is a single return instruction.
  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
    return false;

  return true;
}
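
// For illustration (informal): in a naive factorial, the blocking instruction
// is "%m = mul i32 %n, %r" where %r is the recursive call's result and %m's
// only user is the return; 'mul' is associative and commutative, so %m
// qualifies as the accumulator instruction.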

static Instruction *firstNonDbg(BasicBlock::iterator I) {
  while (isa<DbgInfoIntrinsic>(I))
    ++I;
  return &*I;
}

namespace {
class TailRecursionEliminator {
  Function &F;
  const TargetTransformInfo *TTI;
  AliasAnalysis *AA;
  OptimizationRemarkEmitter *ORE;
  DomTreeUpdater &DTU;

  // The below are shared state we want to have available when eliminating any
  // calls in the function. These values should be populated by
  // createTailRecurseLoopHeader the first time we find a call we can eliminate.
  BasicBlock *HeaderBB = nullptr;
  SmallVector<PHINode *, 8> ArgumentPHIs;

  // PHI node to store our return value.
  PHINode *RetPN = nullptr;

  // i1 PHI node to track if we have a valid return value stored in RetPN.
  PHINode *RetKnownPN = nullptr;

  // Vector of select instructions we inserted. These selects use RetKnownPN
  // to either propagate RetPN or select a new return value.
  SmallVector<SelectInst *, 8> RetSelects;

  // The below are shared state needed when performing accumulator recursion.
  // These values should be populated by insertAccumulator the first time we
  // find an elimination that requires an accumulator.

  // PHI node to store our current accumulated value.
  PHINode *AccPN = nullptr;

  // The instruction doing the accumulating.
  Instruction *AccumulatorRecursionInstr = nullptr;

  TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
                          AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                          DomTreeUpdater &DTU)
      : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}

  CallInst *findTRECandidate(BasicBlock *BB);

  void createTailRecurseLoopHeader(CallInst *CI);

  void insertAccumulator(Instruction *AccRecInstr);

  bool eliminateCall(CallInst *CI);

  void cleanupAndFinalize();

  bool processBlock(BasicBlock &BB);

  void copyByValueOperandIntoLocalTemp(CallInst *CI, int OpndIdx);

  void copyLocalTempOfByValueOperandIntoArguments(CallInst *CI, int OpndIdx);

public:
  static bool eliminate(Function &F, const TargetTransformInfo *TTI,
                        AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
                        DomTreeUpdater &DTU);
};
} // namespace

CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
  Instruction *TI = BB->getTerminator();

  if (&BB->front() == TI) // Make sure there is something before the terminator.
    return nullptr;

  // Scan backwards from the return, checking to see if there is a tail call in
  // this block. If so, set CI to it.
  CallInst *CI = nullptr;
  BasicBlock::iterator BBI(TI);
  while (true) {
    CI = dyn_cast<CallInst>(BBI);
    if (CI && CI->getCalledFunction() == &F)
      break;

    if (BBI == BB->begin())
      return nullptr; // Didn't find a potential tail call.
    --BBI;
  }

  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
         "Incompatible call site attributes(Tail,NoTail)");
  if (!CI->isTailCall())
    return nullptr;

  // As a special case, detect code like this:
  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
  // and disable this xform in this case, because the code generator will
  // lower the call to fabs into inline code.
  if (BB == &F.getEntryBlock() &&
      firstNonDbg(BB->front().getIterator()) == CI &&
      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
      !TTI->isLoweredToCall(CI->getCalledFunction())) {
    // A single-block function with just a call and a return. Check that
    // the arguments match.
    auto I = CI->arg_begin(), E = CI->arg_end();
    Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
    for (; I != E && FI != FE; ++I, ++FI)
      if (*I != &*FI)
        break;
    if (I == E && FI == FE)
      return nullptr;
  }

  return CI;
}

void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
  HeaderBB = &F.getEntryBlock();
  BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
  NewEntry->takeName(HeaderBB);
  HeaderBB->setName("tailrecurse");
  BranchInst::Create(HeaderBB, NewEntry);
  // If the new branch preserves the debug location of CI, it could result in
  // misleading stepping, if CI is located in a conditional branch.
  // So, here we don't give any debug location to the new branch.

  // Move all fixed sized allocas from HeaderBB to NewEntry.
  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
                            NEBI = NewEntry->begin();
       OEBI != E;)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
      if (isa<ConstantInt>(AI->getArraySize()))
        AI->moveBefore(&*NEBI);

  // Now that we have created a new block, which jumps to the entry
  // block, insert a PHI node for each argument of the function.
  // For now, we initialize each PHI to only have the real arguments
  // which are passed in.
  BasicBlock::iterator InsertPos = HeaderBB->begin();
  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
    PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr");
    PN->insertBefore(InsertPos);
    I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
    PN->addIncoming(&*I, NewEntry);
    ArgumentPHIs.push_back(PN);
  }

  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
  // nodes to track our return value. We initialize RetPN with poison and
  // RetKnownPN with false since we can't know our return value at function
  // entry.
  Type *RetType = F.getReturnType();
  if (!RetType->isVoidTy()) {
    Type *BoolType = Type::getInt1Ty(F.getContext());
    RetPN = PHINode::Create(RetType, 2, "ret.tr");
    RetPN->insertBefore(InsertPos);
    RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr");
    RetKnownPN->insertBefore(InsertPos);

    RetPN->addIncoming(PoisonValue::get(RetType), NewEntry);
    RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
  }

  // The entry block was changed from HeaderBB to NewEntry.
  // The forward DominatorTree needs to be recalculated when the EntryBB is
  // changed. In this corner-case we recalculate the entire tree.
  DTU.recalculate(*NewEntry->getParent());
}
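
// For illustration (an informal IR sketch; block and value names are
// illustrative), the rewritten entry looks roughly like:
//
//   entry:                      ; NewEntry, holds the static allocas
//     br label %tailrecurse
//   tailrecurse:                ; HeaderBB, the loop header
//     %n.tr = phi i32 [ %n, %entry ], [ %n.next, %recurse.block ]
//
// where each original argument is replaced by its ".tr" PHI node.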

void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
  assert(!AccPN && "Trying to insert multiple accumulators");

  AccumulatorRecursionInstr = AccRecInstr;

  // Start by inserting a new PHI node for the accumulator.
  pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
  AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
                          "accumulator.tr");
  AccPN->insertBefore(HeaderBB->begin());

  // Loop over all of the predecessors of the tail recursion block. For the
  // real entry into the function we seed the PHI with the identity constant for
  // the accumulation operation. For any other existing branches to this block
  // (due to other tail recursions eliminated) the accumulator is not modified.
  // Because we haven't added the branch in the current block to HeaderBB yet,
  // it will not show up as a predecessor.
  for (pred_iterator PI = PB; PI != PE; ++PI) {
    BasicBlock *P = *PI;
    if (P == &F.getEntryBlock()) {
      Constant *Identity =
          ConstantExpr::getIdentity(AccRecInstr, AccRecInstr->getType());
      AccPN->addIncoming(Identity, P);
    } else {
      AccPN->addIncoming(AccPN, P);
    }
  }

  ++NumAccumAdded;
}
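
// For illustration (informal): the identity for an 'add' accumulator is 0 and
// for 'mul' it is 1, so a naive factorial's accumulator PHI is seeded with 1
// from the real entry block and updated by the multiply on each iteration.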

// Creates a copy of the contents of the ByValue operand of the specified
// call instruction into a newly created temporary variable.
void TailRecursionEliminator::copyByValueOperandIntoLocalTemp(CallInst *CI,
                                                              int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  // Create an alloca for the temporary byval operand copy.
  // Put the alloca into the entry block.
  Value *NewAlloca = new AllocaInst(
      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
      CI->getArgOperand(OpndIdx)->getName(), F.getEntryBlock().begin());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the byval operand into the temporary variable.
  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
  CI->setArgOperand(OpndIdx, NewAlloca);
}

// Creates a copy from the temporary variable (keeping the value of the ByVal
// argument) into the corresponding function argument location.
void TailRecursionEliminator::copyLocalTempOfByValueOperandIntoArguments(
    CallInst *CI, int OpndIdx) {
  Type *AggTy = CI->getParamByValType(OpndIdx);
  assert(AggTy);
  const DataLayout &DL = F.getDataLayout();

  // Get alignment of byVal operand.
  Align Alignment(CI->getParamAlign(OpndIdx).valueOrOne());

  IRBuilder<> Builder(CI);
  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));

  // Copy data from the temporary variable into the corresponding
  // function argument location.
  Builder.CreateMemCpy(F.getArg(OpndIdx), /*DstAlign*/ Alignment,
                       CI->getArgOperand(OpndIdx),
                       /*SrcAlign*/ Alignment, Size);
}
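
// For illustration (informal): together these two helpers rewrite a recursive
// byval call "f(s)" into, in effect, "tmp = s; arg = tmp;" so the next loop
// iteration observes the argument value the recursive call would have
// received, without the caller's and callee's copies aliasing each other.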

bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
  ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());

  // Ok, we found a potential tail call. We can currently only transform the
  // tail call if all of the instructions between the call and the return are
  // movable to above the call itself, leaving the call next to the return.
  // Check that this is the case now.
  Instruction *AccRecInstr = nullptr;
  BasicBlock::iterator BBI(CI);
  for (++BBI; &*BBI != Ret; ++BBI) {
    if (canMoveAboveCall(&*BBI, CI, AA))
      continue;

    // If we can't move the instruction above the call, it might be because it
    // is an associative and commutative operation that could be transformed
    // using accumulator recursion elimination. Check to see if this is the
    // case, and if so, remember which instruction accumulates for later.
    if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
      return false; // We cannot eliminate the tail recursion!

    // Yes, this is accumulator recursion. Remember which instruction
    // accumulates.
    AccRecInstr = &*BBI;
  }

  BasicBlock *BB = Ret->getParent();

  ORE->emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
           << "transforming tail recursion into loop";
  });

  // OK! We can transform this tail call. If this is the first one found,
  // create the new entry block, allowing us to branch back to the old entry.
  if (!HeaderBB)
    createTailRecurseLoopHeader(CI);

  // Copy values of ByVal operands into local temporary variables.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I))
      copyByValueOperandIntoLocalTemp(CI, I);
  }

  // Ok, now that we know we have a pseudo-entry block WITH all of the
  // required PHI nodes, add entries into the PHI node for the actual
  // parameters passed into the tail-recursive call.
  for (unsigned I = 0, E = CI->arg_size(); I != E; ++I) {
    if (CI->isByValArgument(I)) {
      copyLocalTempOfByValueOperandIntoArguments(CI, I);
      // When eliminating a tail call, we modify the values of the arguments.
      // Therefore, if the byval parameter has a readonly attribute, we have to
      // remove it. It is safe because, from the perspective of a caller, the
      // byval parameter is always treated as "readonly," even if the readonly
      // attribute is removed.
      F.removeParamAttr(I, Attribute::ReadOnly);
      ArgumentPHIs[I]->addIncoming(F.getArg(I), BB);
    } else
      ArgumentPHIs[I]->addIncoming(CI->getArgOperand(I), BB);
  }

  if (AccRecInstr) {
    insertAccumulator(AccRecInstr);

    // Rewrite the accumulator recursion instruction so that it does not use
    // the result of the call anymore, instead, use the PHI node we just
    // inserted.
    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
  }

  // Update our return value tracking.
  if (RetPN) {
    if (Ret->getReturnValue() == CI || AccRecInstr) {
      // Defer selecting a return value.
      RetPN->addIncoming(RetPN, BB);
      RetKnownPN->addIncoming(RetKnownPN, BB);
    } else {
      // We found a return value we want to use, insert a select instruction to
      // select it if we don't already know what our return value will be and
      // store the result in our return value PHI node.
      SelectInst *SI =
          SelectInst::Create(RetKnownPN, RetPN, Ret->getReturnValue(),
                             "current.ret.tr", Ret->getIterator());
      RetSelects.push_back(SI);

      RetPN->addIncoming(SI, BB);
      RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
    }
  }

  if (AccPN)
    AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);

  // Now that all of the PHI nodes are in place, remove the call and
  // ret instructions, replacing them with an unconditional branch.
  BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret->getIterator());
  NewBI->setDebugLoc(CI->getDebugLoc());

  Ret->eraseFromParent(); // Remove return.
  CI->eraseFromParent();  // Remove call.
  DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
  ++NumEliminated;
  return true;
}
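
// For illustration (an informal IR sketch; names are illustrative): a block
// ending in
//
//   %r = tail call i32 @f(i32 %n.dec)
//   ret i32 %r
//
// becomes, after the PHIs are wired up,
//
//   br label %tailrecurse
//
// with %n.dec feeding the argument PHI in the loop header.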

void TailRecursionEliminator::cleanupAndFinalize() {
  // If we eliminated any tail recursions, it's possible that we inserted some
  // silly PHI nodes which just merge an initial value (the incoming operand)
  // with themselves. Check to see if we did and clean up our mess if so. This
  // occurs when a function passes an argument straight through to its tail
  // call.
  for (PHINode *PN : ArgumentPHIs) {
    // If the PHI Node is a dynamic constant, replace it with the value it is.
    if (Value *PNV = simplifyInstruction(PN, F.getDataLayout())) {
      PN->replaceAllUsesWith(PNV);
      PN->eraseFromParent();
    }
  }

  if (RetPN) {
    if (RetSelects.empty()) {
      // If we didn't insert any select instructions, then we know we didn't
      // store a return value and we can remove the PHI nodes we inserted.
      RetPN->dropAllReferences();
      RetPN->eraseFromParent();

      RetKnownPN->dropAllReferences();
      RetKnownPN->eraseFromParent();

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // return in the function, and return its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (BasicBlock &BB : F) {
          ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
          if (!RI)
            continue;

          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     RI->getOperand(0));
          AccRecInstrNew->insertBefore(RI);
          AccRecInstrNew->dropLocation();
          RI->setOperand(0, AccRecInstrNew);
        }
      }
    } else {
      // We need to insert a select instruction before any return left in the
      // function to select our stored return value if we have one.
      for (BasicBlock &BB : F) {
        ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
        if (!RI)
          continue;

        SelectInst *SI =
            SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0),
                               "current.ret.tr", RI->getIterator());
        RetSelects.push_back(SI);
        RI->setOperand(0, SI);
      }

      if (AccPN) {
        // We need to insert a copy of our accumulator instruction before any
        // of the selects we inserted, and select its result instead.
        Instruction *AccRecInstr = AccumulatorRecursionInstr;
        for (SelectInst *SI : RetSelects) {
          Instruction *AccRecInstrNew = AccRecInstr->clone();
          AccRecInstrNew->setName("accumulator.ret.tr");
          AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
                                     SI->getFalseValue());
          AccRecInstrNew->insertBefore(SI);
          AccRecInstrNew->dropLocation();
          SI->setFalseValue(AccRecInstrNew);
        }
      }
    }
  }
}

bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
  Instruction *TI = BB.getTerminator();

  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
    if (BI->isConditional())
      return false;

    BasicBlock *Succ = BI->getSuccessor(0);
    ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));

    if (!Ret)
      return false;

    CallInst *CI = findTRECandidate(&BB);

    if (!CI)
      return false;

    LLVM_DEBUG(dbgs() << "FOLDING: " << *Succ
                      << "INTO UNCOND BRANCH PRED: " << BB);
    FoldReturnIntoUncondBranch(Ret, Succ, &BB, &DTU);
    ++NumRetDuped;

    // If all predecessors of Succ have been eliminated by
    // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
    // because the ret instruction in there is still using a value which
    // eliminateCall will attempt to remove. This block can only contain
    // instructions that can't have uses, therefore it is safe to remove.
    if (pred_empty(Succ))
      DTU.deleteBB(Succ);

    eliminateCall(CI);
    return true;
  } else if (isa<ReturnInst>(TI)) {
    CallInst *CI = findTRECandidate(&BB);

    if (CI)
      return eliminateCall(CI);
  }

  return false;
}
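
// For illustration (informal): processBlock also handles the shape
// "... call; br label %exit" followed by "exit: ret", by duplicating the
// return into the branching block so that the call-plus-ret pattern
// eliminateCall expects appears there directly.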

bool TailRecursionEliminator::eliminate(Function &F,
                                        const TargetTransformInfo *TTI,
                                        AliasAnalysis *AA,
                                        OptimizationRemarkEmitter *ORE,
                                        DomTreeUpdater &DTU) {
  if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
    return false;

  bool MadeChange = false;
  MadeChange |= markTails(F, ORE);

  // If this function is a varargs function, we won't be able to PHI the args
  // right, so don't even try to convert it...
  if (F.getFunctionType()->isVarArg())
    return MadeChange;

  if (!canTRE(F))
    return MadeChange;

  // Change any tail recursive calls to loops.
  TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);

  for (BasicBlock &BB : F)
    MadeChange |= TRE.processBlock(BB);

  TRE.cleanupAndFinalize();

  return MadeChange;
}

namespace {
struct TailCallElim : public FunctionPass {
  static char ID; // Pass identification, replacement for typeid
  TailCallElim() : FunctionPass(ID) {
    initializeTailCallElimPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
    // There is no noticeable performance difference here between Lazy and Eager
    // UpdateStrategy based on some test results. It is feasible to switch the
    // UpdateStrategy to Lazy if we find it profitable later.
    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);

    return TailRecursionEliminator::eliminate(
        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
  }
};
} // namespace

char TailCallElim::ID = 0;
INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                    false, false)

// Public interface to the TailCallElimination pass
FunctionPass *llvm::createTailCallEliminationPass() {
  return new TailCallElim();
}

PreservedAnalyses TailCallElimPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
  // There is no noticeable performance difference here between Lazy and Eager
  // UpdateStrategy based on some test results. It is feasible to switch the
  // UpdateStrategy to Lazy if we find it profitable later.
  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
  bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<PostDominatorTreeAnalysis>();
  return PA;
}