//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements explicit unrolling for VPlans.
//
//===----------------------------------------------------------------------===//
#include "VPRecipeBuilder.h"
#include "VPlanAnalysis.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"
#include <optional>
#include <string>
/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
38 /// Unroll factor to unroll by.
40 /// Analysis for types.
41 VPTypeAnalysis TypeInfo
;
43 /// Unrolling may create recipes that should not be unrolled themselves.
44 /// Those are tracked in ToSkip.
45 SmallPtrSet
<VPRecipeBase
*, 8> ToSkip
;
47 // Associate with each VPValue of part 0 its unrolled instances of parts 1,
49 DenseMap
<VPValue
*, SmallVector
<VPValue
*>> VPV2Parts
;
51 /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
52 void unrollReplicateRegionByUF(VPRegionBlock
*VPR
);
54 /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
56 void unrollRecipeByUF(VPRecipeBase
&R
);
58 /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
59 /// depends on the concrete header phi. Inserts newly created recipes at \p
61 void unrollHeaderPHIByUF(VPHeaderPHIRecipe
*R
,
62 VPBasicBlock::iterator InsertPtForPhi
);
64 /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
65 /// the induction steps for each part.
66 void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe
*IV
,
67 VPBasicBlock::iterator InsertPtForPhi
);
69 VPValue
*getConstantVPV(unsigned Part
) {
70 Type
*CanIVIntTy
= Plan
.getCanonicalIV()->getScalarType();
71 return Plan
.getOrAddLiveIn(ConstantInt::get(CanIVIntTy
, Part
));
75 UnrollState(VPlan
&Plan
, unsigned UF
, LLVMContext
&Ctx
)
76 : Plan(Plan
), UF(UF
), TypeInfo(Plan
.getCanonicalIV()->getScalarType()) {}
78 void unrollBlock(VPBlockBase
*VPB
);
80 VPValue
*getValueForPart(VPValue
*V
, unsigned Part
) {
81 if (Part
== 0 || V
->isLiveIn())
83 assert((VPV2Parts
.contains(V
) && VPV2Parts
[V
].size() >= Part
) &&
84 "accessed value does not exist");
85 return VPV2Parts
[V
][Part
- 1];
88 /// Given a single original recipe \p OrigR (of part zero), and its copy \p
89 /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
90 /// corresponding VPValue defined by \p CopyR.
91 void addRecipeForPart(VPRecipeBase
*OrigR
, VPRecipeBase
*CopyR
,
93 for (const auto &[Idx
, VPV
] : enumerate(OrigR
->definedValues())) {
94 auto Ins
= VPV2Parts
.insert({VPV
, {}});
95 assert(Ins
.first
->second
.size() == Part
- 1 && "earlier parts not set");
96 Ins
.first
->second
.push_back(CopyR
->getVPValue(Idx
));
100 /// Given a uniform recipe \p R, add it for all parts.
101 void addUniformForAllParts(VPSingleDefRecipe
*R
) {
102 auto Ins
= VPV2Parts
.insert({R
, {}});
103 assert(Ins
.second
&& "uniform value already added");
104 for (unsigned Part
= 0; Part
!= UF
; ++Part
)
105 Ins
.first
->second
.push_back(R
);
108 bool contains(VPValue
*VPV
) const { return VPV2Parts
.contains(VPV
); }
110 /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
112 void remapOperand(VPRecipeBase
*R
, unsigned OpIdx
, unsigned Part
) {
113 auto *Op
= R
->getOperand(OpIdx
);
114 R
->setOperand(OpIdx
, getValueForPart(Op
, Part
));
117 /// Update \p R's operands with their corresponding VPValues for part \p P.
118 void remapOperands(VPRecipeBase
*R
, unsigned Part
) {
119 for (const auto &[OpIdx
, Op
] : enumerate(R
->operands()))
120 R
->setOperand(OpIdx
, getValueForPart(Op
, Part
));
125 void UnrollState::unrollReplicateRegionByUF(VPRegionBlock
*VPR
) {
126 VPBlockBase
*InsertPt
= VPR
->getSingleSuccessor();
127 for (unsigned Part
= 1; Part
!= UF
; ++Part
) {
128 auto *Copy
= VPR
->clone();
129 VPBlockUtils::insertBlockBefore(Copy
, InsertPt
);
131 auto PartI
= vp_depth_first_shallow(Copy
->getEntry());
132 auto Part0
= vp_depth_first_shallow(VPR
->getEntry());
133 for (const auto &[PartIVPBB
, Part0VPBB
] :
134 zip(VPBlockUtils::blocksOnly
<VPBasicBlock
>(PartI
),
135 VPBlockUtils::blocksOnly
<VPBasicBlock
>(Part0
))) {
136 for (const auto &[PartIR
, Part0R
] : zip(*PartIVPBB
, *Part0VPBB
)) {
137 remapOperands(&PartIR
, Part
);
138 if (auto *ScalarIVSteps
= dyn_cast
<VPScalarIVStepsRecipe
>(&PartIR
)) {
139 ScalarIVSteps
->addOperand(getConstantVPV(Part
));
142 addRecipeForPart(&Part0R
, &PartIR
, Part
);
148 void UnrollState::unrollWidenInductionByUF(
149 VPWidenIntOrFpInductionRecipe
*IV
, VPBasicBlock::iterator InsertPtForPhi
) {
150 VPBasicBlock
*PH
= cast
<VPBasicBlock
>(
151 IV
->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
152 Type
*IVTy
= TypeInfo
.inferScalarType(IV
);
153 auto &ID
= IV
->getInductionDescriptor();
154 std::optional
<FastMathFlags
> FMFs
;
155 if (isa_and_present
<FPMathOperator
>(ID
.getInductionBinOp()))
156 FMFs
= ID
.getInductionBinOp()->getFastMathFlags();
158 VPValue
*VectorStep
= &Plan
.getVF();
159 VPBuilder
Builder(PH
);
160 if (TypeInfo
.inferScalarType(VectorStep
) != IVTy
) {
161 Instruction::CastOps CastOp
=
162 IVTy
->isFloatingPointTy() ? Instruction::UIToFP
: Instruction::Trunc
;
163 VectorStep
= Builder
.createWidenCast(CastOp
, VectorStep
, IVTy
);
164 ToSkip
.insert(VectorStep
->getDefiningRecipe());
167 VPValue
*ScalarStep
= IV
->getStepValue();
168 auto *ConstStep
= ScalarStep
->isLiveIn()
169 ? dyn_cast
<ConstantInt
>(ScalarStep
->getLiveInIRValue())
171 if (!ConstStep
|| ConstStep
->getValue() != 1) {
172 if (TypeInfo
.inferScalarType(ScalarStep
) != IVTy
) {
174 Builder
.createWidenCast(Instruction::Trunc
, ScalarStep
, IVTy
);
175 ToSkip
.insert(ScalarStep
->getDefiningRecipe());
179 IVTy
->isFloatingPointTy() ? Instruction::FMul
: Instruction::Mul
;
180 VPInstruction
*Mul
= Builder
.createNaryOp(MulOpc
, {VectorStep
, ScalarStep
},
181 FMFs
, IV
->getDebugLoc());
186 // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
187 // remains the header phi. Parts > 0 are computed by adding Step to the
188 // previous part. The header phi recipe will get 2 new operands: the step
189 // value for a single part and the last part, used to compute the backedge
190 // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =
191 // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
192 // %Part.1 = %Part.0 + %VectorStep
193 // %Part.2 = %Part.1 + %VectorStep
194 // %Part.3 = %Part.2 + %VectorStep
196 // The newly added recipes are added to ToSkip to avoid interleaving them
199 Builder
.setInsertPoint(IV
->getParent(), InsertPtForPhi
);
201 IVTy
->isFloatingPointTy() ? ID
.getInductionOpcode() : Instruction::Add
;
202 for (unsigned Part
= 1; Part
!= UF
; ++Part
) {
204 Part
> 1 ? "step.add." + std::to_string(Part
) : "step.add";
206 VPInstruction
*Add
= Builder
.createNaryOp(AddOpc
,
211 FMFs
, IV
->getDebugLoc(), Name
);
213 addRecipeForPart(IV
, Add
, Part
);
216 IV
->addOperand(VectorStep
);
217 IV
->addOperand(Prev
);
220 void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe
*R
,
221 VPBasicBlock::iterator InsertPtForPhi
) {
222 // First-order recurrences pass a single vector or scalar through their header
223 // phis, irrespective of interleaving.
224 if (isa
<VPFirstOrderRecurrencePHIRecipe
>(R
))
227 // Generate step vectors for each unrolled part.
228 if (auto *IV
= dyn_cast
<VPWidenIntOrFpInductionRecipe
>(R
)) {
229 unrollWidenInductionByUF(IV
, InsertPtForPhi
);
233 auto *RdxPhi
= dyn_cast
<VPReductionPHIRecipe
>(R
);
234 if (RdxPhi
&& RdxPhi
->isOrdered())
237 auto InsertPt
= std::next(R
->getIterator());
238 for (unsigned Part
= 1; Part
!= UF
; ++Part
) {
239 VPRecipeBase
*Copy
= R
->clone();
240 Copy
->insertBefore(*R
->getParent(), InsertPt
);
241 addRecipeForPart(R
, Copy
, Part
);
242 if (isa
<VPWidenPointerInductionRecipe
>(R
)) {
244 Copy
->addOperand(getConstantVPV(Part
));
246 Copy
->addOperand(getConstantVPV(Part
));
248 assert(isa
<VPActiveLaneMaskPHIRecipe
>(R
) &&
249 "unexpected header phi recipe not needing unrolled part");
254 /// Handle non-header-phi recipes.
255 void UnrollState::unrollRecipeByUF(VPRecipeBase
&R
) {
256 using namespace llvm::VPlanPatternMatch
;
257 if (match(&R
, m_BranchOnCond(m_VPValue())) ||
258 match(&R
, m_BranchOnCount(m_VPValue(), m_VPValue())))
261 if (auto *VPI
= dyn_cast
<VPInstruction
>(&R
)) {
262 if (vputils::onlyFirstPartUsed(VPI
)) {
263 addUniformForAllParts(VPI
);
267 if (auto *RepR
= dyn_cast
<VPReplicateRecipe
>(&R
)) {
268 if (isa
<StoreInst
>(RepR
->getUnderlyingValue()) &&
269 RepR
->getOperand(1)->isDefinedOutsideLoopRegions()) {
270 // Stores to an invariant address only need to store the last part.
271 remapOperands(&R
, UF
- 1);
274 if (auto *II
= dyn_cast
<IntrinsicInst
>(RepR
->getUnderlyingValue())) {
275 if (II
->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl
) {
276 addUniformForAllParts(RepR
);
282 // Unroll non-uniform recipes.
283 auto InsertPt
= std::next(R
.getIterator());
284 VPBasicBlock
&VPBB
= *R
.getParent();
285 for (unsigned Part
= 1; Part
!= UF
; ++Part
) {
286 VPRecipeBase
*Copy
= R
.clone();
287 Copy
->insertBefore(VPBB
, InsertPt
);
288 addRecipeForPart(&R
, Copy
, Part
);
291 if (match(&R
, m_VPInstruction
<VPInstruction::FirstOrderRecurrenceSplice
>(
292 m_VPValue(), m_VPValue(Op
)))) {
293 Copy
->setOperand(0, getValueForPart(Op
, Part
- 1));
294 Copy
->setOperand(1, getValueForPart(Op
, Part
));
297 if (auto *Red
= dyn_cast
<VPReductionRecipe
>(&R
)) {
298 auto *Phi
= cast
<VPReductionPHIRecipe
>(R
.getOperand(0));
299 if (Phi
->isOrdered()) {
300 auto &Parts
= VPV2Parts
[Phi
];
303 Parts
.push_back(Red
);
305 Parts
.push_back(Copy
->getVPSingleValue());
306 Phi
->setOperand(1, Copy
->getVPSingleValue());
309 remapOperands(Copy
, Part
);
311 // Add operand indicating the part to generate code for, to recipes still
313 if (isa
<VPScalarIVStepsRecipe
, VPWidenCanonicalIVRecipe
,
314 VPVectorPointerRecipe
, VPReverseVectorPointerRecipe
>(Copy
) ||
315 match(Copy
, m_VPInstruction
<VPInstruction::CanonicalIVIncrementForPart
>(
317 Copy
->addOperand(getConstantVPV(Part
));
319 if (isa
<VPVectorPointerRecipe
, VPReverseVectorPointerRecipe
>(R
))
320 Copy
->setOperand(0, R
.getOperand(0));
324 using namespace llvm::VPlanPatternMatch
;
325 void UnrollState::unrollBlock(VPBlockBase
*VPB
) {
326 auto *VPR
= dyn_cast
<VPRegionBlock
>(VPB
);
328 if (VPR
->isReplicator())
329 return unrollReplicateRegionByUF(VPR
);
331 // Traverse blocks in region in RPO to ensure defs are visited before uses
333 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<VPBlockBase
*>>
334 RPOT(VPR
->getEntry());
335 for (VPBlockBase
*VPB
: RPOT
)
340 // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
341 auto *VPBB
= cast
<VPBasicBlock
>(VPB
);
342 auto InsertPtForPhi
= VPBB
->getFirstNonPhi();
343 for (VPRecipeBase
&R
: make_early_inc_range(*VPBB
)) {
344 if (ToSkip
.contains(&R
) || isa
<VPIRInstruction
>(&R
))
347 // Add all VPValues for all parts to ComputeReductionResult which combines
348 // the parts to compute the final reduction value.
350 if (match(&R
, m_VPInstruction
<VPInstruction::ComputeReductionResult
>(
351 m_VPValue(), m_VPValue(Op1
)))) {
352 addUniformForAllParts(cast
<VPInstruction
>(&R
));
353 for (unsigned Part
= 1; Part
!= UF
; ++Part
)
354 R
.addOperand(getValueForPart(Op1
, Part
));
358 if (match(&R
, m_VPInstruction
<VPInstruction::ExtractFromEnd
>(
359 m_VPValue(Op0
), m_VPValue(Op1
)))) {
360 addUniformForAllParts(cast
<VPSingleDefRecipe
>(&R
));
361 if (Plan
.hasScalarVFOnly()) {
362 // Extracting from end with VF = 1 implies retrieving the scalar part UF
365 cast
<ConstantInt
>(Op1
->getLiveInIRValue())->getZExtValue();
366 R
.getVPSingleValue()->replaceAllUsesWith(
367 getValueForPart(Op0
, UF
- Offset
));
370 // Otherwise we extract from the last part.
371 remapOperands(&R
, UF
- 1);
376 auto *SingleDef
= dyn_cast
<VPSingleDefRecipe
>(&R
);
377 if (SingleDef
&& vputils::isUniformAcrossVFsAndUFs(SingleDef
)) {
378 addUniformForAllParts(SingleDef
);
382 if (auto *H
= dyn_cast
<VPHeaderPHIRecipe
>(&R
)) {
383 unrollHeaderPHIByUF(H
, InsertPtForPhi
);
391 void VPlanTransforms::unrollByUF(VPlan
&Plan
, unsigned UF
, LLVMContext
&Ctx
) {
392 assert(UF
> 0 && "Unroll factor must be positive");
394 auto Cleanup
= make_scope_exit([&Plan
]() {
395 auto Iter
= vp_depth_first_deep(Plan
.getEntry());
396 // Remove recipes that are redundant after unrolling.
397 for (VPBasicBlock
*VPBB
: VPBlockUtils::blocksOnly
<VPBasicBlock
>(Iter
)) {
398 for (VPRecipeBase
&R
: make_early_inc_range(*VPBB
)) {
399 auto *VPI
= dyn_cast
<VPInstruction
>(&R
);
401 VPI
->getOpcode() == VPInstruction::CanonicalIVIncrementForPart
&&
402 VPI
->getNumOperands() == 1) {
403 VPI
->replaceAllUsesWith(VPI
->getOperand(0));
404 VPI
->eraseFromParent();
413 UnrollState
Unroller(Plan
, UF
, Ctx
);
415 // Iterate over all blocks in the plan starting from Entry, and unroll
416 // recipes inside them. This includes the vector preheader and middle blocks,
417 // which may set up or post-process per-part values.
418 ReversePostOrderTraversal
<VPBlockShallowTraversalWrapper
<VPBlockBase
*>> RPOT(
420 for (VPBlockBase
*VPB
: RPOT
)
421 Unroller
.unrollBlock(VPB
);
424 // Remap operands of cloned header phis to update backedge values. The header
425 // phis cloned during unrolling are just after the header phi for part 0.
426 // Reset Part to 1 when reaching the first (part 0) recipe of a block.
427 for (VPRecipeBase
&H
:
428 Plan
.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
429 // The second operand of Fixed Order Recurrence phi's, feeding the spliced
430 // value across the backedge, needs to remap to the last part of the spliced
432 if (isa
<VPFirstOrderRecurrencePHIRecipe
>(&H
)) {
433 Unroller
.remapOperand(&H
, 1, UF
- 1);
436 if (Unroller
.contains(H
.getVPSingleValue()) ||
437 isa
<VPWidenPointerInductionRecipe
>(&H
)) {
441 Unroller
.remapOperands(&H
, Part
);
445 VPlanTransforms::removeDeadRecipes(Plan
);