//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
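  // For example, when unrolling by UF = 3, a recipe defining %v in part 0 is
  // cloned for parts 1 and 2, and VPV2Parts records %v -> {%v.1, %v.2}
  // (illustrative names).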

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }
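
  // Lookup sketch, continuing the example above: getValueForPart(%v, 2)
  // returns %v.2, stored at VPV2Parts[%v][1]; live-ins and part-0 queries
  // return the value itself (illustrative names).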

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part \p
  /// Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);
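
    // Walk the cloned region and the part-0 region in lockstep, block by
    // block and recipe by recipe, remapping the clone's operands and
    // recording each cloned recipe as the part-Part instance of its original.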
    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  std::optional<FastMathFlags> FMFs;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    FMFs = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *VectorStep = &Plan.getVF();
  VPBuilder Builder(PH);
  if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
    Instruction::CastOps CastOp =
        IVTy->isFloatingPointTy() ? Instruction::UIToFP : Instruction::Trunc;
    VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
    ToSkip.insert(VectorStep->getDefiningRecipe());
  }

  VPValue *ScalarStep = IV->getStepValue();
  auto *ConstStep = ScalarStep->isLiveIn()
                        ? dyn_cast<ConstantInt>(ScalarStep->getLiveInIRValue())
                        : nullptr;
  if (!ConstStep || ConstStep->getValue() != 1) {
    if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
      ScalarStep =
          Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
      ToSkip.insert(ScalarStep->getDefiningRecipe());
    }

    unsigned MulOpc =
        IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
    VPInstruction *Mul = Builder.createNaryOp(MulOpc, {VectorStep, ScalarStep},
                                              FMFs, IV->getDebugLoc());
    VectorStep = Mul;
    ToSkip.insert(Mul);
  }
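
  // At this point VectorStep holds the per-part stride: VF for unit-step
  // inductions, otherwise conceptually
  //   %VectorStep = mul VF, %ScalarStep   ; fmul for FP inductions
  // (illustrative notation).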

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc, {Prev, VectorStep}, FMFs,
                                              IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }
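
  // Ordered reductions keep a single header phi; their per-part reduction
  // recipes are chained sequentially by unrollRecipeByUF instead, so there is
  // nothing to clone here.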
  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  using namespace llvm::VPlanPatternMatch;
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);
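
    // A first-order recurrence splice combines values of two consecutive
    // parts; the clone for part P splices parts P-1 and P, e.g. for part 1:
    //   %s.1 = FirstOrderRecurrenceSplice %r.0, %r.1
    // (illustrative names).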
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
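    // Ordered reductions form a single chain across parts: record the chain
    // {%red.0, ..., %red.UF-1} as the phi's parts and move the phi's backedge
    // operand to the latest part's reduction (illustrative names).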
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(Copy) ||
        match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                        m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

using namespace llvm::VPlanPatternMatch;
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to ComputeReductionResult which combines
    // the parts to compute the final reduction value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractFromEnd>(
                      m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        // Extracting from end with VF = 1 implies retrieving the scalar part
        // UF - Op1.
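        // E.g., with UF = 4 and an offset Op1 == 2 (the penultimate element),
        // the extract resolves to the value computed for part 2 (UF - 2;
        // illustrative).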
        unsigned Offset =
            cast<ConstantInt>(Op1->getLiveInIRValue())->getZExtValue();
        R.getVPSingleValue()->replaceAllUsesWith(
            getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
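  // E.g., with UF = 3 the phis appear as %p.0, %p.1, %p.2, %q.0, ...; hitting
  // a part-0 phi resets Part to 1, and its clones are then remapped with Part
  // = 1 and 2 in turn (illustrative names).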
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}