[SimplifyCFG] FoldTwoEntryPHINode(): consider *total* speculation cost, not per-BB...
[llvm-complete.git] / lib / Transforms / Vectorize / VPlanSLP.cpp
blob9019ed15ec5ff64aebfffbbac52bcdc21cb8f98e
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
10 ///
11 /// Look-ahead SLP: auto-vectorization in the presence of commutative
12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13 /// Luís F. W. Góes
14 ///
15 //===----------------------------------------------------------------------===//
17 #include "VPlan.h"
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 #include <cassert>
39 #include <iterator>
40 #include <string>
41 #include <vector>
43 using namespace llvm;
45 #define DEBUG_TYPE "vplan-slp"
47 // Number of levels to look ahead when re-ordering multi node operands.
48 static unsigned LookaheadMaxDepth = 5;
50 VPInstruction *VPlanSlp::markFailed() {
51 // FIXME: Currently this is used to signal we hit instructions we cannot
52 // trivially SLP'ize.
53 CompletelySLP = false;
54 return nullptr;
57 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
58 if (all_of(Operands, [](VPValue *V) {
59 return cast<VPInstruction>(V)->getUnderlyingInstr();
60 })) {
61 unsigned BundleSize = 0;
62 for (VPValue *V : Operands) {
63 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
64 assert(!T->isVectorTy() && "Only scalar types supported for now");
65 BundleSize += T->getScalarSizeInBits();
67 WidestBundleBits = std::max(WidestBundleBits, BundleSize);
70 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
71 assert(Res.second &&
72 "Already created a combined instruction for the operand bundle");
73 (void)Res;
76 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
77 // Currently we only support VPInstructions.
78 if (!all_of(Operands, [](VPValue *Op) {
79 return Op && isa<VPInstruction>(Op) &&
80 cast<VPInstruction>(Op)->getUnderlyingInstr();
81 })) {
82 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
83 return false;
86 // Check if opcodes and type width agree for all instructions in the bundle.
87 // FIXME: Differing widths/opcodes can be handled by inserting additional
88 // instructions.
89 // FIXME: Deal with non-primitive types.
90 const Instruction *OriginalInstr =
91 cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
92 unsigned Opcode = OriginalInstr->getOpcode();
93 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
94 if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
95 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
96 return I->getOpcode() == Opcode &&
97 I->getType()->getPrimitiveSizeInBits() == Width;
98 })) {
99 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
100 return false;
103 // For now, all operands must be defined in the same BB.
104 if (any_of(Operands, [this](VPValue *Op) {
105 return cast<VPInstruction>(Op)->getParent() != &this->BB;
106 })) {
107 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
108 return false;
111 if (any_of(Operands,
112 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
113 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
114 return false;
117 // For loads, check that there are no instructions writing to memory in
118 // between them.
119 // TODO: we only have to forbid instructions writing to memory that could
120 // interfere with any of the loads in the bundle
121 if (Opcode == Instruction::Load) {
122 unsigned LoadsSeen = 0;
123 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
124 for (auto &I : *Parent) {
125 auto *VPI = cast<VPInstruction>(&I);
126 if (VPI->getOpcode() == Instruction::Load &&
127 std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
128 LoadsSeen++;
130 if (LoadsSeen == Operands.size())
131 break;
132 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
133 LLVM_DEBUG(
134 dbgs() << "VPSLP: instruction modifying memory between loads\n");
135 return false;
139 if (!all_of(Operands, [](VPValue *Op) {
140 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
141 ->isSimple();
142 })) {
143 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
144 return false;
148 if (Opcode == Instruction::Store)
149 if (!all_of(Operands, [](VPValue *Op) {
150 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
151 ->isSimple();
152 })) {
153 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
154 return false;
157 return true;
160 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
161 unsigned OperandIndex) {
162 SmallVector<VPValue *, 4> Operands;
163 for (VPValue *V : Values) {
164 auto *U = cast<VPUser>(V);
165 Operands.push_back(U->getOperand(OperandIndex));
167 return Operands;
170 static bool areCommutative(ArrayRef<VPValue *> Values) {
171 return Instruction::isCommutative(
172 cast<VPInstruction>(Values[0])->getOpcode());
175 static SmallVector<SmallVector<VPValue *, 4>, 4>
176 getOperands(ArrayRef<VPValue *> Values) {
177 SmallVector<SmallVector<VPValue *, 4>, 4> Result;
178 auto *VPI = cast<VPInstruction>(Values[0]);
180 switch (VPI->getOpcode()) {
181 case Instruction::Load:
182 llvm_unreachable("Loads terminate a tree, no need to get operands");
183 case Instruction::Store:
184 Result.push_back(getOperands(Values, 0));
185 break;
186 default:
187 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
188 Result.push_back(getOperands(Values, I));
189 break;
192 return Result;
195 /// Returns the opcode of Values or ~0 if they do not all agree.
196 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
197 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
198 if (any_of(Values, [Opcode](VPValue *V) {
199 return cast<VPInstruction>(V)->getOpcode() != Opcode;
201 return None;
202 return {Opcode};
205 /// Returns true if A and B access sequential memory if they are loads or
206 /// stores or if they have identical opcodes otherwise.
207 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
208 VPInterleavedAccessInfo &IAI) {
209 if (A->getOpcode() != B->getOpcode())
210 return false;
212 if (A->getOpcode() != Instruction::Load &&
213 A->getOpcode() != Instruction::Store)
214 return true;
215 auto *GA = IAI.getInterleaveGroup(A);
216 auto *GB = IAI.getInterleaveGroup(B);
218 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
221 /// Implements getLAScore from Listing 7 in the paper.
222 /// Traverses and compares operands of V1 and V2 to MaxLevel.
223 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
224 VPInterleavedAccessInfo &IAI) {
225 if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
226 return 0;
228 if (MaxLevel == 0)
229 return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
230 cast<VPInstruction>(V2), IAI);
232 unsigned Score = 0;
233 for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
234 for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
235 Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
236 cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
237 return Score;
240 std::pair<VPlanSlp::OpMode, VPValue *>
241 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
242 SmallPtrSetImpl<VPValue *> &Candidates,
243 VPInterleavedAccessInfo &IAI) {
244 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
245 "Currently we only handle load and commutative opcodes");
246 LLVM_DEBUG(dbgs() << " getBest\n");
248 SmallVector<VPValue *, 4> BestCandidates;
249 LLVM_DEBUG(dbgs() << " Candidates for "
250 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
251 for (auto *Candidate : Candidates) {
252 auto *LastI = cast<VPInstruction>(Last);
253 auto *CandidateI = cast<VPInstruction>(Candidate);
254 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
255 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
256 << " ");
257 BestCandidates.push_back(Candidate);
260 LLVM_DEBUG(dbgs() << "\n");
262 if (BestCandidates.empty())
263 return {OpMode::Failed, nullptr};
265 if (BestCandidates.size() == 1)
266 return {Mode, BestCandidates[0]};
268 VPValue *Best = nullptr;
269 unsigned BestScore = 0;
270 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
271 unsigned PrevScore = ~0u;
272 bool AllSame = true;
274 // FIXME: Avoid visiting the same operands multiple times.
275 for (auto *Candidate : BestCandidates) {
276 unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
277 if (PrevScore == ~0u)
278 PrevScore = Score;
279 if (PrevScore != Score)
280 AllSame = false;
281 PrevScore = Score;
283 if (Score > BestScore) {
284 BestScore = Score;
285 Best = Candidate;
288 if (!AllSame)
289 break;
291 LLVM_DEBUG(dbgs() << "Found best "
292 << *cast<VPInstruction>(Best)->getUnderlyingInstr()
293 << "\n");
294 Candidates.erase(Best);
296 return {Mode, Best};
299 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
300 SmallVector<MultiNodeOpTy, 4> FinalOrder;
301 SmallVector<OpMode, 4> Mode;
302 FinalOrder.reserve(MultiNodeOps.size());
303 Mode.reserve(MultiNodeOps.size());
305 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
307 for (auto &Operands : MultiNodeOps) {
308 FinalOrder.push_back({Operands.first, {Operands.second[0]}});
309 if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
310 Instruction::Load)
311 Mode.push_back(OpMode::Load);
312 else
313 Mode.push_back(OpMode::Opcode);
316 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
317 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
318 SmallPtrSet<VPValue *, 4> Candidates;
319 LLVM_DEBUG(dbgs() << " Candidates ");
320 for (auto Ops : MultiNodeOps) {
321 LLVM_DEBUG(
322 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
323 << " ");
324 Candidates.insert(Ops.second[Lane]);
326 LLVM_DEBUG(dbgs() << "\n");
328 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
329 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
330 if (Mode[Op] == OpMode::Failed)
331 continue;
333 VPValue *Last = FinalOrder[Op].second[Lane - 1];
334 std::pair<OpMode, VPValue *> Res =
335 getBest(Mode[Op], Last, Candidates, IAI);
336 if (Res.second)
337 FinalOrder[Op].second.push_back(Res.second);
338 else
339 // TODO: handle this case
340 FinalOrder[Op].second.push_back(markFailed());
344 return FinalOrder;
347 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
348 dbgs() << " Ops: ";
349 for (auto Op : Values) {
350 if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
351 if (auto *Instr = VPInstr->getUnderlyingInstr()) {
352 dbgs() << *Instr << " | ";
353 continue;
355 dbgs() << " nullptr | ";
357 dbgs() << "\n";
360 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
361 assert(!Values.empty() && "Need some operands!");
363 // If we already visited this instruction bundle, re-use the existing node
364 auto I = BundleToCombined.find(to_vector<4>(Values));
365 if (I != BundleToCombined.end()) {
366 #ifndef NDEBUG
367 // Check that the resulting graph is a tree. If we re-use a node, this means
368 // its values have multiple users. We only allow this, if all users of each
369 // value are the same instruction.
370 for (auto *V : Values) {
371 auto UI = V->user_begin();
372 auto *FirstUser = *UI++;
373 while (UI != V->user_end()) {
374 assert(*UI == FirstUser && "Currently we only support SLP trees.");
375 UI++;
378 #endif
379 return I->second;
382 // Dump inputs
383 LLVM_DEBUG({
384 dbgs() << "buildGraph: ";
385 dumpBundle(Values);
388 if (!areVectorizable(Values))
389 return markFailed();
391 assert(getOpcode(Values) && "Opcodes for all values must match");
392 unsigned ValuesOpcode = getOpcode(Values).getValue();
394 SmallVector<VPValue *, 4> CombinedOperands;
395 if (areCommutative(Values)) {
396 bool MultiNodeRoot = !MultiNodeActive;
397 MultiNodeActive = true;
398 for (auto &Operands : getOperands(Values)) {
399 LLVM_DEBUG({
400 dbgs() << " Visiting Commutative";
401 dumpBundle(Operands);
404 auto OperandsOpcode = getOpcode(Operands);
405 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
406 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
407 CombinedOperands.push_back(buildGraph(Operands));
408 } else {
409 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
410 // Create dummy VPInstruction, which will we replace later by the
411 // re-ordered operand.
412 VPInstruction *Op = new VPInstruction(0, {});
413 CombinedOperands.push_back(Op);
414 MultiNodeOps.emplace_back(Op, Operands);
418 if (MultiNodeRoot) {
419 LLVM_DEBUG(dbgs() << "Reorder \n");
420 MultiNodeActive = false;
422 auto FinalOrder = reorderMultiNodeOps();
424 MultiNodeOps.clear();
425 for (auto &Ops : FinalOrder) {
426 VPInstruction *NewOp = buildGraph(Ops.second);
427 Ops.first->replaceAllUsesWith(NewOp);
428 for (unsigned i = 0; i < CombinedOperands.size(); i++)
429 if (CombinedOperands[i] == Ops.first)
430 CombinedOperands[i] = NewOp;
431 delete Ops.first;
432 Ops.first = NewOp;
434 LLVM_DEBUG(dbgs() << "Found final order\n");
436 } else {
437 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
438 if (ValuesOpcode == Instruction::Load)
439 for (VPValue *V : Values)
440 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
441 else
442 for (auto &Operands : getOperands(Values))
443 CombinedOperands.push_back(buildGraph(Operands));
446 unsigned Opcode;
447 switch (ValuesOpcode) {
448 case Instruction::Load:
449 Opcode = VPInstruction::SLPLoad;
450 break;
451 case Instruction::Store:
452 Opcode = VPInstruction::SLPStore;
453 break;
454 default:
455 Opcode = ValuesOpcode;
456 break;
459 if (!CompletelySLP)
460 return markFailed();
462 assert(CombinedOperands.size() > 0 && "Need more some operands");
463 auto *VPI = new VPInstruction(Opcode, CombinedOperands);
464 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
466 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
467 cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
468 addCombined(Values, VPI);
469 return VPI;