[ORC] Add std::tuple support to SimplePackedSerialization.
[llvm-project.git] / llvm / lib / Transforms / Vectorize / VPlanSLP.cpp
blobc52c8a2229e8fe344911e1d4a156cf70529289f2
1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
10 ///
11 /// Look-ahead SLP: auto-vectorization in the presence of commutative
12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13 /// Luís F. W. Góes
14 ///
15 //===----------------------------------------------------------------------===//
17 #include "VPlan.h"
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
38 #include <algorithm>
39 #include <cassert>
40 #include <iterator>
41 #include <utility>
43 using namespace llvm;
45 #define DEBUG_TYPE "vplan-slp"
47 // Number of levels to look ahead when re-ordering multi node operands.
48 static unsigned LookaheadMaxDepth = 5;
50 VPInstruction *VPlanSlp::markFailed() {
51 // FIXME: Currently this is used to signal we hit instructions we cannot
52 // trivially SLP'ize.
53 CompletelySLP = false;
54 return nullptr;
57 void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
58 if (all_of(Operands, [](VPValue *V) {
59 return cast<VPInstruction>(V)->getUnderlyingInstr();
60 })) {
61 unsigned BundleSize = 0;
62 for (VPValue *V : Operands) {
63 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
64 assert(!T->isVectorTy() && "Only scalar types supported for now");
65 BundleSize += T->getScalarSizeInBits();
67 WidestBundleBits = std::max(WidestBundleBits, BundleSize);
70 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
71 assert(Res.second &&
72 "Already created a combined instruction for the operand bundle");
73 (void)Res;
76 bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
77 // Currently we only support VPInstructions.
78 if (!all_of(Operands, [](VPValue *Op) {
79 return Op && isa<VPInstruction>(Op) &&
80 cast<VPInstruction>(Op)->getUnderlyingInstr();
81 })) {
82 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
83 return false;
86 // Check if opcodes and type width agree for all instructions in the bundle.
87 // FIXME: Differing widths/opcodes can be handled by inserting additional
88 // instructions.
89 // FIXME: Deal with non-primitive types.
90 const Instruction *OriginalInstr =
91 cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
92 unsigned Opcode = OriginalInstr->getOpcode();
93 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
94 if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
95 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
96 return I->getOpcode() == Opcode &&
97 I->getType()->getPrimitiveSizeInBits() == Width;
98 })) {
99 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
100 return false;
103 // For now, all operands must be defined in the same BB.
104 if (any_of(Operands, [this](VPValue *Op) {
105 return cast<VPInstruction>(Op)->getParent() != &this->BB;
106 })) {
107 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
108 return false;
111 if (any_of(Operands,
112 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
113 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
114 return false;
117 // For loads, check that there are no instructions writing to memory in
118 // between them.
119 // TODO: we only have to forbid instructions writing to memory that could
120 // interfere with any of the loads in the bundle
121 if (Opcode == Instruction::Load) {
122 unsigned LoadsSeen = 0;
123 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
124 for (auto &I : *Parent) {
125 auto *VPI = dyn_cast<VPInstruction>(&I);
126 if (!VPI)
127 break;
128 if (VPI->getOpcode() == Instruction::Load &&
129 llvm::is_contained(Operands, VPI))
130 LoadsSeen++;
132 if (LoadsSeen == Operands.size())
133 break;
134 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
135 LLVM_DEBUG(
136 dbgs() << "VPSLP: instruction modifying memory between loads\n");
137 return false;
141 if (!all_of(Operands, [](VPValue *Op) {
142 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
143 ->isSimple();
144 })) {
145 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
146 return false;
150 if (Opcode == Instruction::Store)
151 if (!all_of(Operands, [](VPValue *Op) {
152 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
153 ->isSimple();
154 })) {
155 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
156 return false;
159 return true;
162 static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
163 unsigned OperandIndex) {
164 SmallVector<VPValue *, 4> Operands;
165 for (VPValue *V : Values) {
166 // Currently we only support VPInstructions.
167 auto *U = cast<VPInstruction>(V);
168 Operands.push_back(U->getOperand(OperandIndex));
170 return Operands;
173 static bool areCommutative(ArrayRef<VPValue *> Values) {
174 return Instruction::isCommutative(
175 cast<VPInstruction>(Values[0])->getOpcode());
178 static SmallVector<SmallVector<VPValue *, 4>, 4>
179 getOperands(ArrayRef<VPValue *> Values) {
180 SmallVector<SmallVector<VPValue *, 4>, 4> Result;
181 auto *VPI = cast<VPInstruction>(Values[0]);
183 switch (VPI->getOpcode()) {
184 case Instruction::Load:
185 llvm_unreachable("Loads terminate a tree, no need to get operands");
186 case Instruction::Store:
187 Result.push_back(getOperands(Values, 0));
188 break;
189 default:
190 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
191 Result.push_back(getOperands(Values, I));
192 break;
195 return Result;
198 /// Returns the opcode of Values or ~0 if they do not all agree.
199 static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
200 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
201 if (any_of(Values, [Opcode](VPValue *V) {
202 return cast<VPInstruction>(V)->getOpcode() != Opcode;
204 return None;
205 return {Opcode};
208 /// Returns true if A and B access sequential memory if they are loads or
209 /// stores or if they have identical opcodes otherwise.
210 static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
211 VPInterleavedAccessInfo &IAI) {
212 if (A->getOpcode() != B->getOpcode())
213 return false;
215 if (A->getOpcode() != Instruction::Load &&
216 A->getOpcode() != Instruction::Store)
217 return true;
218 auto *GA = IAI.getInterleaveGroup(A);
219 auto *GB = IAI.getInterleaveGroup(B);
221 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
224 /// Implements getLAScore from Listing 7 in the paper.
225 /// Traverses and compares operands of V1 and V2 to MaxLevel.
226 static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
227 VPInterleavedAccessInfo &IAI) {
228 auto *I1 = dyn_cast<VPInstruction>(V1);
229 auto *I2 = dyn_cast<VPInstruction>(V2);
230 // Currently we only support VPInstructions.
231 if (!I1 || !I2)
232 return 0;
234 if (MaxLevel == 0)
235 return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
237 unsigned Score = 0;
238 for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
239 for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
240 Score +=
241 getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
242 return Score;
245 std::pair<VPlanSlp::OpMode, VPValue *>
246 VPlanSlp::getBest(OpMode Mode, VPValue *Last,
247 SmallPtrSetImpl<VPValue *> &Candidates,
248 VPInterleavedAccessInfo &IAI) {
249 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
250 "Currently we only handle load and commutative opcodes");
251 LLVM_DEBUG(dbgs() << " getBest\n");
253 SmallVector<VPValue *, 4> BestCandidates;
254 LLVM_DEBUG(dbgs() << " Candidates for "
255 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
256 for (auto *Candidate : Candidates) {
257 auto *LastI = cast<VPInstruction>(Last);
258 auto *CandidateI = cast<VPInstruction>(Candidate);
259 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
260 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
261 << " ");
262 BestCandidates.push_back(Candidate);
265 LLVM_DEBUG(dbgs() << "\n");
267 if (BestCandidates.empty())
268 return {OpMode::Failed, nullptr};
270 if (BestCandidates.size() == 1)
271 return {Mode, BestCandidates[0]};
273 VPValue *Best = nullptr;
274 unsigned BestScore = 0;
275 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
276 unsigned PrevScore = ~0u;
277 bool AllSame = true;
279 // FIXME: Avoid visiting the same operands multiple times.
280 for (auto *Candidate : BestCandidates) {
281 unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
282 if (PrevScore == ~0u)
283 PrevScore = Score;
284 if (PrevScore != Score)
285 AllSame = false;
286 PrevScore = Score;
288 if (Score > BestScore) {
289 BestScore = Score;
290 Best = Candidate;
293 if (!AllSame)
294 break;
296 LLVM_DEBUG(dbgs() << "Found best "
297 << *cast<VPInstruction>(Best)->getUnderlyingInstr()
298 << "\n");
299 Candidates.erase(Best);
301 return {Mode, Best};
304 SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
305 SmallVector<MultiNodeOpTy, 4> FinalOrder;
306 SmallVector<OpMode, 4> Mode;
307 FinalOrder.reserve(MultiNodeOps.size());
308 Mode.reserve(MultiNodeOps.size());
310 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
312 for (auto &Operands : MultiNodeOps) {
313 FinalOrder.push_back({Operands.first, {Operands.second[0]}});
314 if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
315 Instruction::Load)
316 Mode.push_back(OpMode::Load);
317 else
318 Mode.push_back(OpMode::Opcode);
321 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
322 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
323 SmallPtrSet<VPValue *, 4> Candidates;
324 LLVM_DEBUG(dbgs() << " Candidates ");
325 for (auto Ops : MultiNodeOps) {
326 LLVM_DEBUG(
327 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
328 << " ");
329 Candidates.insert(Ops.second[Lane]);
331 LLVM_DEBUG(dbgs() << "\n");
333 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
334 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
335 if (Mode[Op] == OpMode::Failed)
336 continue;
338 VPValue *Last = FinalOrder[Op].second[Lane - 1];
339 std::pair<OpMode, VPValue *> Res =
340 getBest(Mode[Op], Last, Candidates, IAI);
341 if (Res.second)
342 FinalOrder[Op].second.push_back(Res.second);
343 else
344 // TODO: handle this case
345 FinalOrder[Op].second.push_back(markFailed());
349 return FinalOrder;
352 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
353 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
354 dbgs() << " Ops: ";
355 for (auto Op : Values) {
356 if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
357 if (auto *Instr = VPInstr->getUnderlyingInstr()) {
358 dbgs() << *Instr << " | ";
359 continue;
361 dbgs() << " nullptr | ";
363 dbgs() << "\n";
365 #endif
367 VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
368 assert(!Values.empty() && "Need some operands!");
370 // If we already visited this instruction bundle, re-use the existing node
371 auto I = BundleToCombined.find(to_vector<4>(Values));
372 if (I != BundleToCombined.end()) {
373 #ifndef NDEBUG
374 // Check that the resulting graph is a tree. If we re-use a node, this means
375 // its values have multiple users. We only allow this, if all users of each
376 // value are the same instruction.
377 for (auto *V : Values) {
378 auto UI = V->user_begin();
379 auto *FirstUser = *UI++;
380 while (UI != V->user_end()) {
381 assert(*UI == FirstUser && "Currently we only support SLP trees.");
382 UI++;
385 #endif
386 return I->second;
389 // Dump inputs
390 LLVM_DEBUG({
391 dbgs() << "buildGraph: ";
392 dumpBundle(Values);
395 if (!areVectorizable(Values))
396 return markFailed();
398 assert(getOpcode(Values) && "Opcodes for all values must match");
399 unsigned ValuesOpcode = getOpcode(Values).getValue();
401 SmallVector<VPValue *, 4> CombinedOperands;
402 if (areCommutative(Values)) {
403 bool MultiNodeRoot = !MultiNodeActive;
404 MultiNodeActive = true;
405 for (auto &Operands : getOperands(Values)) {
406 LLVM_DEBUG({
407 dbgs() << " Visiting Commutative";
408 dumpBundle(Operands);
411 auto OperandsOpcode = getOpcode(Operands);
412 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
413 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
414 CombinedOperands.push_back(buildGraph(Operands));
415 } else {
416 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
417 // Create dummy VPInstruction, which will we replace later by the
418 // re-ordered operand.
419 VPInstruction *Op = new VPInstruction(0, {});
420 CombinedOperands.push_back(Op);
421 MultiNodeOps.emplace_back(Op, Operands);
425 if (MultiNodeRoot) {
426 LLVM_DEBUG(dbgs() << "Reorder \n");
427 MultiNodeActive = false;
429 auto FinalOrder = reorderMultiNodeOps();
431 MultiNodeOps.clear();
432 for (auto &Ops : FinalOrder) {
433 VPInstruction *NewOp = buildGraph(Ops.second);
434 Ops.first->replaceAllUsesWith(NewOp);
435 for (unsigned i = 0; i < CombinedOperands.size(); i++)
436 if (CombinedOperands[i] == Ops.first)
437 CombinedOperands[i] = NewOp;
438 delete Ops.first;
439 Ops.first = NewOp;
441 LLVM_DEBUG(dbgs() << "Found final order\n");
443 } else {
444 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
445 if (ValuesOpcode == Instruction::Load)
446 for (VPValue *V : Values)
447 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
448 else
449 for (auto &Operands : getOperands(Values))
450 CombinedOperands.push_back(buildGraph(Operands));
453 unsigned Opcode;
454 switch (ValuesOpcode) {
455 case Instruction::Load:
456 Opcode = VPInstruction::SLPLoad;
457 break;
458 case Instruction::Store:
459 Opcode = VPInstruction::SLPStore;
460 break;
461 default:
462 Opcode = ValuesOpcode;
463 break;
466 if (!CompletelySLP)
467 return markFailed();
469 assert(CombinedOperands.size() > 0 && "Need more some operands");
470 auto *VPI = new VPInstruction(Opcode, CombinedOperands);
471 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
473 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
474 << *cast<VPInstruction>(Values[0]) << "\n");
475 addCombined(Values, VPI);
476 return VPI;