1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
11 /// Look-ahead SLP: auto-vectorization in the presence of commutative
12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
15 //===----------------------------------------------------------------------===//
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "vplan-slp"

// Number of levels to look ahead when re-ordering multi node operands.
static unsigned LookaheadMaxDepth = 5;
50 VPInstruction
*VPlanSlp::markFailed() {
51 // FIXME: Currently this is used to signal we hit instructions we cannot
53 CompletelySLP
= false;
57 void VPlanSlp::addCombined(ArrayRef
<VPValue
*> Operands
, VPInstruction
*New
) {
58 if (all_of(Operands
, [](VPValue
*V
) {
59 return cast
<VPInstruction
>(V
)->getUnderlyingInstr();
61 unsigned BundleSize
= 0;
62 for (VPValue
*V
: Operands
) {
63 Type
*T
= cast
<VPInstruction
>(V
)->getUnderlyingInstr()->getType();
64 assert(!T
->isVectorTy() && "Only scalar types supported for now");
65 BundleSize
+= T
->getScalarSizeInBits();
67 WidestBundleBits
= std::max(WidestBundleBits
, BundleSize
);
70 auto Res
= BundleToCombined
.try_emplace(to_vector
<4>(Operands
), New
);
72 "Already created a combined instruction for the operand bundle");
76 bool VPlanSlp::areVectorizable(ArrayRef
<VPValue
*> Operands
) const {
77 // Currently we only support VPInstructions.
78 if (!all_of(Operands
, [](VPValue
*Op
) {
79 return Op
&& isa
<VPInstruction
>(Op
) &&
80 cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
82 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
86 // Check if opcodes and type width agree for all instructions in the bundle.
87 // FIXME: Differing widths/opcodes can be handled by inserting additional
89 // FIXME: Deal with non-primitive types.
90 const Instruction
*OriginalInstr
=
91 cast
<VPInstruction
>(Operands
[0])->getUnderlyingInstr();
92 unsigned Opcode
= OriginalInstr
->getOpcode();
93 unsigned Width
= OriginalInstr
->getType()->getPrimitiveSizeInBits();
94 if (!all_of(Operands
, [Opcode
, Width
](VPValue
*Op
) {
95 const Instruction
*I
= cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
96 return I
->getOpcode() == Opcode
&&
97 I
->getType()->getPrimitiveSizeInBits() == Width
;
99 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
103 // For now, all operands must be defined in the same BB.
104 if (any_of(Operands
, [this](VPValue
*Op
) {
105 return cast
<VPInstruction
>(Op
)->getParent() != &this->BB
;
107 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
112 [](VPValue
*Op
) { return Op
->hasMoreThanOneUniqueUser(); })) {
113 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
117 // For loads, check that there are no instructions writing to memory in
119 // TODO: we only have to forbid instructions writing to memory that could
120 // interfere with any of the loads in the bundle
121 if (Opcode
== Instruction::Load
) {
122 unsigned LoadsSeen
= 0;
123 VPBasicBlock
*Parent
= cast
<VPInstruction
>(Operands
[0])->getParent();
124 for (auto &I
: *Parent
) {
125 auto *VPI
= dyn_cast
<VPInstruction
>(&I
);
128 if (VPI
->getOpcode() == Instruction::Load
&&
129 llvm::is_contained(Operands
, VPI
))
132 if (LoadsSeen
== Operands
.size())
134 if (LoadsSeen
> 0 && VPI
->mayWriteToMemory()) {
136 dbgs() << "VPSLP: instruction modifying memory between loads\n");
141 if (!all_of(Operands
, [](VPValue
*Op
) {
142 return cast
<LoadInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
145 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
150 if (Opcode
== Instruction::Store
)
151 if (!all_of(Operands
, [](VPValue
*Op
) {
152 return cast
<StoreInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
155 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
162 static SmallVector
<VPValue
*, 4> getOperands(ArrayRef
<VPValue
*> Values
,
163 unsigned OperandIndex
) {
164 SmallVector
<VPValue
*, 4> Operands
;
165 for (VPValue
*V
: Values
) {
166 // Currently we only support VPInstructions.
167 auto *U
= cast
<VPInstruction
>(V
);
168 Operands
.push_back(U
->getOperand(OperandIndex
));
173 static bool areCommutative(ArrayRef
<VPValue
*> Values
) {
174 return Instruction::isCommutative(
175 cast
<VPInstruction
>(Values
[0])->getOpcode());
178 static SmallVector
<SmallVector
<VPValue
*, 4>, 4>
179 getOperands(ArrayRef
<VPValue
*> Values
) {
180 SmallVector
<SmallVector
<VPValue
*, 4>, 4> Result
;
181 auto *VPI
= cast
<VPInstruction
>(Values
[0]);
183 switch (VPI
->getOpcode()) {
184 case Instruction::Load
:
185 llvm_unreachable("Loads terminate a tree, no need to get operands");
186 case Instruction::Store
:
187 Result
.push_back(getOperands(Values
, 0));
190 for (unsigned I
= 0, NumOps
= VPI
->getNumOperands(); I
< NumOps
; ++I
)
191 Result
.push_back(getOperands(Values
, I
));
198 /// Returns the opcode of Values or ~0 if they do not all agree.
199 static Optional
<unsigned> getOpcode(ArrayRef
<VPValue
*> Values
) {
200 unsigned Opcode
= cast
<VPInstruction
>(Values
[0])->getOpcode();
201 if (any_of(Values
, [Opcode
](VPValue
*V
) {
202 return cast
<VPInstruction
>(V
)->getOpcode() != Opcode
;
208 /// Returns true if A and B access sequential memory if they are loads or
209 /// stores or if they have identical opcodes otherwise.
210 static bool areConsecutiveOrMatch(VPInstruction
*A
, VPInstruction
*B
,
211 VPInterleavedAccessInfo
&IAI
) {
212 if (A
->getOpcode() != B
->getOpcode())
215 if (A
->getOpcode() != Instruction::Load
&&
216 A
->getOpcode() != Instruction::Store
)
218 auto *GA
= IAI
.getInterleaveGroup(A
);
219 auto *GB
= IAI
.getInterleaveGroup(B
);
221 return GA
&& GB
&& GA
== GB
&& GA
->getIndex(A
) + 1 == GB
->getIndex(B
);
224 /// Implements getLAScore from Listing 7 in the paper.
225 /// Traverses and compares operands of V1 and V2 to MaxLevel.
226 static unsigned getLAScore(VPValue
*V1
, VPValue
*V2
, unsigned MaxLevel
,
227 VPInterleavedAccessInfo
&IAI
) {
228 auto *I1
= dyn_cast
<VPInstruction
>(V1
);
229 auto *I2
= dyn_cast
<VPInstruction
>(V2
);
230 // Currently we only support VPInstructions.
235 return (unsigned)areConsecutiveOrMatch(I1
, I2
, IAI
);
238 for (unsigned I
= 0, EV1
= I1
->getNumOperands(); I
< EV1
; ++I
)
239 for (unsigned J
= 0, EV2
= I2
->getNumOperands(); J
< EV2
; ++J
)
241 getLAScore(I1
->getOperand(I
), I2
->getOperand(J
), MaxLevel
- 1, IAI
);
245 std::pair
<VPlanSlp::OpMode
, VPValue
*>
246 VPlanSlp::getBest(OpMode Mode
, VPValue
*Last
,
247 SmallPtrSetImpl
<VPValue
*> &Candidates
,
248 VPInterleavedAccessInfo
&IAI
) {
249 assert((Mode
== OpMode::Load
|| Mode
== OpMode::Opcode
) &&
250 "Currently we only handle load and commutative opcodes");
251 LLVM_DEBUG(dbgs() << " getBest\n");
253 SmallVector
<VPValue
*, 4> BestCandidates
;
254 LLVM_DEBUG(dbgs() << " Candidates for "
255 << *cast
<VPInstruction
>(Last
)->getUnderlyingInstr() << " ");
256 for (auto *Candidate
: Candidates
) {
257 auto *LastI
= cast
<VPInstruction
>(Last
);
258 auto *CandidateI
= cast
<VPInstruction
>(Candidate
);
259 if (areConsecutiveOrMatch(LastI
, CandidateI
, IAI
)) {
260 LLVM_DEBUG(dbgs() << *cast
<VPInstruction
>(Candidate
)->getUnderlyingInstr()
262 BestCandidates
.push_back(Candidate
);
265 LLVM_DEBUG(dbgs() << "\n");
267 if (BestCandidates
.empty())
268 return {OpMode::Failed
, nullptr};
270 if (BestCandidates
.size() == 1)
271 return {Mode
, BestCandidates
[0]};
273 VPValue
*Best
= nullptr;
274 unsigned BestScore
= 0;
275 for (unsigned Depth
= 1; Depth
< LookaheadMaxDepth
; Depth
++) {
276 unsigned PrevScore
= ~0u;
279 // FIXME: Avoid visiting the same operands multiple times.
280 for (auto *Candidate
: BestCandidates
) {
281 unsigned Score
= getLAScore(Last
, Candidate
, Depth
, IAI
);
282 if (PrevScore
== ~0u)
284 if (PrevScore
!= Score
)
288 if (Score
> BestScore
) {
296 LLVM_DEBUG(dbgs() << "Found best "
297 << *cast
<VPInstruction
>(Best
)->getUnderlyingInstr()
299 Candidates
.erase(Best
);
304 SmallVector
<VPlanSlp::MultiNodeOpTy
, 4> VPlanSlp::reorderMultiNodeOps() {
305 SmallVector
<MultiNodeOpTy
, 4> FinalOrder
;
306 SmallVector
<OpMode
, 4> Mode
;
307 FinalOrder
.reserve(MultiNodeOps
.size());
308 Mode
.reserve(MultiNodeOps
.size());
310 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
312 for (auto &Operands
: MultiNodeOps
) {
313 FinalOrder
.push_back({Operands
.first
, {Operands
.second
[0]}});
314 if (cast
<VPInstruction
>(Operands
.second
[0])->getOpcode() ==
316 Mode
.push_back(OpMode::Load
);
318 Mode
.push_back(OpMode::Opcode
);
321 for (unsigned Lane
= 1, E
= MultiNodeOps
[0].second
.size(); Lane
< E
; ++Lane
) {
322 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane
<< "\n");
323 SmallPtrSet
<VPValue
*, 4> Candidates
;
324 LLVM_DEBUG(dbgs() << " Candidates ");
325 for (auto Ops
: MultiNodeOps
) {
327 dbgs() << *cast
<VPInstruction
>(Ops
.second
[Lane
])->getUnderlyingInstr()
329 Candidates
.insert(Ops
.second
[Lane
]);
331 LLVM_DEBUG(dbgs() << "\n");
333 for (unsigned Op
= 0, E
= MultiNodeOps
.size(); Op
< E
; ++Op
) {
334 LLVM_DEBUG(dbgs() << " Checking " << Op
<< "\n");
335 if (Mode
[Op
] == OpMode::Failed
)
338 VPValue
*Last
= FinalOrder
[Op
].second
[Lane
- 1];
339 std::pair
<OpMode
, VPValue
*> Res
=
340 getBest(Mode
[Op
], Last
, Candidates
, IAI
);
342 FinalOrder
[Op
].second
.push_back(Res
.second
);
344 // TODO: handle this case
345 FinalOrder
[Op
].second
.push_back(markFailed());
352 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
353 void VPlanSlp::dumpBundle(ArrayRef
<VPValue
*> Values
) {
355 for (auto Op
: Values
) {
356 if (auto *VPInstr
= cast_or_null
<VPInstruction
>(Op
))
357 if (auto *Instr
= VPInstr
->getUnderlyingInstr()) {
358 dbgs() << *Instr
<< " | ";
361 dbgs() << " nullptr | ";
367 VPInstruction
*VPlanSlp::buildGraph(ArrayRef
<VPValue
*> Values
) {
368 assert(!Values
.empty() && "Need some operands!");
370 // If we already visited this instruction bundle, re-use the existing node
371 auto I
= BundleToCombined
.find(to_vector
<4>(Values
));
372 if (I
!= BundleToCombined
.end()) {
374 // Check that the resulting graph is a tree. If we re-use a node, this means
375 // its values have multiple users. We only allow this, if all users of each
376 // value are the same instruction.
377 for (auto *V
: Values
) {
378 auto UI
= V
->user_begin();
379 auto *FirstUser
= *UI
++;
380 while (UI
!= V
->user_end()) {
381 assert(*UI
== FirstUser
&& "Currently we only support SLP trees.");
391 dbgs() << "buildGraph: ";
395 if (!areVectorizable(Values
))
398 assert(getOpcode(Values
) && "Opcodes for all values must match");
399 unsigned ValuesOpcode
= getOpcode(Values
).getValue();
401 SmallVector
<VPValue
*, 4> CombinedOperands
;
402 if (areCommutative(Values
)) {
403 bool MultiNodeRoot
= !MultiNodeActive
;
404 MultiNodeActive
= true;
405 for (auto &Operands
: getOperands(Values
)) {
407 dbgs() << " Visiting Commutative";
408 dumpBundle(Operands
);
411 auto OperandsOpcode
= getOpcode(Operands
);
412 if (OperandsOpcode
&& OperandsOpcode
== getOpcode(Values
)) {
413 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
414 CombinedOperands
.push_back(buildGraph(Operands
));
416 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
417 // Create dummy VPInstruction, which will we replace later by the
418 // re-ordered operand.
419 VPInstruction
*Op
= new VPInstruction(0, {});
420 CombinedOperands
.push_back(Op
);
421 MultiNodeOps
.emplace_back(Op
, Operands
);
426 LLVM_DEBUG(dbgs() << "Reorder \n");
427 MultiNodeActive
= false;
429 auto FinalOrder
= reorderMultiNodeOps();
431 MultiNodeOps
.clear();
432 for (auto &Ops
: FinalOrder
) {
433 VPInstruction
*NewOp
= buildGraph(Ops
.second
);
434 Ops
.first
->replaceAllUsesWith(NewOp
);
435 for (unsigned i
= 0; i
< CombinedOperands
.size(); i
++)
436 if (CombinedOperands
[i
] == Ops
.first
)
437 CombinedOperands
[i
] = NewOp
;
441 LLVM_DEBUG(dbgs() << "Found final order\n");
444 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
445 if (ValuesOpcode
== Instruction::Load
)
446 for (VPValue
*V
: Values
)
447 CombinedOperands
.push_back(cast
<VPInstruction
>(V
)->getOperand(0));
449 for (auto &Operands
: getOperands(Values
))
450 CombinedOperands
.push_back(buildGraph(Operands
));
454 switch (ValuesOpcode
) {
455 case Instruction::Load
:
456 Opcode
= VPInstruction::SLPLoad
;
458 case Instruction::Store
:
459 Opcode
= VPInstruction::SLPStore
;
462 Opcode
= ValuesOpcode
;
469 assert(CombinedOperands
.size() > 0 && "Need more some operands");
470 auto *VPI
= new VPInstruction(Opcode
, CombinedOperands
);
471 VPI
->setUnderlyingInstr(cast
<VPInstruction
>(Values
[0])->getUnderlyingInstr());
473 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI
<< " "
474 << *cast
<VPInstruction
>(Values
[0]) << "\n");
475 addCombined(Values
, VPI
);