1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
11 /// Look-ahead SLP: auto-vectorization in the presence of commutative
12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, Luís F. W. Góes
15 //===----------------------------------------------------------------------===//
18 #include "VPlanValue.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/Analysis/VectorUtils.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/ErrorHandling.h"
29 #include "llvm/Support/raw_ostream.h"
37 #define DEBUG_TYPE "vplan-slp"
39 // Number of levels to look ahead when re-ordering multi node operands.
// Bound on the recursion depth of the look-ahead scoring (getLAScore) used
// by getBest() when ranking candidate operands for a multi-node.
40 static unsigned LookaheadMaxDepth
= 5;
// Record that SLP graph construction hit something it cannot handle by
// clearing the CompletelySLP flag; used as a sentinel result for a bundle.
// NOTE(review): the embedded line numbers jump (43 -> 45) — the rest of the
// FIXME comment and this function's return statement/closing brace appear to
// be missing from this extraction; confirm against the upstream file.
42 VPInstruction
*VPlanSlp::markFailed() {
43 // FIXME: Currently this is used to signal we hit instructions we cannot
45 CompletelySLP
= false;
// Register New as the combined VPInstruction for the operand bundle
// Operands, and track the widest bundle (in bits) seen so far.
// NOTE(review): embedded line numbers jump (51->53, 57->59, 62->64) — the
// all_of guard's tail, loop closing braces, and the assert over Res appear
// to be truncated in this extraction; confirm against the upstream file.
49 void VPlanSlp::addCombined(ArrayRef
<VPValue
*> Operands
, VPInstruction
*New
) {
// Only bundles whose members all wrap an underlying IR instruction are
// recorded (the cast documents that only VPInstructions are expected here).
50 if (all_of(Operands
, [](VPValue
*V
) {
51 return cast
<VPInstruction
>(V
)->getUnderlyingInstr();
// Sum the scalar bit-widths of all bundle members to get the bundle size.
53 unsigned BundleSize
= 0;
54 for (VPValue
*V
: Operands
) {
55 Type
*T
= cast
<VPInstruction
>(V
)->getUnderlyingInstr()->getType();
56 assert(!T
->isVectorTy() && "Only scalar types supported for now");
57 BundleSize
+= T
->getScalarSizeInBits();
// Remember the widest bundle encountered so far (member state).
59 WidestBundleBits
= std::max(WidestBundleBits
, BundleSize
);
// Cache the bundle -> combined-instruction mapping for re-use in buildGraph.
62 auto Res
= BundleToCombined
.try_emplace(to_vector
<4>(Operands
), New
);
64 "Already created a combined instruction for the operand bundle");
// Check whether the bundle Operands can be SLP-vectorized together:
// all members must be VPInstructions with underlying IR instructions,
// agree in opcode and primitive type width, live in the same basic block,
// have a single unique user, and (for loads/stores) be simple with no
// intervening memory writes between the loads.
// NOTE(review): embedded line numbers jump throughout (72->74, 74->78,
// 89->91, 97->99, 99->104, ...) — the early "return false" paths, debug
// braces, and the final "return true" appear truncated in this extraction;
// confirm against the upstream file.
68 bool VPlanSlp::areVectorizable(ArrayRef
<VPValue
*> Operands
) const {
69 // Currently we only support VPInstructions.
70 if (!all_of(Operands
, [](VPValue
*Op
) {
71 return Op
&& isa
<VPInstruction
>(Op
) &&
72 cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
74 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
78 // Check if opcodes and type width agree for all instructions in the bundle.
79 // FIXME: Differing widths/opcodes can be handled by inserting additional
81 // FIXME: Deal with non-primitive types.
82 const Instruction
*OriginalInstr
=
83 cast
<VPInstruction
>(Operands
[0])->getUnderlyingInstr();
84 unsigned Opcode
= OriginalInstr
->getOpcode();
85 unsigned Width
= OriginalInstr
->getType()->getPrimitiveSizeInBits();
// Every member must match the first member's opcode and bit width.
86 if (!all_of(Operands
, [Opcode
, Width
](VPValue
*Op
) {
87 const Instruction
*I
= cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
88 return I
->getOpcode() == Opcode
&&
89 I
->getType()->getPrimitiveSizeInBits() == Width
;
91 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
95 // For now, all operands must be defined in the same BB.
96 if (any_of(Operands
, [this](VPValue
*Op
) {
97 return cast
<VPInstruction
>(Op
)->getParent() != &this->BB
;
99 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
// Bail out if any member has more than one unique user — the SLP graph must
// remain a tree.
104 [](VPValue
*Op
) { return Op
->hasMoreThanOneUniqueUser(); })) {
105 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
109 // For loads, check that there are no instructions writing to memory in
111 // TODO: we only have to forbid instructions writing to memory that could
112 // interfere with any of the loads in the bundle
113 if (Opcode
== Instruction::Load
) {
// Walk the parent block: count bundle loads as we see them, and reject if a
// memory write occurs after the first bundle load but before the last one.
114 unsigned LoadsSeen
= 0;
115 VPBasicBlock
*Parent
= cast
<VPInstruction
>(Operands
[0])->getParent();
116 for (auto &I
: *Parent
) {
117 auto *VPI
= dyn_cast
<VPInstruction
>(&I
);
120 if (VPI
->getOpcode() == Instruction::Load
&&
121 llvm::is_contained(Operands
, VPI
))
124 if (LoadsSeen
== Operands
.size())
126 if (LoadsSeen
> 0 && VPI
->mayWriteToMemory()) {
128 dbgs() << "VPSLP: instruction modifying memory between loads\n");
// All loads in the bundle must be simple (presumably LoadInst::isSimple() —
// the predicate tail is truncated here; confirm upstream).
133 if (!all_of(Operands
, [](VPValue
*Op
) {
134 return cast
<LoadInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
137 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
// Likewise, all stores must be simple.
142 if (Opcode
== Instruction::Store
)
143 if (!all_of(Operands
, [](VPValue
*Op
) {
144 return cast
<StoreInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
147 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
// Collect, for each value in Values, its operand at OperandIndex — i.e.
// extract one "lane-wise" operand bundle from a bundle of instructions.
// NOTE(review): the trailing "return Operands;" and closing braces appear
// to be missing from this extraction; confirm against the upstream file.
154 static SmallVector
<VPValue
*, 4> getOperands(ArrayRef
<VPValue
*> Values
,
155 unsigned OperandIndex
) {
156 SmallVector
<VPValue
*, 4> Operands
;
157 for (VPValue
*V
: Values
) {
158 // Currently we only support VPInstructions.
159 auto *U
= cast
<VPInstruction
>(V
);
160 Operands
.push_back(U
->getOperand(OperandIndex
));
// Returns true if the (shared) opcode of the bundle is commutative; only
// the first member is queried, since callers have already checked that all
// members agree on the opcode.
// NOTE(review): the function's closing brace appears to be missing from
// this extraction; confirm against the upstream file.
165 static bool areCommutative(ArrayRef
<VPValue
*> Values
) {
166 return Instruction::isCommutative(
167 cast
<VPInstruction
>(Values
[0])->getOpcode());
// Collect all operand bundles of Values: for stores, only operand 0 (the
// stored value) matters; loads are leaves and must not reach here; any
// other opcode contributes one bundle per operand index.
// NOTE(review): embedded line numbers jump (173->175, 179->182, and the
// tail after 183) — case "break"s, the default label, and the trailing
// "return Result;" appear truncated in this extraction; confirm upstream.
170 static SmallVector
<SmallVector
<VPValue
*, 4>, 4>
171 getOperands(ArrayRef
<VPValue
*> Values
) {
172 SmallVector
<SmallVector
<VPValue
*, 4>, 4> Result
;
173 auto *VPI
= cast
<VPInstruction
>(Values
[0]);
175 switch (VPI
->getOpcode()) {
176 case Instruction::Load
:
177 llvm_unreachable("Loads terminate a tree, no need to get operands");
178 case Instruction::Store
:
179 Result
.push_back(getOperands(Values
, 0));
// General case: one lane-wise bundle per operand position.
182 for (unsigned I
= 0, NumOps
= VPI
->getNumOperands(); I
< NumOps
; ++I
)
183 Result
.push_back(getOperands(Values
, I
));
// NOTE(review): despite the "~0" wording in the original comment below, the
// signature returns std::optional<unsigned>; the success/failure return
// statements after the any_of appear to be missing from this extraction —
// confirm against the upstream file.
190 /// Returns the opcode of Values or ~0 if they do not all agree.
191 static std::optional
<unsigned> getOpcode(ArrayRef
<VPValue
*> Values
) {
// Take the first member's opcode as the reference and verify all others
// match it.
192 unsigned Opcode
= cast
<VPInstruction
>(Values
[0])->getOpcode();
193 if (any_of(Values
, [Opcode
](VPValue
*V
) {
194 return cast
<VPInstruction
>(V
)->getOpcode() != Opcode
;
// NOTE(review): embedded line numbers jump (204->207, 208->210) — the early
// return after the opcode mismatch and the return for non-memory opcodes
// appear to be missing from this extraction; confirm against upstream.
200 /// Returns true if A and B access sequential memory if they are loads or
201 /// stores or if they have identical opcodes otherwise.
202 static bool areConsecutiveOrMatch(VPInstruction
*A
, VPInstruction
*B
,
203 VPInterleavedAccessInfo
&IAI
) {
204 if (A
->getOpcode() != B
->getOpcode())
// Non-memory instructions with equal opcodes "match"; only loads/stores
// need the interleave-group adjacency check below.
207 if (A
->getOpcode() != Instruction::Load
&&
208 A
->getOpcode() != Instruction::Store
)
// A and B are consecutive iff they belong to the same interleave group and
// B's index immediately follows A's.
210 auto *GA
= IAI
.getInterleaveGroup(A
);
211 auto *GB
= IAI
.getInterleaveGroup(B
);
213 return GA
&& GB
&& GA
== GB
&& GA
->getIndex(A
) + 1 == GB
->getIndex(B
);
// NOTE(review): embedded line numbers jump (222->227, 227->230, 231->233)
// — the non-VPInstruction bail-out, the MaxLevel==0 base case, the score
// accumulator declaration, and the final return appear to be missing from
// this extraction; confirm against the upstream file.
216 /// Implements getLAScore from Listing 7 in the paper.
217 /// Traverses and compares operands of V1 and V2 to MaxLevel.
218 static unsigned getLAScore(VPValue
*V1
, VPValue
*V2
, unsigned MaxLevel
,
219 VPInterleavedAccessInfo
&IAI
) {
220 auto *I1
= dyn_cast
<VPInstruction
>(V1
);
221 auto *I2
= dyn_cast
<VPInstruction
>(V2
);
222 // Currently we only support VPInstructions.
// Base case: score is 1 if the pair is consecutive-or-matching, else 0.
227 return (unsigned)areConsecutiveOrMatch(I1
, I2
, IAI
);
// Recursive case: compare every operand of I1 against every operand of I2
// one level deeper (cross product, per Listing 7 of the paper).
230 for (unsigned I
= 0, EV1
= I1
->getNumOperands(); I
< EV1
; ++I
)
231 for (unsigned J
= 0, EV2
= I2
->getNumOperands(); J
< EV2
; ++J
)
233 getLAScore(I1
->getOperand(I
), I2
->getOperand(J
), MaxLevel
- 1, IAI
);
// Pick the best candidate to follow Last in a multi-node lane: first filter
// Candidates down to those consecutive-or-matching with Last, then (if more
// than one remains) break ties with look-ahead scoring at increasing depth
// up to LookaheadMaxDepth. The chosen value is removed from Candidates.
// NOTE(review): embedded line numbers jump in several places (252->254,
// 268->271, 274->276, 276->280, 280->288, 289->291) — the PrevScore
// bookkeeping, the Best/BestScore updates, the tie-break loop exit, and the
// final return appear to be truncated in this extraction; confirm upstream.
237 std::pair
<VPlanSlp::OpMode
, VPValue
*>
238 VPlanSlp::getBest(OpMode Mode
, VPValue
*Last
,
239 SmallPtrSetImpl
<VPValue
*> &Candidates
,
240 VPInterleavedAccessInfo
&IAI
) {
241 assert((Mode
== OpMode::Load
|| Mode
== OpMode::Opcode
) &&
242 "Currently we only handle load and commutative opcodes");
243 LLVM_DEBUG(dbgs() << " getBest\n");
// First pass: keep only candidates that are consecutive with / match Last.
245 SmallVector
<VPValue
*, 4> BestCandidates
;
246 LLVM_DEBUG(dbgs() << " Candidates for "
247 << *cast
<VPInstruction
>(Last
)->getUnderlyingInstr() << " ");
248 for (auto *Candidate
: Candidates
) {
249 auto *LastI
= cast
<VPInstruction
>(Last
);
250 auto *CandidateI
= cast
<VPInstruction
>(Candidate
);
251 if (areConsecutiveOrMatch(LastI
, CandidateI
, IAI
)) {
252 LLVM_DEBUG(dbgs() << *cast
<VPInstruction
>(Candidate
)->getUnderlyingInstr()
254 BestCandidates
.push_back(Candidate
);
257 LLVM_DEBUG(dbgs() << "\n");
// No survivor: signal failure for this lane.
259 if (BestCandidates
.empty())
260 return {OpMode::Failed
, nullptr};
// Exactly one survivor: no tie-break needed.
262 if (BestCandidates
.size() == 1)
263 return {Mode
, BestCandidates
[0]};
// Tie-break with look-ahead scores, deepening until scores differ or the
// depth limit is reached.
265 VPValue
*Best
= nullptr;
266 unsigned BestScore
= 0;
267 for (unsigned Depth
= 1; Depth
< LookaheadMaxDepth
; Depth
++) {
268 unsigned PrevScore
= ~0u;
271 // FIXME: Avoid visiting the same operands multiple times.
272 for (auto *Candidate
: BestCandidates
) {
273 unsigned Score
= getLAScore(Last
, Candidate
, Depth
, IAI
);
274 if (PrevScore
== ~0u)
276 if (PrevScore
!= Score
)
280 if (Score
> BestScore
) {
288 LLVM_DEBUG(dbgs() << "Found best "
289 << *cast
<VPInstruction
>(Best
)->getUnderlyingInstr()
// The winner is consumed so later lanes/operand slots cannot reuse it.
291 Candidates
.erase(Best
);
// Re-order the operand bundles collected in MultiNodeOps so that, lane by
// lane, each operand slot is extended with the best-matching candidate
// (per getBest). Lane 0 seeds each slot; subsequent lanes are chosen
// greedily relative to the previous lane's pick.
// NOTE(review): embedded line numbers jump (306->308, 317->319, 332->334,
// 334->336/337, and the tail after 337) — the load-opcode comparison's RHS,
// mode updates from Res.first, loop closing braces, and the final
// "return FinalOrder" appear truncated in this extraction; confirm upstream.
296 SmallVector
<VPlanSlp::MultiNodeOpTy
, 4> VPlanSlp::reorderMultiNodeOps() {
297 SmallVector
<MultiNodeOpTy
, 4> FinalOrder
;
298 SmallVector
<OpMode
, 4> Mode
;
299 FinalOrder
.reserve(MultiNodeOps
.size());
300 Mode
.reserve(MultiNodeOps
.size());
302 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
// Seed every operand slot with its lane-0 value and record whether the slot
// should be matched in Load mode or general Opcode mode.
304 for (auto &Operands
: MultiNodeOps
) {
305 FinalOrder
.push_back({Operands
.first
, {Operands
.second
[0]}});
306 if (cast
<VPInstruction
>(Operands
.second
[0])->getOpcode() ==
308 Mode
.push_back(OpMode::Load
);
310 Mode
.push_back(OpMode::Opcode
);
// For each remaining lane, gather that lane's values from all slots as the
// candidate pool, then let each slot pick its best successor.
313 for (unsigned Lane
= 1, E
= MultiNodeOps
[0].second
.size(); Lane
< E
; ++Lane
) {
314 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane
<< "\n");
315 SmallPtrSet
<VPValue
*, 4> Candidates
;
316 LLVM_DEBUG(dbgs() << " Candidates ");
317 for (auto Ops
: MultiNodeOps
) {
319 dbgs() << *cast
<VPInstruction
>(Ops
.second
[Lane
])->getUnderlyingInstr()
321 Candidates
.insert(Ops
.second
[Lane
]);
323 LLVM_DEBUG(dbgs() << "\n");
325 for (unsigned Op
= 0, E
= MultiNodeOps
.size(); Op
< E
; ++Op
) {
326 LLVM_DEBUG(dbgs() << " Checking " << Op
<< "\n");
// Slots that already failed stay failed (markFailed placeholder below).
327 if (Mode
[Op
] == OpMode::Failed
)
// Match against this slot's pick from the previous lane.
330 VPValue
*Last
= FinalOrder
[Op
].second
[Lane
- 1];
331 std::pair
<OpMode
, VPValue
*> Res
=
332 getBest(Mode
[Op
], Last
, Candidates
, IAI
);
334 FinalOrder
[Op
].second
.push_back(Res
.second
);
336 // TODO: handle this case
337 FinalOrder
[Op
].second
.push_back(markFailed());
// Debug-only pretty-printer: prints each bundle member's underlying IR
// instruction (or "nullptr") separated by " | " to dbgs().
// NOTE(review): embedded line numbers jump (345->347, 350->353, and the
// tail after 353) — the else branches, trailing newline print, closing
// braces, and the matching #endif are not visible in this extraction;
// confirm against the upstream file.
344 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
345 void VPlanSlp::dumpBundle(ArrayRef
<VPValue
*> Values
) {
347 for (auto *Op
: Values
) {
348 if (auto *VPInstr
= cast_or_null
<VPInstruction
>(Op
))
349 if (auto *Instr
= VPInstr
->getUnderlyingInstr()) {
350 dbgs() << *Instr
<< " | ";
353 dbgs() << " nullptr | ";
// Recursively build the SLP graph for the bundle Values: re-use a cached
// combined node when one exists (enforcing tree shape), verify the bundle
// is vectorizable, recurse into operands — handling commutative opcodes via
// multi-node collection and reordering — and finally create and register a
// combined VPInstruction (SLPLoad/SLPStore for memory ops).
// NOTE(review): embedded line numbers jump in many places (364->366,
// 373->383, 383->387, 387->390, 413->418, 429->433, 433->436, 439->441,
// 442->446, 448->450, 451->454, 454->461, 463->465, and after 467) — early
// returns, closing braces, the Opcode declaration before the switch, case
// break statements, and the final return of VPI appear truncated in this
// extraction; confirm against the upstream file.
359 VPInstruction
*VPlanSlp::buildGraph(ArrayRef
<VPValue
*> Values
) {
360 assert(!Values
.empty() && "Need some operands!");
362 // If we already visited this instruction bundle, re-use the existing node
363 auto I
= BundleToCombined
.find(to_vector
<4>(Values
));
364 if (I
!= BundleToCombined
.end()) {
366 // Check that the resulting graph is a tree. If we re-use a node, this means
367 // its values have multiple users. We only allow this, if all users of each
368 // value are the same instruction.
369 for (auto *V
: Values
) {
370 auto UI
= V
->user_begin();
371 auto *FirstUser
= *UI
++;
372 while (UI
!= V
->user_end()) {
373 assert(*UI
== FirstUser
&& "Currently we only support SLP trees.");
383 dbgs() << "buildGraph: ";
// Bail out (via markFailed, presumably — the return path is truncated here)
// if the bundle cannot be vectorized.
387 if (!areVectorizable(Values
))
390 assert(getOpcode(Values
) && "Opcodes for all values must match");
391 unsigned ValuesOpcode
= *getOpcode(Values
);
393 SmallVector
<VPValue
*, 4> CombinedOperands
;
394 if (areCommutative(Values
)) {
// Commutative case: grow a multi-node. The root of the multi-node is the
// outermost commutative bundle (MultiNodeActive was false on entry).
395 bool MultiNodeRoot
= !MultiNodeActive
;
396 MultiNodeActive
= true;
397 for (auto &Operands
: getOperands(Values
)) {
399 dbgs() << " Visiting Commutative";
400 dumpBundle(Operands
);
403 auto OperandsOpcode
= getOpcode(Operands
);
404 if (OperandsOpcode
&& OperandsOpcode
== getOpcode(Values
)) {
405 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
406 CombinedOperands
.push_back(buildGraph(Operands
));
408 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
409 // Create dummy VPInstruction, which will we replace later by the
410 // re-ordered operand.
411 VPInstruction
*Op
= new VPInstruction(0, {});
412 CombinedOperands
.push_back(Op
);
413 MultiNodeOps
.emplace_back(Op
, Operands
);
// At the multi-node root: reorder the collected operand bundles, build
// graphs for the reordered bundles, and patch the dummy placeholders.
418 LLVM_DEBUG(dbgs() << "Reorder \n");
419 MultiNodeActive
= false;
421 auto FinalOrder
= reorderMultiNodeOps();
423 MultiNodeOps
.clear();
424 for (auto &Ops
: FinalOrder
) {
425 VPInstruction
*NewOp
= buildGraph(Ops
.second
);
426 Ops
.first
->replaceAllUsesWith(NewOp
);
427 for (unsigned i
= 0; i
< CombinedOperands
.size(); i
++)
428 if (CombinedOperands
[i
] == Ops
.first
)
429 CombinedOperands
[i
] = NewOp
;
433 LLVM_DEBUG(dbgs() << "Found final order\n");
// Non-commutative case: loads contribute their address operand directly;
// everything else recurses into each operand bundle.
436 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
437 if (ValuesOpcode
== Instruction::Load
)
438 for (VPValue
*V
: Values
)
439 CombinedOperands
.push_back(cast
<VPInstruction
>(V
)->getOperand(0));
441 for (auto &Operands
: getOperands(Values
))
442 CombinedOperands
.push_back(buildGraph(Operands
));
// Map IR memory opcodes onto the dedicated SLP VPInstruction opcodes.
446 switch (ValuesOpcode
) {
447 case Instruction::Load
:
448 Opcode
= VPInstruction::SLPLoad
;
450 case Instruction::Store
:
451 Opcode
= VPInstruction::SLPStore
;
454 Opcode
= ValuesOpcode
;
461 assert(CombinedOperands
.size() > 0 && "Need more some operands");
// Build the combined node, carrying over the first member's debug location,
// and register it for this bundle.
462 auto *Inst
= cast
<VPInstruction
>(Values
[0])->getUnderlyingInstr();
463 auto *VPI
= new VPInstruction(Opcode
, CombinedOperands
, Inst
->getDebugLoc());
465 LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI
<< " "
466 << *cast
<VPInstruction
>(Values
[0]) << "\n");
467 addCombined(Values
, VPI
);