1 //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 /// This file implements SLP analysis based on VPlan. The analysis is based on
9 /// the ideas described in
11 /// Look-ahead SLP: auto-vectorization in the presence of commutative
12 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
15 //===----------------------------------------------------------------------===//
18 #include "llvm/ADT/DepthFirstIterator.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/CFG.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/InstrTypes.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/IR/Value.h"
32 #include "llvm/Support/Casting.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/ErrorHandling.h"
35 #include "llvm/Support/GraphWriter.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #define DEBUG_TYPE "vplan-slp"
47 // Number of levels to look ahead when re-ordering multi node operands.
48 static unsigned LookaheadMaxDepth
= 5;
50 VPInstruction
*VPlanSlp::markFailed() {
51 // FIXME: Currently this is used to signal we hit instructions we cannot
53 CompletelySLP
= false;
57 void VPlanSlp::addCombined(ArrayRef
<VPValue
*> Operands
, VPInstruction
*New
) {
58 if (all_of(Operands
, [](VPValue
*V
) {
59 return cast
<VPInstruction
>(V
)->getUnderlyingInstr();
61 unsigned BundleSize
= 0;
62 for (VPValue
*V
: Operands
) {
63 Type
*T
= cast
<VPInstruction
>(V
)->getUnderlyingInstr()->getType();
64 assert(!T
->isVectorTy() && "Only scalar types supported for now");
65 BundleSize
+= T
->getScalarSizeInBits();
67 WidestBundleBits
= std::max(WidestBundleBits
, BundleSize
);
70 auto Res
= BundleToCombined
.try_emplace(to_vector
<4>(Operands
), New
);
72 "Already created a combined instruction for the operand bundle");
76 bool VPlanSlp::areVectorizable(ArrayRef
<VPValue
*> Operands
) const {
77 // Currently we only support VPInstructions.
78 if (!all_of(Operands
, [](VPValue
*Op
) {
79 return Op
&& isa
<VPInstruction
>(Op
) &&
80 cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
82 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
86 // Check if opcodes and type width agree for all instructions in the bundle.
87 // FIXME: Differing widths/opcodes can be handled by inserting additional
89 // FIXME: Deal with non-primitive types.
90 const Instruction
*OriginalInstr
=
91 cast
<VPInstruction
>(Operands
[0])->getUnderlyingInstr();
92 unsigned Opcode
= OriginalInstr
->getOpcode();
93 unsigned Width
= OriginalInstr
->getType()->getPrimitiveSizeInBits();
94 if (!all_of(Operands
, [Opcode
, Width
](VPValue
*Op
) {
95 const Instruction
*I
= cast
<VPInstruction
>(Op
)->getUnderlyingInstr();
96 return I
->getOpcode() == Opcode
&&
97 I
->getType()->getPrimitiveSizeInBits() == Width
;
99 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
103 // For now, all operands must be defined in the same BB.
104 if (any_of(Operands
, [this](VPValue
*Op
) {
105 return cast
<VPInstruction
>(Op
)->getParent() != &this->BB
;
107 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
112 [](VPValue
*Op
) { return Op
->hasMoreThanOneUniqueUser(); })) {
113 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
117 // For loads, check that there are no instructions writing to memory in
119 // TODO: we only have to forbid instructions writing to memory that could
120 // interfere with any of the loads in the bundle
121 if (Opcode
== Instruction::Load
) {
122 unsigned LoadsSeen
= 0;
123 VPBasicBlock
*Parent
= cast
<VPInstruction
>(Operands
[0])->getParent();
124 for (auto &I
: *Parent
) {
125 auto *VPI
= cast
<VPInstruction
>(&I
);
126 if (VPI
->getOpcode() == Instruction::Load
&&
127 std::find(Operands
.begin(), Operands
.end(), VPI
) != Operands
.end())
130 if (LoadsSeen
== Operands
.size())
132 if (LoadsSeen
> 0 && VPI
->mayWriteToMemory()) {
134 dbgs() << "VPSLP: instruction modifying memory between loads\n");
139 if (!all_of(Operands
, [](VPValue
*Op
) {
140 return cast
<LoadInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
143 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
148 if (Opcode
== Instruction::Store
)
149 if (!all_of(Operands
, [](VPValue
*Op
) {
150 return cast
<StoreInst
>(cast
<VPInstruction
>(Op
)->getUnderlyingInstr())
153 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
160 static SmallVector
<VPValue
*, 4> getOperands(ArrayRef
<VPValue
*> Values
,
161 unsigned OperandIndex
) {
162 SmallVector
<VPValue
*, 4> Operands
;
163 for (VPValue
*V
: Values
) {
164 auto *U
= cast
<VPUser
>(V
);
165 Operands
.push_back(U
->getOperand(OperandIndex
));
170 static bool areCommutative(ArrayRef
<VPValue
*> Values
) {
171 return Instruction::isCommutative(
172 cast
<VPInstruction
>(Values
[0])->getOpcode());
175 static SmallVector
<SmallVector
<VPValue
*, 4>, 4>
176 getOperands(ArrayRef
<VPValue
*> Values
) {
177 SmallVector
<SmallVector
<VPValue
*, 4>, 4> Result
;
178 auto *VPI
= cast
<VPInstruction
>(Values
[0]);
180 switch (VPI
->getOpcode()) {
181 case Instruction::Load
:
182 llvm_unreachable("Loads terminate a tree, no need to get operands");
183 case Instruction::Store
:
184 Result
.push_back(getOperands(Values
, 0));
187 for (unsigned I
= 0, NumOps
= VPI
->getNumOperands(); I
< NumOps
; ++I
)
188 Result
.push_back(getOperands(Values
, I
));
195 /// Returns the opcode of Values or ~0 if they do not all agree.
196 static Optional
<unsigned> getOpcode(ArrayRef
<VPValue
*> Values
) {
197 unsigned Opcode
= cast
<VPInstruction
>(Values
[0])->getOpcode();
198 if (any_of(Values
, [Opcode
](VPValue
*V
) {
199 return cast
<VPInstruction
>(V
)->getOpcode() != Opcode
;
205 /// Returns true if A and B access sequential memory if they are loads or
206 /// stores or if they have identical opcodes otherwise.
207 static bool areConsecutiveOrMatch(VPInstruction
*A
, VPInstruction
*B
,
208 VPInterleavedAccessInfo
&IAI
) {
209 if (A
->getOpcode() != B
->getOpcode())
212 if (A
->getOpcode() != Instruction::Load
&&
213 A
->getOpcode() != Instruction::Store
)
215 auto *GA
= IAI
.getInterleaveGroup(A
);
216 auto *GB
= IAI
.getInterleaveGroup(B
);
218 return GA
&& GB
&& GA
== GB
&& GA
->getIndex(A
) + 1 == GB
->getIndex(B
);
221 /// Implements getLAScore from Listing 7 in the paper.
222 /// Traverses and compares operands of V1 and V2 to MaxLevel.
223 static unsigned getLAScore(VPValue
*V1
, VPValue
*V2
, unsigned MaxLevel
,
224 VPInterleavedAccessInfo
&IAI
) {
225 if (!isa
<VPInstruction
>(V1
) || !isa
<VPInstruction
>(V2
))
229 return (unsigned)areConsecutiveOrMatch(cast
<VPInstruction
>(V1
),
230 cast
<VPInstruction
>(V2
), IAI
);
233 for (unsigned I
= 0, EV1
= cast
<VPUser
>(V1
)->getNumOperands(); I
< EV1
; ++I
)
234 for (unsigned J
= 0, EV2
= cast
<VPUser
>(V2
)->getNumOperands(); J
< EV2
; ++J
)
235 Score
+= getLAScore(cast
<VPUser
>(V1
)->getOperand(I
),
236 cast
<VPUser
>(V2
)->getOperand(J
), MaxLevel
- 1, IAI
);
240 std::pair
<VPlanSlp::OpMode
, VPValue
*>
241 VPlanSlp::getBest(OpMode Mode
, VPValue
*Last
,
242 SmallPtrSetImpl
<VPValue
*> &Candidates
,
243 VPInterleavedAccessInfo
&IAI
) {
244 assert((Mode
== OpMode::Load
|| Mode
== OpMode::Opcode
) &&
245 "Currently we only handle load and commutative opcodes");
246 LLVM_DEBUG(dbgs() << " getBest\n");
248 SmallVector
<VPValue
*, 4> BestCandidates
;
249 LLVM_DEBUG(dbgs() << " Candidates for "
250 << *cast
<VPInstruction
>(Last
)->getUnderlyingInstr() << " ");
251 for (auto *Candidate
: Candidates
) {
252 auto *LastI
= cast
<VPInstruction
>(Last
);
253 auto *CandidateI
= cast
<VPInstruction
>(Candidate
);
254 if (areConsecutiveOrMatch(LastI
, CandidateI
, IAI
)) {
255 LLVM_DEBUG(dbgs() << *cast
<VPInstruction
>(Candidate
)->getUnderlyingInstr()
257 BestCandidates
.push_back(Candidate
);
260 LLVM_DEBUG(dbgs() << "\n");
262 if (BestCandidates
.empty())
263 return {OpMode::Failed
, nullptr};
265 if (BestCandidates
.size() == 1)
266 return {Mode
, BestCandidates
[0]};
268 VPValue
*Best
= nullptr;
269 unsigned BestScore
= 0;
270 for (unsigned Depth
= 1; Depth
< LookaheadMaxDepth
; Depth
++) {
271 unsigned PrevScore
= ~0u;
274 // FIXME: Avoid visiting the same operands multiple times.
275 for (auto *Candidate
: BestCandidates
) {
276 unsigned Score
= getLAScore(Last
, Candidate
, Depth
, IAI
);
277 if (PrevScore
== ~0u)
279 if (PrevScore
!= Score
)
283 if (Score
> BestScore
) {
291 LLVM_DEBUG(dbgs() << "Found best "
292 << *cast
<VPInstruction
>(Best
)->getUnderlyingInstr()
294 Candidates
.erase(Best
);
299 SmallVector
<VPlanSlp::MultiNodeOpTy
, 4> VPlanSlp::reorderMultiNodeOps() {
300 SmallVector
<MultiNodeOpTy
, 4> FinalOrder
;
301 SmallVector
<OpMode
, 4> Mode
;
302 FinalOrder
.reserve(MultiNodeOps
.size());
303 Mode
.reserve(MultiNodeOps
.size());
305 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
307 for (auto &Operands
: MultiNodeOps
) {
308 FinalOrder
.push_back({Operands
.first
, {Operands
.second
[0]}});
309 if (cast
<VPInstruction
>(Operands
.second
[0])->getOpcode() ==
311 Mode
.push_back(OpMode::Load
);
313 Mode
.push_back(OpMode::Opcode
);
316 for (unsigned Lane
= 1, E
= MultiNodeOps
[0].second
.size(); Lane
< E
; ++Lane
) {
317 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane
<< "\n");
318 SmallPtrSet
<VPValue
*, 4> Candidates
;
319 LLVM_DEBUG(dbgs() << " Candidates ");
320 for (auto Ops
: MultiNodeOps
) {
322 dbgs() << *cast
<VPInstruction
>(Ops
.second
[Lane
])->getUnderlyingInstr()
324 Candidates
.insert(Ops
.second
[Lane
]);
326 LLVM_DEBUG(dbgs() << "\n");
328 for (unsigned Op
= 0, E
= MultiNodeOps
.size(); Op
< E
; ++Op
) {
329 LLVM_DEBUG(dbgs() << " Checking " << Op
<< "\n");
330 if (Mode
[Op
] == OpMode::Failed
)
333 VPValue
*Last
= FinalOrder
[Op
].second
[Lane
- 1];
334 std::pair
<OpMode
, VPValue
*> Res
=
335 getBest(Mode
[Op
], Last
, Candidates
, IAI
);
337 FinalOrder
[Op
].second
.push_back(Res
.second
);
339 // TODO: handle this case
340 FinalOrder
[Op
].second
.push_back(markFailed());
347 void VPlanSlp::dumpBundle(ArrayRef
<VPValue
*> Values
) {
349 for (auto Op
: Values
) {
350 if (auto *VPInstr
= cast_or_null
<VPInstruction
>(Op
))
351 if (auto *Instr
= VPInstr
->getUnderlyingInstr()) {
352 dbgs() << *Instr
<< " | ";
355 dbgs() << " nullptr | ";
360 VPInstruction
*VPlanSlp::buildGraph(ArrayRef
<VPValue
*> Values
) {
361 assert(!Values
.empty() && "Need some operands!");
363 // If we already visited this instruction bundle, re-use the existing node
364 auto I
= BundleToCombined
.find(to_vector
<4>(Values
));
365 if (I
!= BundleToCombined
.end()) {
367 // Check that the resulting graph is a tree. If we re-use a node, this means
368 // its values have multiple users. We only allow this, if all users of each
369 // value are the same instruction.
370 for (auto *V
: Values
) {
371 auto UI
= V
->user_begin();
372 auto *FirstUser
= *UI
++;
373 while (UI
!= V
->user_end()) {
374 assert(*UI
== FirstUser
&& "Currently we only support SLP trees.");
384 dbgs() << "buildGraph: ";
388 if (!areVectorizable(Values
))
391 assert(getOpcode(Values
) && "Opcodes for all values must match");
392 unsigned ValuesOpcode
= getOpcode(Values
).getValue();
394 SmallVector
<VPValue
*, 4> CombinedOperands
;
395 if (areCommutative(Values
)) {
396 bool MultiNodeRoot
= !MultiNodeActive
;
397 MultiNodeActive
= true;
398 for (auto &Operands
: getOperands(Values
)) {
400 dbgs() << " Visiting Commutative";
401 dumpBundle(Operands
);
404 auto OperandsOpcode
= getOpcode(Operands
);
405 if (OperandsOpcode
&& OperandsOpcode
== getOpcode(Values
)) {
406 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
407 CombinedOperands
.push_back(buildGraph(Operands
));
409 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
410 // Create dummy VPInstruction, which will we replace later by the
411 // re-ordered operand.
412 VPInstruction
*Op
= new VPInstruction(0, {});
413 CombinedOperands
.push_back(Op
);
414 MultiNodeOps
.emplace_back(Op
, Operands
);
419 LLVM_DEBUG(dbgs() << "Reorder \n");
420 MultiNodeActive
= false;
422 auto FinalOrder
= reorderMultiNodeOps();
424 MultiNodeOps
.clear();
425 for (auto &Ops
: FinalOrder
) {
426 VPInstruction
*NewOp
= buildGraph(Ops
.second
);
427 Ops
.first
->replaceAllUsesWith(NewOp
);
428 for (unsigned i
= 0; i
< CombinedOperands
.size(); i
++)
429 if (CombinedOperands
[i
] == Ops
.first
)
430 CombinedOperands
[i
] = NewOp
;
434 LLVM_DEBUG(dbgs() << "Found final order\n");
437 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
438 if (ValuesOpcode
== Instruction::Load
)
439 for (VPValue
*V
: Values
)
440 CombinedOperands
.push_back(cast
<VPInstruction
>(V
)->getOperand(0));
442 for (auto &Operands
: getOperands(Values
))
443 CombinedOperands
.push_back(buildGraph(Operands
));
447 switch (ValuesOpcode
) {
448 case Instruction::Load
:
449 Opcode
= VPInstruction::SLPLoad
;
451 case Instruction::Store
:
452 Opcode
= VPInstruction::SLPStore
;
455 Opcode
= ValuesOpcode
;
462 assert(CombinedOperands
.size() > 0 && "Need more some operands");
463 auto *VPI
= new VPInstruction(Opcode
, CombinedOperands
);
464 VPI
->setUnderlyingInstr(cast
<VPInstruction
>(Values
[0])->getUnderlyingInstr());
466 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI
->print(dbgs());
467 cast
<VPInstruction
>(Values
[0])->print(dbgs()); dbgs() << "\n");
468 addCombined(Values
, VPI
);