1 //===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 #include "HexagonISelDAGToDAG.h"
11 #include "HexagonISelLowering.h"
12 #include "HexagonTargetMachine.h"
13 #include "llvm/ADT/SetVector.h"
14 #include "llvm/CodeGen/MachineInstrBuilder.h"
15 #include "llvm/CodeGen/SelectionDAGISel.h"
16 #include "llvm/IR/Intrinsics.h"
17 #include "llvm/IR/IntrinsicsHexagon.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
27 #define DEBUG_TYPE "hexagon-isel"
33 // --------------------------------------------------------------------
34 // Implementation of permutation networks.
36 // Implementation of the node routing through butterfly networks:
42 // Forward delta network consists of log(N) steps, where N is the number
43 // of inputs. In each step, an input can stay in place, or it can get
44 // routed to another position[1]. The step after that consists of two
45 // networks, each half in size in terms of the number of nodes. In those
46 // terms, in the given step, an input can go to either the upper or the
47 // lower network in the next step.
49 // [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
50 // positions as long as there is no conflict.
52 // Here's a delta network for 8 inputs, only the switching routes are
56 // |- 1 ---------------|- 2 -----|- 3 -|
58 // Inp[0] *** *** *** *** Out[0]
62 // Inp[1] *** \ / *** X *** *** Out[1]
66 // Inp[2] *** \ \ / / *** X *** *** Out[2]
70 // Inp[3] *** \ / \ / \ / *** *** *** Out[3]
76 // Inp[4] *** / \ / \ / \ *** *** *** Out[4]
80 // Inp[5] *** / / \ \ *** X *** *** Out[5]
84 // Inp[6] *** / \ *** X *** *** Out[6]
88 // Inp[7] *** *** *** *** Out[7]
91 // Reverse delta network is same as delta network, with the steps in
92 // the opposite order.
95 // Benes network is a forward delta network immediately followed by
96 // a reverse delta network.
98 enum class ColorKind
{ None
, Red
, Black
};
100 // Graph coloring utility used to partition nodes into two groups:
101 // they will correspond to nodes routed to the upper and lower networks.
104 using MapType
= std::map
<Node
, ColorKind
>;
105 static constexpr Node Ignore
= Node(-1);
107 Coloring(ArrayRef
<Node
> Ord
) : Order(Ord
) {
113 const MapType
&colors() const {
117 ColorKind
other(ColorKind Color
) {
118 if (Color
== ColorKind::None
)
119 return ColorKind::Red
;
120 return Color
== ColorKind::Red
? ColorKind::Black
: ColorKind::Red
;
123 LLVM_DUMP_METHOD
void dump() const;
126 ArrayRef
<Node
> Order
;
128 std::set
<Node
> Needed
;
130 using NodeSet
= std::set
<Node
>;
131 std::map
<Node
,NodeSet
> Edges
;
133 Node
conj(Node Pos
) {
134 Node Num
= Order
.size();
135 return (Pos
< Num
/2) ? Pos
+ Num
/2 : Pos
- Num
/2;
138 ColorKind
getColor(Node N
) {
139 auto F
= Colors
.find(N
);
140 return F
!= Colors
.end() ? F
->second
: ColorKind::None
;
143 std::pair
<bool, ColorKind
> getUniqueColor(const NodeSet
&Nodes
);
150 std::pair
<bool, ColorKind
> Coloring::getUniqueColor(const NodeSet
&Nodes
) {
151 auto Color
= ColorKind::None
;
152 for (Node N
: Nodes
) {
153 ColorKind ColorN
= getColor(N
);
154 if (ColorN
== ColorKind::None
)
156 if (Color
== ColorKind::None
)
158 else if (Color
!= ColorKind::None
&& Color
!= ColorN
)
159 return { false, ColorKind::None
};
161 return { true, Color
};
164 void Coloring::build() {
165 // Add Order[P] and Order[conj(P)] to Edges.
166 for (unsigned P
= 0; P
!= Order
.size(); ++P
) {
170 Node PC
= Order
[conj(P
)];
171 if (PC
!= Ignore
&& PC
!= I
)
175 // Add I and conj(I) to Edges.
176 for (unsigned I
= 0; I
!= Order
.size(); ++I
) {
177 if (!Needed
.count(I
))
180 // This will create an entry in the edge table, even if I is not
181 // connected to any other node. This is necessary, because it still
182 // needs to be colored.
183 NodeSet
&Is
= Edges
[I
];
189 bool Coloring::color() {
190 SetVector
<Node
> FirstQ
;
191 auto Enqueue
= [this,&FirstQ
] (Node N
) {
194 for (unsigned I
= 0; I
!= Q
.size(); ++I
) {
195 NodeSet
&Ns
= Edges
[Q
[I
]];
196 Q
.insert(Ns
.begin(), Ns
.end());
198 FirstQ
.insert(Q
.begin(), Q
.end());
200 for (Node N
: Needed
)
203 for (Node N
: FirstQ
) {
206 NodeSet
&Ns
= Edges
[N
];
207 auto P
= getUniqueColor(Ns
);
210 Colors
[N
] = other(P
.second
);
213 // First, color nodes that don't have any dups.
214 for (auto E
: Edges
) {
216 if (!Needed
.count(conj(N
)) || Colors
.count(N
))
218 auto P
= getUniqueColor(E
.second
);
221 Colors
[N
] = other(P
.second
);
224 // Now, nodes that are still uncolored. Since the graph can be modified
225 // in this step, create a work queue.
226 std::vector
<Node
> WorkQ
;
227 for (auto E
: Edges
) {
229 if (!Colors
.count(N
))
233 for (unsigned I
= 0; I
< WorkQ
.size(); ++I
) {
235 NodeSet
&Ns
= Edges
[N
];
236 auto P
= getUniqueColor(Ns
);
238 Colors
[N
] = other(P
.second
);
242 // Coloring failed. Split this node.
244 ColorKind ColorN
= other(ColorKind::None
);
245 ColorKind ColorC
= other(ColorN
);
246 NodeSet
&Cs
= Edges
[C
];
248 for (Node M
: CopyNs
) {
249 ColorKind ColorM
= getColor(M
);
250 if (ColorM
== ColorC
) {
251 // Connect M with C, disconnect M from N.
262 // Explicitly assign "None" to all uncolored nodes.
263 for (unsigned I
= 0; I
!= Order
.size(); ++I
)
264 if (Colors
.count(I
) == 0)
265 Colors
[I
] = ColorKind::None
;
270 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
271 void Coloring::dump() const {
272 dbgs() << "{ Order: {";
273 for (unsigned I
= 0; I
!= Order
.size(); ++I
) {
281 dbgs() << " Needed: {";
282 for (Node N
: Needed
)
286 dbgs() << " Edges: {\n";
287 for (auto E
: Edges
) {
288 dbgs() << " " << E
.first
<< " -> {";
289 for (auto N
: E
.second
)
295 auto ColorKindToName
= [](ColorKind C
) {
297 case ColorKind::None
:
301 case ColorKind::Black
:
304 llvm_unreachable("all ColorKinds should be handled by the switch above");
307 dbgs() << " Colors: {\n";
308 for (auto C
: Colors
)
309 dbgs() << " " << C
.first
<< " -> " << ColorKindToName(C
.second
) << "\n";
315 // Base class of for reordering networks. They don't strictly need to be
316 // permutations, as outputs with repeated occurrences of an input element
319 using Controls
= std::vector
<uint8_t>;
320 using ElemType
= int;
321 static constexpr ElemType Ignore
= ElemType(-1);
333 PermNetwork(ArrayRef
<ElemType
> Ord
, unsigned Mult
= 1) {
334 Order
.assign(Ord
.data(), Ord
.data()+Ord
.size());
337 unsigned S
= Order
.size();
341 Table
.resize(Order
.size());
342 for (RowType
&Row
: Table
)
343 Row
.resize(Mult
*Log
, None
);
346 void getControls(Controls
&V
, unsigned StartAt
, uint8_t Dir
) const {
347 unsigned Size
= Order
.size();
349 for (unsigned I
= 0; I
!= Size
; ++I
) {
351 for (unsigned L
= 0; L
!= Log
; ++L
) {
352 unsigned C
= ctl(I
, StartAt
+L
) == Switch
;
358 assert(isUInt
<8>(W
));
363 uint8_t ctl(ElemType Pos
, unsigned Step
) const {
364 return Table
[Pos
][Step
];
366 unsigned size() const {
369 unsigned steps() const {
375 std::vector
<ElemType
> Order
;
376 using RowType
= std::vector
<uint8_t>;
377 std::vector
<RowType
> Table
;
380 struct ForwardDeltaNetwork
: public PermNetwork
{
381 ForwardDeltaNetwork(ArrayRef
<ElemType
> Ord
) : PermNetwork(Ord
) {}
383 bool run(Controls
&V
) {
384 if (!route(Order
.data(), Table
.data(), size(), 0))
386 getControls(V
, 0, Forward
);
391 bool route(ElemType
*P
, RowType
*T
, unsigned Size
, unsigned Step
);
394 struct ReverseDeltaNetwork
: public PermNetwork
{
395 ReverseDeltaNetwork(ArrayRef
<ElemType
> Ord
) : PermNetwork(Ord
) {}
397 bool run(Controls
&V
) {
398 if (!route(Order
.data(), Table
.data(), size(), 0))
400 getControls(V
, 0, Reverse
);
405 bool route(ElemType
*P
, RowType
*T
, unsigned Size
, unsigned Step
);
408 struct BenesNetwork
: public PermNetwork
{
409 BenesNetwork(ArrayRef
<ElemType
> Ord
) : PermNetwork(Ord
, 2) {}
411 bool run(Controls
&F
, Controls
&R
) {
412 if (!route(Order
.data(), Table
.data(), size(), 0))
415 getControls(F
, 0, Forward
);
416 getControls(R
, Log
, Reverse
);
421 bool route(ElemType
*P
, RowType
*T
, unsigned Size
, unsigned Step
);
425 bool ForwardDeltaNetwork::route(ElemType
*P
, RowType
*T
, unsigned Size
,
427 bool UseUp
= false, UseDown
= false;
430 // Cannot use coloring here, because coloring is used to determine
431 // the "big" switch, i.e. the one that changes halves, and in a forward
432 // network, a color can be simultaneously routed to both halves in the
433 // step we're working on.
434 for (ElemType J
= 0; J
!= Num
; ++J
) {
436 // I is the position in the input,
437 // J is the position in the output.
442 S
= (J
< Num
/2) ? Pass
: Switch
;
444 S
= (J
< Num
/2) ? Switch
: Pass
;
446 // U is the element in the table that needs to be updated.
447 ElemType U
= (S
== Pass
) ? I
: (I
< Num
/2 ? I
+Num
/2 : I
-Num
/2);
452 if (T
[U
][Step
] != S
&& T
[U
][Step
] != None
)
457 for (ElemType J
= 0; J
!= Num
; ++J
)
458 if (P
[J
] != Ignore
&& P
[J
] >= Num
/2)
462 if (UseUp
&& !route(P
, T
, Size
/2, Step
+1))
464 if (UseDown
&& !route(P
+Size
/2, T
+Size
/2, Size
/2, Step
+1))
470 bool ReverseDeltaNetwork::route(ElemType
*P
, RowType
*T
, unsigned Size
,
472 unsigned Pets
= Log
-1 - Step
;
473 bool UseUp
= false, UseDown
= false;
476 // In this step half-switching occurs, so coloring can be used.
477 Coloring
G({P
,Size
});
478 const Coloring::MapType
&M
= G
.colors();
482 ColorKind ColorUp
= ColorKind::None
;
483 for (ElemType J
= 0; J
!= Num
; ++J
) {
485 // I is the position in the input,
486 // J is the position in the output.
489 ColorKind C
= M
.at(I
);
490 if (C
== ColorKind::None
)
492 // During "Step", inputs cannot switch halves, so if the "up" color
493 // is still unknown, make sure that it is selected in such a way that
494 // "I" will stay in the same half.
495 bool InpUp
= I
< Num
/2;
496 if (ColorUp
== ColorKind::None
)
497 ColorUp
= InpUp
? C
: G
.other(C
);
498 if ((C
== ColorUp
) != InpUp
) {
499 // If I should go to a different half than where is it now, give up.
505 S
= (J
< Num
/2) ? Pass
: Switch
;
508 S
= (J
< Num
/2) ? Switch
: Pass
;
514 // Reorder the working permutation according to the computed switch table
515 // for the last step (i.e. Pets).
516 for (ElemType J
= 0, E
= Size
/ 2; J
!= E
; ++J
) {
517 ElemType PJ
= P
[J
]; // Current values of P[J]
518 ElemType PC
= P
[J
+Size
/2]; // and P[conj(J)]
519 ElemType QJ
= PJ
; // New values of P[J]
520 ElemType QC
= PC
; // and P[conj(J)]
521 if (T
[J
][Pets
] == Switch
)
523 if (T
[J
+Size
/2][Pets
] == Switch
)
529 for (ElemType J
= 0; J
!= Num
; ++J
)
530 if (P
[J
] != Ignore
&& P
[J
] >= Num
/2)
534 if (UseUp
&& !route(P
, T
, Size
/2, Step
+1))
536 if (UseDown
&& !route(P
+Size
/2, T
+Size
/2, Size
/2, Step
+1))
542 bool BenesNetwork::route(ElemType
*P
, RowType
*T
, unsigned Size
,
544 Coloring
G({P
,Size
});
545 const Coloring::MapType
&M
= G
.colors();
550 unsigned Pets
= 2*Log
-1 - Step
;
551 bool UseUp
= false, UseDown
= false;
553 // Both assignments, i.e. Red->Up and Red->Down are valid, but they will
554 // result in different controls. Let's pick the one where the first
555 // control will be "Pass".
556 ColorKind ColorUp
= ColorKind::None
;
557 for (ElemType J
= 0; J
!= Num
; ++J
) {
561 ColorKind C
= M
.at(I
);
562 if (C
== ColorKind::None
)
564 if (ColorUp
== ColorKind::None
) {
565 ColorUp
= (I
< Num
/ 2) ? ColorKind::Red
: ColorKind::Black
;
567 unsigned CI
= (I
< Num
/2) ? I
+Num
/2 : I
-Num
/2;
572 T
[CI
][Step
] = Switch
;
573 T
[J
][Pets
] = (J
< Num
/2) ? Pass
: Switch
;
577 T
[CI
][Step
] = Switch
;
580 T
[J
][Pets
] = (J
< Num
/2) ? Switch
: Pass
;
585 // Reorder the working permutation according to the computed switch table
586 // for the last step (i.e. Pets).
587 for (ElemType J
= 0; J
!= Num
/2; ++J
) {
588 ElemType PJ
= P
[J
]; // Current values of P[J]
589 ElemType PC
= P
[J
+Num
/2]; // and P[conj(J)]
590 ElemType QJ
= PJ
; // New values of P[J]
591 ElemType QC
= PC
; // and P[conj(J)]
592 if (T
[J
][Pets
] == Switch
)
594 if (T
[J
+Num
/2][Pets
] == Switch
)
600 for (ElemType J
= 0; J
!= Num
; ++J
)
601 if (P
[J
] != Ignore
&& P
[J
] >= Num
/2)
605 if (UseUp
&& !route(P
, T
, Size
/2, Step
+1))
607 if (UseDown
&& !route(P
+Size
/2, T
+Size
/2, Size
/2, Step
+1))
613 // --------------------------------------------------------------------
614 // Support for building selection results (output instructions that are
615 // parts of the final selection).
619 OpRef(SDValue V
) : OpV(V
) {}
620 bool isValue() const { return OpV
.getNode() != nullptr; }
621 bool isValid() const { return isValue() || !(OpN
& Invalid
); }
622 static OpRef
res(int N
) { return OpRef(Whole
| (N
& Index
)); }
623 static OpRef
fail() { return OpRef(Invalid
); }
625 static OpRef
lo(const OpRef
&R
) {
626 assert(!R
.isValue());
627 return OpRef(R
.OpN
& (Undef
| Index
| LoHalf
));
629 static OpRef
hi(const OpRef
&R
) {
630 assert(!R
.isValue());
631 return OpRef(R
.OpN
& (Undef
| Index
| HiHalf
));
633 static OpRef
undef(MVT Ty
) { return OpRef(Undef
| Ty
.SimpleTy
); }
636 SDValue OpV
= SDValue();
638 // Reference to the operand of the input node:
639 // If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the
641 // If bit 30 is set, it's the high half of the operand.
642 // If bit 29 is set, it's the low half of the operand.
646 Invalid
= 0x10000000,
649 Whole
= LoHalf
| HiHalf
,
651 Index
= 0x0FFFFFFF, // Mask of the index value.
656 void print(raw_ostream
&OS
, const SelectionDAG
&G
) const;
659 OpRef(unsigned N
) : OpN(N
) {}
662 struct NodeTemplate
{
663 NodeTemplate() = default;
666 std::vector
<OpRef
> Ops
;
668 LLVM_DUMP_METHOD
void print(raw_ostream
&OS
, const SelectionDAG
&G
) const;
672 ResultStack(SDNode
*Inp
)
673 : InpNode(Inp
), InpTy(Inp
->getValueType(0).getSimpleVT()) {}
676 unsigned push(const NodeTemplate
&Res
) {
678 return List
.size()-1;
680 unsigned push(unsigned Opc
, MVT Ty
, std::vector
<OpRef
> &&Ops
) {
687 bool empty() const { return List
.empty(); }
688 unsigned size() const { return List
.size(); }
689 unsigned top() const { return size()-1; }
690 const NodeTemplate
&operator[](unsigned I
) const { return List
[I
]; }
691 unsigned reset(unsigned NewTop
) {
692 List
.resize(NewTop
+1);
696 using BaseType
= std::vector
<NodeTemplate
>;
697 BaseType::iterator
begin() { return List
.begin(); }
698 BaseType::iterator
end() { return List
.end(); }
699 BaseType::const_iterator
begin() const { return List
.begin(); }
700 BaseType::const_iterator
end() const { return List
.end(); }
705 void print(raw_ostream
&OS
, const SelectionDAG
&G
) const;
709 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
710 void OpRef::print(raw_ostream
&OS
, const SelectionDAG
&G
) const {
712 OpV
.getNode()->print(OS
, &G
);
723 if ((OpN
& Whole
) != Whole
) {
724 assert((OpN
& Whole
) == LoHalf
|| (OpN
& Whole
) == HiHalf
);
730 OS
<< '#' << SignExtend32(OpN
& Index
, IndexBits
);
733 void NodeTemplate::print(raw_ostream
&OS
, const SelectionDAG
&G
) const {
734 const TargetInstrInfo
&TII
= *G
.getSubtarget().getInstrInfo();
735 OS
<< format("%8s", EVT(Ty
).getEVTString().c_str()) << " "
738 for (const auto &R
: Ops
) {
747 void ResultStack::print(raw_ostream
&OS
, const SelectionDAG
&G
) const {
748 OS
<< "Input node:\n";
752 OS
<< "Result templates:\n";
753 for (unsigned I
= 0, E
= List
.size(); I
!= E
; ++I
) {
754 OS
<< '[' << I
<< "] ";
755 List
[I
].print(OS
, G
);
763 ShuffleMask(ArrayRef
<int> M
) : Mask(M
) {
764 for (unsigned I
= 0, E
= Mask
.size(); I
!= E
; ++I
) {
768 MinSrc
= (MinSrc
== -1) ? M
: std::min(MinSrc
, M
);
769 MaxSrc
= (MaxSrc
== -1) ? M
: std::max(MaxSrc
, M
);
774 int MinSrc
= -1, MaxSrc
= -1;
776 ShuffleMask
lo() const {
777 size_t H
= Mask
.size()/2;
778 return ShuffleMask(Mask
.take_front(H
));
780 ShuffleMask
hi() const {
781 size_t H
= Mask
.size()/2;
782 return ShuffleMask(Mask
.take_back(H
));
785 void print(raw_ostream
&OS
) const {
786 OS
<< "MinSrc:" << MinSrc
<< ", MaxSrc:" << MaxSrc
<< " {";
793 LLVM_ATTRIBUTE_UNUSED
794 raw_ostream
&operator<<(raw_ostream
&OS
, const ShuffleMask
&SM
) {
800 // --------------------------------------------------------------------
801 // The HvxSelector class.
803 static const HexagonTargetLowering
&getHexagonLowering(SelectionDAG
&G
) {
804 return static_cast<const HexagonTargetLowering
&>(G
.getTargetLoweringInfo());
806 static const HexagonSubtarget
&getHexagonSubtarget(SelectionDAG
&G
) {
807 return static_cast<const HexagonSubtarget
&>(G
.getSubtarget());
812 const HexagonTargetLowering
&Lower
;
813 HexagonDAGToDAGISel
&ISel
;
815 const HexagonSubtarget
&HST
;
816 const unsigned HwLen
;
818 HvxSelector(HexagonDAGToDAGISel
&HS
, SelectionDAG
&G
)
819 : Lower(getHexagonLowering(G
)), ISel(HS
), DAG(G
),
820 HST(getHexagonSubtarget(G
)), HwLen(HST
.getVectorLength()) {}
822 MVT
getSingleVT(MVT ElemTy
) const {
823 assert(ElemTy
!= MVT::i1
&& "Use getBoolVT for predicates");
824 unsigned NumElems
= HwLen
/ (ElemTy
.getSizeInBits()/8);
825 return MVT::getVectorVT(ElemTy
, NumElems
);
828 MVT
getPairVT(MVT ElemTy
) const {
829 assert(ElemTy
!= MVT::i1
); // Suspicious: there are no predicate pairs.
830 unsigned NumElems
= (2*HwLen
) / (ElemTy
.getSizeInBits()/8);
831 return MVT::getVectorVT(ElemTy
, NumElems
);
834 MVT
getBoolVT() const {
835 // Return HwLen x i1.
836 return MVT::getVectorVT(MVT::i1
, HwLen
);
839 void selectShuffle(SDNode
*N
);
840 void selectRor(SDNode
*N
);
841 void selectVAlign(SDNode
*N
);
844 void select(SDNode
*ISelN
);
845 void materialize(const ResultStack
&Results
);
847 SDValue
getConst32(int Val
, const SDLoc
&dl
);
848 SDValue
getVectorConstant(ArrayRef
<uint8_t> Data
, const SDLoc
&dl
);
854 OpRef
concats(OpRef Va
, OpRef Vb
, ResultStack
&Results
);
855 OpRef
packs(ShuffleMask SM
, OpRef Va
, OpRef Vb
, ResultStack
&Results
,
856 MutableArrayRef
<int> NewMask
, unsigned Options
= None
);
857 OpRef
packp(ShuffleMask SM
, OpRef Va
, OpRef Vb
, ResultStack
&Results
,
858 MutableArrayRef
<int> NewMask
);
859 OpRef
vmuxs(ArrayRef
<uint8_t> Bytes
, OpRef Va
, OpRef Vb
,
860 ResultStack
&Results
);
861 OpRef
vmuxp(ArrayRef
<uint8_t> Bytes
, OpRef Va
, OpRef Vb
,
862 ResultStack
&Results
);
864 OpRef
shuffs1(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
);
865 OpRef
shuffs2(ShuffleMask SM
, OpRef Va
, OpRef Vb
, ResultStack
&Results
);
866 OpRef
shuffp1(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
);
867 OpRef
shuffp2(ShuffleMask SM
, OpRef Va
, OpRef Vb
, ResultStack
&Results
);
869 OpRef
butterfly(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
);
870 OpRef
contracting(ShuffleMask SM
, OpRef Va
, OpRef Vb
, ResultStack
&Results
);
871 OpRef
expanding(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
);
872 OpRef
perfect(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
);
874 bool selectVectorConstants(SDNode
*N
);
875 bool scalarizeShuffle(ArrayRef
<int> Mask
, const SDLoc
&dl
, MVT ResTy
,
876 SDValue Va
, SDValue Vb
, SDNode
*N
);
881 static void splitMask(ArrayRef
<int> Mask
, MutableArrayRef
<int> MaskL
,
882 MutableArrayRef
<int> MaskR
) {
883 unsigned VecLen
= Mask
.size();
884 assert(MaskL
.size() == VecLen
&& MaskR
.size() == VecLen
);
885 for (unsigned I
= 0; I
!= VecLen
; ++I
) {
888 MaskL
[I
] = MaskR
[I
] = -1;
889 } else if (unsigned(M
) < VecLen
) {
899 static std::pair
<int,unsigned> findStrip(ArrayRef
<int> A
, int Inc
,
901 assert(A
.size() > 0 && A
.size() >= MaxLen
);
904 for (unsigned I
= 1; I
!= MaxLen
; ++I
) {
909 return { F
, MaxLen
};
912 static bool isUndef(ArrayRef
<int> Mask
) {
919 static bool isIdentity(ArrayRef
<int> Mask
) {
920 for (int I
= 0, E
= Mask
.size(); I
!= E
; ++I
) {
922 if (M
>= 0 && M
!= I
)
928 static SmallVector
<unsigned, 4> getInputSegmentList(ShuffleMask SM
,
930 assert(isPowerOf2_32(SegLen
));
931 SmallVector
<unsigned, 4> SegList
;
935 unsigned Shift
= Log2_32(SegLen
);
936 BitVector
Segs(alignTo(SM
.MaxSrc
+ 1, SegLen
) >> Shift
);
938 for (int I
= 0, E
= SM
.Mask
.size(); I
!= E
; ++I
) {
941 Segs
.set(M
>> Shift
);
944 for (unsigned B
: Segs
.set_bits())
945 SegList
.push_back(B
);
949 static SmallVector
<unsigned, 4> getOutputSegmentMap(ShuffleMask SM
,
951 // Calculate the layout of the output segments in terms of the input
953 // For example [1,3,1,0] means that the output consists of 4 output
954 // segments, where the first output segment has only elements of the
955 // input segment at index 1. The next output segment only has elements
956 // of the input segment 3, etc.
957 // If an output segment only has undef elements, the value will be ~0u.
958 // If an output segment has elements from more than one input segment,
959 // the corresponding value will be ~1u.
960 unsigned MaskLen
= SM
.Mask
.size();
961 assert(MaskLen
% SegLen
== 0);
962 SmallVector
<unsigned, 4> Map(MaskLen
/ SegLen
);
964 for (int S
= 0, E
= Map
.size(); S
!= E
; ++S
) {
966 for (int I
= 0; I
!= static_cast<int>(SegLen
); ++I
) {
967 int M
= SM
.Mask
[S
*SegLen
+ I
];
970 unsigned G
= M
/ SegLen
; // Input segment of this element.
973 } else if (Idx
!= G
) {
984 static void packSegmentMask(ArrayRef
<int> Mask
, ArrayRef
<unsigned> OutSegMap
,
985 unsigned SegLen
, MutableArrayRef
<int> PackedMask
) {
986 SmallVector
<unsigned, 4> InvMap
;
987 for (int I
= OutSegMap
.size() - 1; I
>= 0; --I
) {
988 unsigned S
= OutSegMap
[I
];
989 assert(S
!= ~0u && "Unexpected undef");
990 assert(S
!= ~1u && "Unexpected multi");
991 if (InvMap
.size() <= S
)
996 unsigned Shift
= Log2_32(SegLen
);
997 for (int I
= 0, E
= Mask
.size(); I
!= E
; ++I
) {
1000 int OutIdx
= InvMap
[M
>> Shift
];
1001 M
= (M
& (SegLen
-1)) + SegLen
*OutIdx
;
1007 static bool isPermutation(ArrayRef
<int> Mask
) {
1008 // Check by adding all numbers only works if there is no overflow.
1009 assert(Mask
.size() < 0x00007FFF && "Sanity failure");
1011 for (int Idx
: Mask
) {
1016 int N
= Mask
.size();
1017 return 2*Sum
== N
*(N
-1);
1020 bool HvxSelector::selectVectorConstants(SDNode
*N
) {
1021 // Constant vectors are generated as loads from constant pools or as
1022 // splats of a constant value. Since they are generated during the
1023 // selection process, the main selection algorithm is not aware of them.
1024 // Select them directly here.
1025 SmallVector
<SDNode
*,4> Nodes
;
1026 SetVector
<SDNode
*> WorkQ
;
1028 // The DAG can change (due to CSE) during selection, so cache all the
1029 // unselected nodes first to avoid traversing a mutating DAG.
1031 for (unsigned i
= 0; i
!= WorkQ
.size(); ++i
) {
1032 SDNode
*W
= WorkQ
[i
];
1033 if (!W
->isMachineOpcode() && W
->getOpcode() == HexagonISD::ISEL
)
1035 for (unsigned j
= 0, f
= W
->getNumOperands(); j
!= f
; ++j
)
1036 WorkQ
.insert(W
->getOperand(j
).getNode());
1039 for (SDNode
*L
: Nodes
)
1042 return !Nodes
.empty();
1045 void HvxSelector::materialize(const ResultStack
&Results
) {
1046 DEBUG_WITH_TYPE("isel", {
1047 dbgs() << "Materializing\n";
1048 Results
.print(dbgs(), DAG
);
1050 if (Results
.empty())
1052 const SDLoc
&dl(Results
.InpNode
);
1053 std::vector
<SDValue
> Output
;
1055 for (unsigned I
= 0, E
= Results
.size(); I
!= E
; ++I
) {
1056 const NodeTemplate
&Node
= Results
[I
];
1057 std::vector
<SDValue
> Ops
;
1058 for (const OpRef
&R
: Node
.Ops
) {
1059 assert(R
.isValid());
1061 Ops
.push_back(R
.OpV
);
1064 if (R
.OpN
& OpRef::Undef
) {
1065 MVT::SimpleValueType SVT
= MVT::SimpleValueType(R
.OpN
& OpRef::Index
);
1066 Ops
.push_back(ISel
.selectUndef(dl
, MVT(SVT
)));
1069 // R is an index of a result.
1070 unsigned Part
= R
.OpN
& OpRef::Whole
;
1071 int Idx
= SignExtend32(R
.OpN
& OpRef::Index
, OpRef::IndexBits
);
1074 assert(Idx
>= 0 && unsigned(Idx
) < Output
.size());
1075 SDValue Op
= Output
[Idx
];
1076 MVT OpTy
= Op
.getValueType().getSimpleVT();
1077 if (Part
!= OpRef::Whole
) {
1078 assert(Part
== OpRef::LoHalf
|| Part
== OpRef::HiHalf
);
1079 MVT HalfTy
= MVT::getVectorVT(OpTy
.getVectorElementType(),
1080 OpTy
.getVectorNumElements()/2);
1081 unsigned Sub
= (Part
== OpRef::LoHalf
) ? Hexagon::vsub_lo
1083 Op
= DAG
.getTargetExtractSubreg(Sub
, dl
, HalfTy
, Op
);
1086 } // for (Node : Results)
1088 assert(Node
.Ty
!= MVT::Other
);
1089 SDNode
*ResN
= (Node
.Opc
== TargetOpcode::COPY
)
1090 ? Ops
.front().getNode()
1091 : DAG
.getMachineNode(Node
.Opc
, dl
, Node
.Ty
, Ops
);
1092 Output
.push_back(SDValue(ResN
, 0));
1095 SDNode
*OutN
= Output
.back().getNode();
1096 SDNode
*InpN
= Results
.InpNode
;
1097 DEBUG_WITH_TYPE("isel", {
1098 dbgs() << "Generated node:\n";
1102 ISel
.ReplaceNode(InpN
, OutN
);
1103 selectVectorConstants(OutN
);
1104 DAG
.RemoveDeadNodes();
1107 OpRef
HvxSelector::concats(OpRef Lo
, OpRef Hi
, ResultStack
&Results
) {
1108 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1109 const SDLoc
&dl(Results
.InpNode
);
1110 Results
.push(TargetOpcode::REG_SEQUENCE
, getPairVT(MVT::i8
), {
1111 getConst32(Hexagon::HvxWRRegClassID
, dl
),
1112 Lo
, getConst32(Hexagon::vsub_lo
, dl
),
1113 Hi
, getConst32(Hexagon::vsub_hi
, dl
),
1115 return OpRef::res(Results
.top());
1118 // Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb,
1119 // pack these halves into a single vector, and remap SM into NewMask to use
1120 // the new vector instead.
1121 OpRef
HvxSelector::packs(ShuffleMask SM
, OpRef Va
, OpRef Vb
,
1122 ResultStack
&Results
, MutableArrayRef
<int> NewMask
,
1124 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1125 if (!Va
.isValid() || !Vb
.isValid())
1126 return OpRef::fail();
1128 MVT Ty
= getSingleVT(MVT::i8
);
1129 MVT PairTy
= getPairVT(MVT::i8
);
1130 OpRef Inp
[2] = {Va
, Vb
};
1131 unsigned VecLen
= SM
.Mask
.size();
1133 auto valign
= [this](OpRef Lo
, OpRef Hi
, unsigned Amt
, MVT Ty
,
1134 ResultStack
&Results
) {
1137 const SDLoc
&dl(Results
.InpNode
);
1138 if (isUInt
<3>(Amt
) || isUInt
<3>(HwLen
- Amt
)) {
1139 bool IsRight
= isUInt
<3>(Amt
); // Right align.
1140 SDValue S
= getConst32(IsRight
? Amt
: HwLen
- Amt
, dl
);
1141 unsigned Opc
= IsRight
? Hexagon::V6_valignbi
: Hexagon::V6_vlalignbi
;
1142 Results
.push(Opc
, Ty
, {Hi
, Lo
, S
});
1143 return OpRef::res(Results
.top());
1145 Results
.push(Hexagon::A2_tfrsi
, MVT::i32
, {getConst32(Amt
, dl
)});
1146 OpRef A
= OpRef::res(Results
.top());
1147 Results
.push(Hexagon::V6_valignb
, Ty
, {Hi
, Lo
, A
});
1148 return OpRef::res(Results
.top());
1151 // Segment is a vector half.
1152 unsigned SegLen
= HwLen
/ 2;
1154 // Check if we can shuffle vector halves around to get the used elements
1155 // into a single vector.
1156 SmallVector
<int,128> MaskH(SM
.Mask
.begin(), SM
.Mask
.end());
1157 SmallVector
<unsigned, 4> SegList
= getInputSegmentList(SM
.Mask
, SegLen
);
1158 unsigned SegCount
= SegList
.size();
1159 SmallVector
<unsigned, 4> SegMap
= getOutputSegmentMap(SM
.Mask
, SegLen
);
1161 if (SegList
.empty())
1162 return OpRef::undef(Ty
);
1165 // In the following part of the function, where the segments are rearranged,
1166 // the shuffle mask SM can be of any length that is a multiple of a vector
1167 // (i.e. a multiple of 2*SegLen), and non-zero.
1168 // The output segment map is computed, and it may have any even number of
1169 // entries, but the rearrangement of input segments will be done based only
1170 // on the first two (non-undef) entries in the segment map.
1171 // For example, if the output map is 3, 1, 1, 3 (it can have at most two
1172 // distinct entries!), the segments 1 and 3 of Va/Vb will be packaged into
1173 // a single vector V = 3:1. The output mask will then be updated to use
1174 // seg(0,V), seg(1,V), seg(1,V), seg(0,V).
1176 // Picking the segments based on the output map is an optimization. For
1177 // correctness it is only necessary that Seg0 and Seg1 are the two input
1178 // segments that are used in the output.
1180 unsigned Seg0
= ~0u, Seg1
= ~0u;
1181 for (int I
= 0, E
= SegMap
.size(); I
!= E
; ++I
) {
1182 unsigned X
= SegMap
[I
];
1187 else if (Seg1
!= ~0u)
1189 if (X
== ~1u || X
!= Seg0
)
1193 if (SegCount
== 1) {
1194 unsigned SrcOp
= SegList
[0] / 2;
1195 for (int I
= 0; I
!= static_cast<int>(VecLen
); ++I
) {
1206 if (SegCount
== 2) {
1207 // Seg0 should not be undef here: this would imply a SegList
1208 // with <= 1 elements, which was checked earlier.
1209 assert(Seg0
!= ~0u);
1211 // If Seg0 or Seg1 are "multi-defined", pick them from the input
1212 // segment list instead.
1213 if (Seg0
== ~1u || Seg1
== ~1u) {
1217 } else if (Seg0
== ~1u) {
1218 Seg0
= SegList
[0] != Seg1
? SegList
[0] : SegList
[1];
1220 assert(Seg1
== ~1u); // Sanity
1221 Seg1
= SegList
[0] != Seg0
? SegList
[0] : SegList
[1];
1224 assert(Seg0
!= ~1u && Seg1
!= ~1u);
1226 assert(Seg0
!= Seg1
&& "Expecting different segments");
1227 const SDLoc
&dl(Results
.InpNode
);
1228 Results
.push(Hexagon::A2_tfrsi
, MVT::i32
, {getConst32(SegLen
, dl
)});
1229 OpRef HL
= OpRef::res(Results
.top());
1233 if (Seg0
/ 2 == Seg1
/ 2) {
1234 // Same input vector.
1238 Results
.push(Hexagon::V6_vror
, Ty
, {Inp
[Seg0
/ 2], HL
});
1239 Va
= OpRef::res(Results
.top());
1241 packSegmentMask(SM
.Mask
, {Seg0
, Seg1
}, SegLen
, MaskH
);
1242 } else if (Seg0
% 2 == Seg1
% 2) {
1243 // Picking AC, BD, CA, or DB.
1244 // vshuff(CD,AB,HL) -> BD:AC
1245 // vshuff(AB,CD,HL) -> DB:CA
1246 auto Vs
= (Seg0
== 0 || Seg0
== 1) ? std::make_pair(Vb
, Va
) // AC or BD
1247 : std::make_pair(Va
, Vb
); // CA or DB
1248 Results
.push(Hexagon::V6_vshuffvdd
, PairTy
, {Vs
.first
, Vs
.second
, HL
});
1249 OpRef P
= OpRef::res(Results
.top());
1250 Va
= (Seg0
== 0 || Seg0
== 2) ? OpRef::lo(P
) : OpRef::hi(P
);
1251 packSegmentMask(SM
.Mask
, {Seg0
, Seg1
}, SegLen
, MaskH
);
1253 // Picking AD, BC, CB, or DA.
1254 if ((Seg0
== 0 && Seg1
== 3) || (Seg0
== 2 && Seg1
== 1)) {
1255 // AD or BC: this can be done using vmux.
1256 // Q = V6_pred_scalar2 SegLen
1257 // V = V6_vmux Q, (Va, Vb) or (Vb, Va)
1258 Results
.push(Hexagon::V6_pred_scalar2
, getBoolVT(), {HL
});
1259 OpRef Qt
= OpRef::res(Results
.top());
1260 auto Vs
= (Seg0
== 0) ? std::make_pair(Va
, Vb
) // AD
1261 : std::make_pair(Vb
, Va
); // CB
1262 Results
.push(Hexagon::V6_vmux
, Ty
, {Qt
, Vs
.first
, Vs
.second
});
1263 Va
= OpRef::res(Results
.top());
1264 packSegmentMask(SM
.Mask
, {Seg0
, Seg1
}, SegLen
, MaskH
);
1266 // BC or DA: this could be done via valign by SegLen.
1267 // Do nothing here, because valign (if possible) will be generated
1268 // later on (make sure the Seg0 values are as expected, for sanity).
1269 assert(Seg0
== 1 || Seg0
== 3);
1274 // Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va).
1276 ShuffleMask
SMH(MaskH
);
1277 assert(SMH
.Mask
.size() == VecLen
);
1278 SmallVector
<int,128> MaskA(SMH
.Mask
.begin(), SMH
.Mask
.end());
1280 if (SMH
.MaxSrc
- SMH
.MinSrc
>= static_cast<int>(HwLen
)) {
1281 // valign(Lo=Va,Hi=Vb) won't work. Try swapping Va/Vb.
1282 SmallVector
<int,128> Swapped(SMH
.Mask
.begin(), SMH
.Mask
.end());
1283 ShuffleVectorSDNode::commuteMask(Swapped
);
1284 ShuffleMask
SW(Swapped
);
1285 if (SW
.MaxSrc
- SW
.MinSrc
< static_cast<int>(HwLen
)) {
1286 MaskA
.assign(SW
.Mask
.begin(), SW
.Mask
.end());
1290 ShuffleMask
SMA(MaskA
);
1291 assert(SMA
.Mask
.size() == VecLen
);
1293 if (SMA
.MaxSrc
- SMA
.MinSrc
< static_cast<int>(HwLen
)) {
1294 int ShiftR
= SMA
.MinSrc
;
1295 if (ShiftR
>= static_cast<int>(HwLen
)) {
1297 Vb
= OpRef::undef(Ty
);
1300 OpRef RetVal
= valign(Va
, Vb
, ShiftR
, Ty
, Results
);
1302 for (int I
= 0; I
!= static_cast<int>(VecLen
); ++I
) {
1303 int M
= SMA
.Mask
[I
];
1311 // By here, packing by segment (half-vector) shuffling, and vector alignment
1312 // failed. Try vmux.
1313 // Note: since this is using the original mask, Va and Vb must not have been
1316 if (Options
& PackMux
) {
1317 // If elements picked from Va and Vb have all different (source) indexes
1318 // (relative to the start of the argument), do a mux, and update the mask.
1319 BitVector
Picked(HwLen
);
1320 SmallVector
<uint8_t,128> MuxBytes(HwLen
);
1322 for (int I
= 0; I
!= static_cast<int>(VecLen
); ++I
) {
1326 if (M
>= static_cast<int>(HwLen
))
1337 return vmuxs(MuxBytes
, Va
, Vb
, Results
);
1339 return OpRef::fail();
1342 // Va, Vb are vector pairs. If SM only uses two single vectors from Va/Vb,
1343 // pack these vectors into a pair, and remap SM into NewMask to use the
1344 // new pair instead.
1345 OpRef
HvxSelector::packp(ShuffleMask SM
, OpRef Va
, OpRef Vb
,
1346 ResultStack
&Results
, MutableArrayRef
<int> NewMask
) {
1347 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1348 SmallVector
<unsigned, 4> SegList
= getInputSegmentList(SM
.Mask
, HwLen
);
1349 if (SegList
.empty())
1350 return OpRef::undef(getPairVT(MVT::i8
));
1352 // If more than two halves are used, bail.
1353 // TODO: be more aggressive here?
1354 unsigned SegCount
= SegList
.size();
1356 return OpRef::fail();
1358 MVT HalfTy
= getSingleVT(MVT::i8
);
1360 OpRef Inp
[2] = { Va
, Vb
};
1361 OpRef Out
[2] = { OpRef::undef(HalfTy
), OpRef::undef(HalfTy
) };
1363 // Really make sure we have at most 2 vectors used in the mask.
1364 assert(SegCount
<= 2);
1366 for (int I
= 0, E
= SegList
.size(); I
!= E
; ++I
) {
1367 unsigned S
= SegList
[I
];
1368 OpRef Op
= Inp
[S
/ 2];
1369 Out
[I
] = (S
& 1) ? OpRef::hi(Op
) : OpRef::lo(Op
);
1372 // NOTE: Using SegList as the packing map here (not SegMap). This works,
1373 // because we're not concerned here about the order of the segments (i.e.
1374 // single vectors) in the output pair. Changing the order of vectors is
1375 // free (as opposed to changing the order of vector halves as in packs),
1376 // and so there is no extra cost added in case the order needs to be
1378 packSegmentMask(SM
.Mask
, SegList
, HwLen
, NewMask
);
1379 return concats(Out
[0], Out
[1], Results
);
1382 OpRef
HvxSelector::vmuxs(ArrayRef
<uint8_t> Bytes
, OpRef Va
, OpRef Vb
,
1383 ResultStack
&Results
) {
1384 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1385 MVT ByteTy
= getSingleVT(MVT::i8
);
1386 MVT BoolTy
= MVT::getVectorVT(MVT::i1
, HwLen
);
1387 const SDLoc
&dl(Results
.InpNode
);
1388 SDValue B
= getVectorConstant(Bytes
, dl
);
1389 Results
.push(Hexagon::V6_vd0
, ByteTy
, {});
1390 Results
.push(Hexagon::V6_veqb
, BoolTy
, {OpRef(B
), OpRef::res(-1)});
1391 Results
.push(Hexagon::V6_vmux
, ByteTy
, {OpRef::res(-1), Vb
, Va
});
1392 return OpRef::res(Results
.top());
1395 OpRef
HvxSelector::vmuxp(ArrayRef
<uint8_t> Bytes
, OpRef Va
, OpRef Vb
,
1396 ResultStack
&Results
) {
1397 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1398 size_t S
= Bytes
.size() / 2;
1399 OpRef L
= vmuxs(Bytes
.take_front(S
), OpRef::lo(Va
), OpRef::lo(Vb
), Results
);
1400 OpRef H
= vmuxs(Bytes
.drop_front(S
), OpRef::hi(Va
), OpRef::hi(Vb
), Results
);
1401 return concats(L
, H
, Results
);
1404 OpRef
HvxSelector::shuffs1(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
) {
1405 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1406 unsigned VecLen
= SM
.Mask
.size();
1407 assert(HwLen
== VecLen
);
1409 assert(all_of(SM
.Mask
, [this](int M
) { return M
== -1 || M
< int(HwLen
); }));
1411 if (isIdentity(SM
.Mask
))
1413 if (isUndef(SM
.Mask
))
1414 return OpRef::undef(getSingleVT(MVT::i8
));
1416 unsigned HalfLen
= HwLen
/ 2;
1417 assert(isPowerOf2_32(HalfLen
)); // Sanity.
1419 // Handle special case where the output is the same half of the input
1420 // repeated twice, i.e. if Va = AB, then handle the output of AA or BB.
1421 std::pair
<int, unsigned> Strip1
= findStrip(SM
.Mask
, 1, HalfLen
);
1422 if ((Strip1
.first
& ~HalfLen
) == 0 && Strip1
.second
== HalfLen
) {
1423 std::pair
<int, unsigned> Strip2
=
1424 findStrip(SM
.Mask
.drop_front(HalfLen
), 1, HalfLen
);
1425 if (Strip1
== Strip2
) {
1426 const SDLoc
&dl(Results
.InpNode
);
1427 Results
.push(Hexagon::A2_tfrsi
, MVT::i32
, {getConst32(HalfLen
, dl
)});
1428 Results
.push(Hexagon::V6_vshuffvdd
, getPairVT(MVT::i8
),
1429 {Va
, Va
, OpRef::res(Results
.top())});
1430 OpRef S
= OpRef::res(Results
.top());
1431 return (Strip1
.first
== 0) ? OpRef::lo(S
) : OpRef::hi(S
);
1435 OpRef P
= perfect(SM
, Va
, Results
);
1438 return butterfly(SM
, Va
, Results
);
1441 OpRef
HvxSelector::shuffs2(ShuffleMask SM
, OpRef Va
, OpRef Vb
,
1442 ResultStack
&Results
) {
1443 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1444 if (isUndef(SM
.Mask
))
1445 return OpRef::undef(getSingleVT(MVT::i8
));
1447 OpRef C
= contracting(SM
, Va
, Vb
, Results
);
1451 int VecLen
= SM
.Mask
.size();
1452 SmallVector
<int,128> PackedMask(VecLen
);
1453 OpRef P
= packs(SM
, Va
, Vb
, Results
, PackedMask
);
1455 return shuffs1(ShuffleMask(PackedMask
), P
, Results
);
1457 // TODO: Before we split the mask, try perfect shuffle on concatenated
1458 // operands. This won't work now, because the perfect code does not
1459 // tolerate undefs in the mask.
1461 SmallVector
<int,128> MaskL(VecLen
), MaskR(VecLen
);
1462 splitMask(SM
.Mask
, MaskL
, MaskR
);
1464 OpRef L
= shuffs1(ShuffleMask(MaskL
), Va
, Results
);
1465 OpRef R
= shuffs1(ShuffleMask(MaskR
), Vb
, Results
);
1466 if (!L
.isValid() || !R
.isValid())
1467 return OpRef::fail();
1469 SmallVector
<uint8_t,128> Bytes(VecLen
);
1470 for (int I
= 0; I
!= VecLen
; ++I
) {
1474 return vmuxs(Bytes
, L
, R
, Results
);
1477 OpRef
HvxSelector::shuffp1(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
) {
1478 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1479 int VecLen
= SM
.Mask
.size();
1481 if (isIdentity(SM
.Mask
))
1483 if (isUndef(SM
.Mask
))
1484 return OpRef::undef(getPairVT(MVT::i8
));
1486 SmallVector
<int,128> PackedMask(VecLen
);
1487 OpRef P
= packs(SM
, OpRef::lo(Va
), OpRef::hi(Va
), Results
, PackedMask
);
1489 ShuffleMask
PM(PackedMask
);
1490 OpRef E
= expanding(PM
, P
, Results
);
1494 OpRef L
= shuffs1(PM
.lo(), P
, Results
);
1495 OpRef H
= shuffs1(PM
.hi(), P
, Results
);
1496 if (L
.isValid() && H
.isValid())
1497 return concats(L
, H
, Results
);
1500 OpRef R
= perfect(SM
, Va
, Results
);
1503 // TODO commute the mask and try the opposite order of the halves.
1505 OpRef L
= shuffs2(SM
.lo(), OpRef::lo(Va
), OpRef::hi(Va
), Results
);
1506 OpRef H
= shuffs2(SM
.hi(), OpRef::lo(Va
), OpRef::hi(Va
), Results
);
1507 if (L
.isValid() && H
.isValid())
1508 return concats(L
, H
, Results
);
1510 return OpRef::fail();
1513 OpRef
HvxSelector::shuffp2(ShuffleMask SM
, OpRef Va
, OpRef Vb
,
1514 ResultStack
&Results
) {
1515 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1516 if (isUndef(SM
.Mask
))
1517 return OpRef::undef(getPairVT(MVT::i8
));
1519 int VecLen
= SM
.Mask
.size();
1520 SmallVector
<int,256> PackedMask(VecLen
);
1521 OpRef P
= packp(SM
, Va
, Vb
, Results
, PackedMask
);
1523 return shuffp1(ShuffleMask(PackedMask
), P
, Results
);
1525 SmallVector
<int,256> MaskL(VecLen
), MaskR(VecLen
);
1526 splitMask(SM
.Mask
, MaskL
, MaskR
);
1528 OpRef L
= shuffp1(ShuffleMask(MaskL
), Va
, Results
);
1529 OpRef R
= shuffp1(ShuffleMask(MaskR
), Vb
, Results
);
1530 if (!L
.isValid() || !R
.isValid())
1531 return OpRef::fail();
1534 SmallVector
<uint8_t,256> Bytes(VecLen
);
1535 for (int I
= 0; I
!= VecLen
; ++I
) {
1539 return vmuxp(Bytes
, L
, R
, Results
);
1543 struct Deleter
: public SelectionDAG::DAGNodeDeletedListener
{
1544 template <typename T
>
1545 Deleter(SelectionDAG
&D
, T
&C
)
1546 : SelectionDAG::DAGNodeDeletedListener(D
, [&C
] (SDNode
*N
, SDNode
*E
) {
1551 template <typename T
>
1552 struct NullifyingVector
: public T
{
1553 DenseMap
<SDNode
*, SDNode
**> Refs
;
1554 NullifyingVector(T
&&V
) : T(V
) {
1555 for (unsigned i
= 0, e
= T::size(); i
!= e
; ++i
) {
1556 SDNode
*&N
= T::operator[](i
);
1560 void erase(SDNode
*N
) {
1561 auto F
= Refs
.find(N
);
1562 if (F
!= Refs
.end())
1563 *F
->second
= nullptr;
1568 void HvxSelector::select(SDNode
*ISelN
) {
1569 // What's important here is to select the right set of nodes. The main
1570 // selection algorithm loops over nodes in a topological order, i.e. users
1571 // are visited before their operands.
1573 // It is an error to have an unselected node with a selected operand, and
1574 // there is an assertion in the main selector code to enforce that.
1576 // Such a situation could occur if we selected a node, which is both a
1577 // subnode of ISelN, and a subnode of an unrelated (and yet unselected)
1579 assert(ISelN
->getOpcode() == HexagonISD::ISEL
);
1580 SDNode
*N0
= ISelN
->getOperand(0).getNode();
1581 if (N0
->isMachineOpcode()) {
1582 ISel
.ReplaceNode(ISelN
, N0
);
1586 // There could have been nodes created (i.e. inserted into the DAG)
1587 // that are now dead. Remove them, in case they use any of the nodes
1588 // to select (and make them look shared).
1589 DAG
.RemoveDeadNodes();
1591 SetVector
<SDNode
*> SubNodes
, TmpQ
;
1592 std::map
<SDNode
*,unsigned> NumOps
;
1594 // Don't want to select N0 if it's shared with another node, except if
1595 // it's shared with other ISELs.
1596 auto IsISelN
= [](SDNode
*T
) { return T
->getOpcode() == HexagonISD::ISEL
; };
1597 if (llvm::all_of(N0
->uses(), IsISelN
))
1598 SubNodes
.insert(N0
);
1600 auto InSubNodes
= [&SubNodes
](SDNode
*T
) { return SubNodes
.count(T
); };
1601 for (unsigned I
= 0; I
!= SubNodes
.size(); ++I
) {
1602 SDNode
*S
= SubNodes
[I
];
1604 // Only add subnodes that are only reachable from N0.
1605 for (SDValue Op
: S
->ops()) {
1606 SDNode
*O
= Op
.getNode();
1607 if (llvm::all_of(O
->uses(), InSubNodes
)) {
1612 NumOps
.insert({S
, OpN
});
1617 for (unsigned I
= 0; I
!= TmpQ
.size(); ++I
) {
1618 SDNode
*S
= TmpQ
[I
];
1619 for (SDNode
*U
: S
->uses()) {
1622 auto F
= NumOps
.find(U
);
1623 assert(F
!= NumOps
.end());
1624 if (F
->second
> 0 && !--F
->second
)
1625 TmpQ
.insert(F
->first
);
1629 // Remove the marker.
1630 ISel
.ReplaceNode(ISelN
, N0
);
1632 assert(SubNodes
.size() == TmpQ
.size());
1633 NullifyingVector
<decltype(TmpQ
)::vector_type
> Queue(TmpQ
.takeVector());
1635 Deleter
DUQ(DAG
, Queue
);
1636 for (SDNode
*S
: reverse(Queue
)) {
1639 DEBUG_WITH_TYPE("isel", {dbgs() << "HVX selecting: "; S
->dump(&DAG
);});
1644 bool HvxSelector::scalarizeShuffle(ArrayRef
<int> Mask
, const SDLoc
&dl
,
1645 MVT ResTy
, SDValue Va
, SDValue Vb
,
1647 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1648 MVT ElemTy
= ResTy
.getVectorElementType();
1649 assert(ElemTy
== MVT::i8
);
1650 unsigned VecLen
= Mask
.size();
1651 bool HavePairs
= (2*HwLen
== VecLen
);
1652 MVT SingleTy
= getSingleVT(MVT::i8
);
1654 // The prior attempts to handle this shuffle may have left a bunch of
1655 // dead nodes in the DAG (such as constants). These nodes will be added
1656 // at the end of DAG's node list, which at that point had already been
1657 // sorted topologically. In the main selection loop, the node list is
1658 // traversed backwards from the root node, which means that any new
1659 // nodes (from the end of the list) will not be visited.
1660 // Scalarization will replace the shuffle node with the scalarized
1661 // expression, and if that expression reused any if the leftoever (dead)
1662 // nodes, these nodes would not be selected (since the "local" selection
1663 // only visits nodes that are not in AllNodes).
1664 // To avoid this issue, remove all dead nodes from the DAG now.
1665 // DAG.RemoveDeadNodes();
1667 SmallVector
<SDValue
,128> Ops
;
1668 LLVMContext
&Ctx
= *DAG
.getContext();
1669 MVT LegalTy
= Lower
.getTypeToTransformTo(Ctx
, ElemTy
).getSimpleVT();
1670 for (int I
: Mask
) {
1672 Ops
.push_back(ISel
.selectUndef(dl
, LegalTy
));
1685 Vec
= DAG
.getTargetExtractSubreg(Hexagon::vsub_lo
, dl
, SingleTy
, Vec
);
1687 Vec
= DAG
.getTargetExtractSubreg(Hexagon::vsub_hi
, dl
, SingleTy
, Vec
);
1691 SDValue Idx
= DAG
.getConstant(M
, dl
, MVT::i32
);
1692 SDValue Ex
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, LegalTy
, {Vec
, Idx
});
1693 SDValue L
= Lower
.LowerOperation(Ex
, DAG
);
1694 assert(L
.getNode());
1699 if (2*HwLen
== VecLen
) {
1700 SDValue B0
= DAG
.getBuildVector(SingleTy
, dl
, {Ops
.data(), HwLen
});
1701 SDValue L0
= Lower
.LowerOperation(B0
, DAG
);
1702 SDValue B1
= DAG
.getBuildVector(SingleTy
, dl
, {Ops
.data()+HwLen
, HwLen
});
1703 SDValue L1
= Lower
.LowerOperation(B1
, DAG
);
1704 // XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
1705 // functions may expect to be called only for illegal operations, so
1706 // make sure that they are not called for legal ones. Develop a better
1707 // mechanism for dealing with this.
1708 LV
= DAG
.getNode(ISD::CONCAT_VECTORS
, dl
, ResTy
, {L0
, L1
});
1710 SDValue BV
= DAG
.getBuildVector(ResTy
, dl
, Ops
);
1711 LV
= Lower
.LowerOperation(BV
, DAG
);
1714 assert(!N
->use_empty());
1715 SDValue IS
= DAG
.getNode(HexagonISD::ISEL
, dl
, ResTy
, LV
);
1716 ISel
.ReplaceNode(N
, IS
.getNode());
1717 select(IS
.getNode());
1718 DAG
.RemoveDeadNodes();
1722 OpRef
HvxSelector::contracting(ShuffleMask SM
, OpRef Va
, OpRef Vb
,
1723 ResultStack
&Results
) {
1724 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1725 if (!Va
.isValid() || !Vb
.isValid())
1726 return OpRef::fail();
1728 // Contracting shuffles, i.e. instructions that always discard some bytes
1729 // from the operand vectors.
1733 // V6_vpack{e,o}{b,h}
1735 int VecLen
= SM
.Mask
.size();
1736 std::pair
<int,unsigned> Strip
= findStrip(SM
.Mask
, 1, VecLen
);
1737 MVT ResTy
= getSingleVT(MVT::i8
);
1739 // The following shuffles only work for bytes and halfwords. This requires
1740 // the strip length to be 1 or 2.
1741 if (Strip
.second
!= 1 && Strip
.second
!= 2)
1742 return OpRef::fail();
1744 // The patterns for the shuffles, in terms of the starting offsets of the
1745 // consecutive strips (L = length of the strip, N = VecLen):
1747 // vpacke: 0, 2L, 4L ... N+0, N+2L, N+4L ... L = 1 or 2
1748 // vpacko: L, 3L, 5L ... N+L, N+3L, N+5L ... L = 1 or 2
1750 // vshuffe: 0, N+0, 2L, N+2L, 4L ... L = 1 or 2
1751 // vshuffo: L, N+L, 3L, N+3L, 5L ... L = 1 or 2
1753 // vdealb4w: 0, 4, 8 ... 2, 6, 10 ... N+0, N+4, N+8 ... N+2, N+6, N+10 ...
1755 // The value of the element in the mask following the strip will decide
1756 // what kind of a shuffle this can be.
1757 int NextInMask
= SM
.Mask
[Strip
.second
];
1759 // Check if NextInMask could be 2L, 3L or 4, i.e. if it could be a mask
1760 // for vpack or vdealb4w. VecLen > 4, so NextInMask for vdealb4w would
1762 if (NextInMask
< VecLen
) {
1763 // vpack{e,o} or vdealb4w
1764 if (Strip
.first
== 0 && Strip
.second
== 1 && NextInMask
== 4) {
1766 // Check if this is vdealb4w (L=1).
1767 for (int I
= 0; I
!= N
/4; ++I
)
1768 if (SM
.Mask
[I
] != 4*I
)
1769 return OpRef::fail();
1770 for (int I
= 0; I
!= N
/4; ++I
)
1771 if (SM
.Mask
[I
+N
/4] != 2 + 4*I
)
1772 return OpRef::fail();
1773 for (int I
= 0; I
!= N
/4; ++I
)
1774 if (SM
.Mask
[I
+N
/2] != N
+ 4*I
)
1775 return OpRef::fail();
1776 for (int I
= 0; I
!= N
/4; ++I
)
1777 if (SM
.Mask
[I
+3*N
/4] != N
+2 + 4*I
)
1778 return OpRef::fail();
1779 // Matched mask for vdealb4w.
1780 Results
.push(Hexagon::V6_vdealb4w
, ResTy
, {Vb
, Va
});
1781 return OpRef::res(Results
.top());
1784 // Check if this is vpack{e,o}.
1786 int L
= Strip
.second
;
1787 // Check if the first strip starts at 0 or at L.
1788 if (Strip
.first
!= 0 && Strip
.first
!= L
)
1789 return OpRef::fail();
1790 // Examine the rest of the mask.
1791 for (int I
= L
; I
< N
; I
+= L
) {
1792 auto S
= findStrip(SM
.Mask
.drop_front(I
), 1, N
-I
);
1793 // Check whether the mask element at the beginning of each strip
1794 // increases by 2L each time.
1795 if (S
.first
- Strip
.first
!= 2*I
)
1796 return OpRef::fail();
1797 // Check whether each strip is of the same length.
1798 if (S
.second
!= unsigned(L
))
1799 return OpRef::fail();
1802 // Strip.first == 0 => vpacke
1803 // Strip.first == L => vpacko
1804 assert(Strip
.first
== 0 || Strip
.first
== L
);
1805 using namespace Hexagon
;
1807 Res
.Opc
= Strip
.second
== 1 // Number of bytes.
1808 ? (Strip
.first
== 0 ? V6_vpackeb
: V6_vpackob
)
1809 : (Strip
.first
== 0 ? V6_vpackeh
: V6_vpackoh
);
1811 Res
.Ops
= { Vb
, Va
};
1813 return OpRef::res(Results
.top());
1816 // Check if this is vshuff{e,o}.
1818 int L
= Strip
.second
;
1819 std::pair
<int,unsigned> PrevS
= Strip
;
1821 for (int I
= L
; I
< N
; I
+= L
) {
1822 auto S
= findStrip(SM
.Mask
.drop_front(I
), 1, N
-I
);
1823 if (S
.second
!= PrevS
.second
)
1824 return OpRef::fail();
1825 int Diff
= Flip
? PrevS
.first
- S
.first
+ 2*L
1826 : S
.first
- PrevS
.first
;
1828 return OpRef::fail();
1832 // Strip.first == 0 => vshuffe
1833 // Strip.first == L => vshuffo
1834 assert(Strip
.first
== 0 || Strip
.first
== L
);
1835 using namespace Hexagon
;
1837 Res
.Opc
= Strip
.second
== 1 // Number of bytes.
1838 ? (Strip
.first
== 0 ? V6_vshuffeb
: V6_vshuffob
)
1839 : (Strip
.first
== 0 ? V6_vshufeh
: V6_vshufoh
);
1841 Res
.Ops
= { Vb
, Va
};
1843 return OpRef::res(Results
.top());
1846 OpRef
HvxSelector::expanding(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
) {
1847 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1848 // Expanding shuffles (using all elements and inserting into larger vector):
1850 // V6_vunpacku{b,h} [*]
1852 // [*] Only if the upper elements (filled with 0s) are "don't care" in Mask.
1854 // Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
1855 // they are not shuffles.
1857 // The argument is a single vector.
1859 int VecLen
= SM
.Mask
.size();
1860 assert(2*HwLen
== unsigned(VecLen
) && "Expecting vector-pair type");
1862 std::pair
<int,unsigned> Strip
= findStrip(SM
.Mask
, 1, VecLen
);
1864 // The patterns for the unpacks, in terms of the starting offsets of the
1865 // consecutive strips (L = length of the strip, N = VecLen):
1867 // vunpacku: 0, -1, L, -1, 2L, -1 ...
1869 if (Strip
.first
!= 0)
1870 return OpRef::fail();
1872 // The vunpackus only handle byte and half-word.
1873 if (Strip
.second
!= 1 && Strip
.second
!= 2)
1874 return OpRef::fail();
1877 int L
= Strip
.second
;
1879 // First, check the non-ignored strips.
1880 for (int I
= 2*L
; I
< N
; I
+= 2*L
) {
1881 auto S
= findStrip(SM
.Mask
.drop_front(I
), 1, N
-I
);
1882 if (S
.second
!= unsigned(L
))
1883 return OpRef::fail();
1885 return OpRef::fail();
1888 for (int I
= L
; I
< N
; I
+= 2*L
) {
1889 auto S
= findStrip(SM
.Mask
.drop_front(I
), 0, N
-I
);
1890 if (S
.first
!= -1 || S
.second
!= unsigned(L
))
1891 return OpRef::fail();
1894 unsigned Opc
= Strip
.second
== 1 ? Hexagon::V6_vunpackub
1895 : Hexagon::V6_vunpackuh
;
1896 Results
.push(Opc
, getPairVT(MVT::i8
), {Va
});
1897 return OpRef::res(Results
.top());
1900 OpRef
HvxSelector::perfect(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
) {
1901 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
1905 // V6_vshufoe{b,h} those are quivalent to vshuffvdd(..,{1,2})
1906 // V6_vshuffvdd (V6_vshuff)
1907 // V6_dealvdd (V6_vdeal)
1909 int VecLen
= SM
.Mask
.size();
1910 assert(isPowerOf2_32(VecLen
) && Log2_32(VecLen
) <= 8);
1911 unsigned LogLen
= Log2_32(VecLen
);
1912 unsigned HwLog
= Log2_32(HwLen
);
1913 // The result length must be the same as the length of a single vector,
1914 // or a vector pair.
1915 assert(LogLen
== HwLog
|| LogLen
== HwLog
+1);
1916 bool HavePairs
= LogLen
== HwLog
+1;
1918 if (!isPermutation(SM
.Mask
))
1919 return OpRef::fail();
1921 SmallVector
<unsigned,8> Perm(LogLen
);
1923 // Check if this could be a perfect shuffle, or a combination of perfect
1926 // Consider this permutation (using hex digits to make the ASCII diagrams
1928 // { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
1929 // This is a "deal" operation: divide the input into two halves, and
1930 // create the output by picking elements by alternating between these two
1932 // 0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F [*]
1935 // Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
1936 // a somwehat different mechanism that could be used to perform shuffle/
1937 // deal operations: a 2x2 transpose.
1938 // Consider the halves of inputs again, they can be interpreted as a 2x8
1939 // matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
1940 // together. Now, when considering 2 elements at a time, it will be a 2x4
1941 // matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
1944 // With groups of 4, this will become a single 2x2 matrix, and so on.
1946 // The 2x2 transpose instruction works by transposing each of the 2x2
1947 // matrices (or "sub-matrices"), given a specific group size. For example,
1948 // if the group size is 1 (i.e. each element is its own group), there
1949 // will be four transposes of the four 2x2 matrices that form the 2x8.
1950 // For example, with the inputs as above, the result will be:
1953 // Now, this result can be tranposed again, but with the group size of 2:
1956 // If we then transpose that result, but with the group size of 4, we get:
1959 // If we concatenate these two rows, it will be
1960 // 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
1961 // which is the same as the "deal" [*] above.
1963 // In general, a "deal" of individual elements is a series of 2x2 transposes,
1964 // with changing group size. HVX has two instructions:
1965 // Vdd = V6_vdealvdd Vu, Vv, Rt
1966 // Vdd = V6_shufvdd Vu, Vv, Rt
1967 // that perform exactly that. The register Rt controls which transposes are
1968 // going to happen: a bit at position n (counting from 0) indicates that a
1969 // transpose with a group size of 2^n will take place. If multiple bits are
1970 // set, multiple transposes will happen: vdealvdd will perform them starting
1971 // with the largest group size, vshuffvdd will do them in the reverse order.
1973 // The main observation is that each 2x2 transpose corresponds to swapping
1974 // columns of bits in the binary representation of the values.
1976 // The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits
1977 // in a given column. The * denote the columns that will be swapped.
1978 // The transpose with the group size 2^n corresponds to swapping columns
1979 // 3 (the highest log) and log2(n):
1981 // 3 2 1 0 0 2 1 3 0 2 3 1
1983 // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1984 // 1 0 0 0 1 8 1 0 0 0 8 1 0 0 0 8 1 0 0 0
1985 // 2 0 0 1 0 2 0 0 1 0 1 0 0 0 1 1 0 0 0 1
1986 // 3 0 0 1 1 A 1 0 1 0 9 1 0 0 1 9 1 0 0 1
1987 // 4 0 1 0 0 4 0 1 0 0 4 0 1 0 0 2 0 0 1 0
1988 // 5 0 1 0 1 C 1 1 0 0 C 1 1 0 0 A 1 0 1 0
1989 // 6 0 1 1 0 6 0 1 1 0 5 0 1 0 1 3 0 0 1 1
1990 // 7 0 1 1 1 E 1 1 1 0 D 1 1 0 1 B 1 0 1 1
1991 // 8 1 0 0 0 1 0 0 0 1 2 0 0 1 0 4 0 1 0 0
1992 // 9 1 0 0 1 9 1 0 0 1 A 1 0 1 0 C 1 1 0 0
1993 // A 1 0 1 0 3 0 0 1 1 3 0 0 1 1 5 0 1 0 1
1994 // B 1 0 1 1 B 1 0 1 1 B 1 0 1 1 D 1 1 0 1
1995 // C 1 1 0 0 5 0 1 0 1 6 0 1 1 0 6 0 1 1 0
1996 // D 1 1 0 1 D 1 1 0 1 E 1 1 1 0 E 1 1 1 0
1997 // E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1
1998 // F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1
2000 // There is one special case that is not a perfect shuffle, but
2001 // can be turned into one easily: when the shuffle operates on
2002 // a vector pair, but the two vectors in the pair are swapped.
2003 // The code below that identifies perfect shuffles will reject
2004 // it, unless the order is reversed.
2005 SmallVector
<int,128> MaskStorage(SM
.Mask
.begin(), SM
.Mask
.end());
2006 bool InvertedPair
= false;
2007 if (HavePairs
&& SM
.Mask
[0] >= int(HwLen
)) {
2008 for (int i
= 0, e
= SM
.Mask
.size(); i
!= e
; ++i
) {
2010 MaskStorage
[i
] = M
>= int(HwLen
) ? M
-HwLen
: M
+HwLen
;
2012 InvertedPair
= true;
2014 ArrayRef
<int> LocalMask(MaskStorage
);
2016 auto XorPow2
= [] (ArrayRef
<int> Mask
, unsigned Num
) {
2017 unsigned X
= Mask
[0] ^ Mask
[Num
/2];
2018 // Check that the first half has the X's bits clear.
2019 if ((Mask
[0] & X
) != 0)
2021 for (unsigned I
= 1; I
!= Num
/2; ++I
) {
2022 if (unsigned(Mask
[I
] ^ Mask
[I
+Num
/2]) != X
)
2024 if ((Mask
[I
] & X
) != 0)
2030 // Create a vector of log2's for each column: Perm[i] corresponds to
2031 // the i-th bit (lsb is 0).
2033 for (unsigned I
= VecLen
; I
>= 2; I
>>= 1) {
2034 // Examine the initial segment of Mask of size I.
2035 unsigned X
= XorPow2(LocalMask
, I
);
2036 if (!isPowerOf2_32(X
))
2037 return OpRef::fail();
2038 // Check the other segments of Mask.
2039 for (int J
= I
; J
< VecLen
; J
+= I
) {
2040 if (XorPow2(LocalMask
.slice(J
, I
), I
) != X
)
2041 return OpRef::fail();
2043 Perm
[Log2_32(X
)] = Log2_32(I
)-1;
2046 // Once we have Perm, represent it as cycles. Denote the maximum log2
2047 // (equal to log2(VecLen)-1) as M. The cycle containing M can then be
2048 // written as (M a1 a2 a3 ... an). That cycle can be broken up into
2049 // simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
2050 // order being from left to right. Any (contiguous) segment where the
2051 // values ai, ai+1...aj are either all increasing or all decreasing,
2052 // can be implemented via a single vshuffvdd/vdealvdd respectively.
2054 // If there is a cycle (a1 a2 ... an) that does not involve M, it can
2055 // be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
2056 // then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
2057 // can be used to generate a sequence of vshuffvdd/vdealvdd.
2060 // Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
2061 // as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
2062 // (4 0 1)(4 0)(4 2 3)(4 2).
2063 // It can then be expanded into swaps as
2064 // (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
2065 // and broken up into "increasing" segments as
2066 // [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
2067 // This is equivalent to
2068 // (4 0 1)(4 0 2 3)(4 2),
2069 // which can be implemented as 3 vshufvdd instructions.
2071 using CycleType
= SmallVector
<unsigned,8>;
2072 std::set
<CycleType
> Cycles
;
2073 std::set
<unsigned> All
;
2075 for (unsigned I
: Perm
)
2078 // If the cycle contains LogLen-1, move it to the front of the cycle.
2079 // Otherwise, return the cycle unchanged.
2080 auto canonicalize
= [LogLen
](const CycleType
&C
) -> CycleType
{
2081 unsigned LogPos
, N
= C
.size();
2082 for (LogPos
= 0; LogPos
!= N
; ++LogPos
)
2083 if (C
[LogPos
] == LogLen
-1)
2088 CycleType
NewC(C
.begin()+LogPos
, C
.end());
2089 NewC
.append(C
.begin(), C
.begin()+LogPos
);
2093 auto pfs
= [](const std::set
<CycleType
> &Cs
, unsigned Len
) {
2094 // Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6),
2095 // for bytes zero is included, for halfwords is not.
2098 const CycleType
&C
= *Cs
.begin();
2101 int D
= Len
- C
.size();
2102 if (D
!= 0 && D
!= 1)
2105 bool IsDeal
= true, IsShuff
= true;
2106 for (unsigned I
= 1; I
!= Len
-D
; ++I
) {
2107 if (C
[I
] != Len
-1-I
)
2109 if (C
[I
] != I
-(1-D
)) // I-1, I
2112 // At most one, IsDeal or IsShuff, can be non-zero.
2113 assert(!(IsDeal
|| IsShuff
) || IsDeal
!= IsShuff
);
2114 static unsigned Deals
[] = { Hexagon::V6_vdealb
, Hexagon::V6_vdealh
};
2115 static unsigned Shufs
[] = { Hexagon::V6_vshuffb
, Hexagon::V6_vshuffh
};
2116 return IsDeal
? Deals
[D
] : (IsShuff
? Shufs
[D
] : 0);
2119 while (!All
.empty()) {
2120 unsigned A
= *All
.begin();
2124 for (unsigned B
= Perm
[A
]; B
!= A
; B
= Perm
[B
]) {
2130 Cycles
.insert(canonicalize(C
));
2133 MVT SingleTy
= getSingleVT(MVT::i8
);
2134 MVT PairTy
= getPairVT(MVT::i8
);
2136 // Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
2137 if (unsigned(VecLen
) == HwLen
) {
2138 if (unsigned SingleOpc
= pfs(Cycles
, LogLen
)) {
2139 Results
.push(SingleOpc
, SingleTy
, {Va
});
2140 return OpRef::res(Results
.top());
2144 // From the cycles, construct the sequence of values that will
2145 // then form the control values for vdealvdd/vshuffvdd, i.e.
2146 // (M a1 a2)(M a3 a4 a5)... -> a1 a2 a3 a4 a5
2147 // This essentially strips the M value from the cycles where
2148 // it's present, and performs the insertion of M (then stripping)
2149 // for cycles without M (as described in an earlier comment).
2150 SmallVector
<unsigned,8> SwapElems
;
2151 // When the input is extended (i.e. single vector becomes a pair),
2152 // this is done by using an "undef" vector as the second input.
2153 // However, then we get
2154 // input 1: GOODBITS
2155 // input 2: ........
2157 // input 1: ....BITS
2158 // input 2: ....GOOD
2159 // Then at the end, this needs to be undone. To accomplish this,
2160 // artificially add "LogLen-1" at both ends of the sequence.
2162 SwapElems
.push_back(LogLen
-1);
2163 for (const CycleType
&C
: Cycles
) {
2164 // Do the transformation: (a1..an) -> (M a1..an)(M a1).
2165 unsigned First
= (C
[0] == LogLen
-1) ? 1 : 0;
2166 SwapElems
.append(C
.begin()+First
, C
.end());
2168 SwapElems
.push_back(C
[0]);
2171 SwapElems
.push_back(LogLen
-1);
2173 const SDLoc
&dl(Results
.InpNode
);
2174 OpRef Arg
= HavePairs
? Va
2175 : concats(Va
, OpRef::undef(SingleTy
), Results
);
2177 Arg
= concats(OpRef::hi(Arg
), OpRef::lo(Arg
), Results
);
2179 for (unsigned I
= 0, E
= SwapElems
.size(); I
!= E
; ) {
2180 bool IsInc
= I
== E
-1 || SwapElems
[I
] < SwapElems
[I
+1];
2181 unsigned S
= (1u << SwapElems
[I
]);
2183 while (++I
< E
-1 && IsInc
== (SwapElems
[I
] < SwapElems
[I
+1]))
2184 S
|= 1u << SwapElems
[I
];
2185 // The above loop will not add a bit for the final SwapElems[I+1],
2187 S
|= 1u << SwapElems
[I
];
2192 Results
.push(Hexagon::A2_tfrsi
, MVT::i32
, {getConst32(S
, dl
)});
2193 Res
.Opc
= IsInc
? Hexagon::V6_vshuffvdd
: Hexagon::V6_vdealvdd
;
2195 Res
.Ops
= { OpRef::hi(Arg
), OpRef::lo(Arg
), OpRef::res(-1) };
2197 Arg
= OpRef::res(Results
.top());
2200 return HavePairs
? Arg
: OpRef::lo(Arg
);
2203 OpRef
HvxSelector::butterfly(ShuffleMask SM
, OpRef Va
, ResultStack
&Results
) {
2204 DEBUG_WITH_TYPE("isel", {dbgs() << __func__
<< '\n';});
2205 // Butterfly shuffles.
2211 // The assumption here is that all elements picked by Mask are in the
2212 // first operand to the vector_shuffle. This assumption is enforced
2215 MVT ResTy
= getSingleVT(MVT::i8
);
2216 PermNetwork::Controls FC
, RC
;
2217 const SDLoc
&dl(Results
.InpNode
);
2218 int VecLen
= SM
.Mask
.size();
2220 for (int M
: SM
.Mask
) {
2221 if (M
!= -1 && M
>= VecLen
)
2222 return OpRef::fail();
2225 // Try the deltas/benes for both single vectors and vector pairs.
2226 ForwardDeltaNetwork
FN(SM
.Mask
);
2228 SDValue Ctl
= getVectorConstant(FC
, dl
);
2229 Results
.push(Hexagon::V6_vdelta
, ResTy
, {Va
, OpRef(Ctl
)});
2230 return OpRef::res(Results
.top());
2233 // Try reverse delta.
2234 ReverseDeltaNetwork
RN(SM
.Mask
);
2236 SDValue Ctl
= getVectorConstant(RC
, dl
);
2237 Results
.push(Hexagon::V6_vrdelta
, ResTy
, {Va
, OpRef(Ctl
)});
2238 return OpRef::res(Results
.top());
2242 BenesNetwork
BN(SM
.Mask
);
2243 if (BN
.run(FC
, RC
)) {
2244 SDValue CtlF
= getVectorConstant(FC
, dl
);
2245 SDValue CtlR
= getVectorConstant(RC
, dl
);
2246 Results
.push(Hexagon::V6_vdelta
, ResTy
, {Va
, OpRef(CtlF
)});
2247 Results
.push(Hexagon::V6_vrdelta
, ResTy
,
2248 {OpRef::res(-1), OpRef(CtlR
)});
2249 return OpRef::res(Results
.top());
2252 return OpRef::fail();
2255 SDValue
HvxSelector::getConst32(int Val
, const SDLoc
&dl
) {
2256 return DAG
.getTargetConstant(Val
, dl
, MVT::i32
);
2259 SDValue
HvxSelector::getVectorConstant(ArrayRef
<uint8_t> Data
,
2261 SmallVector
<SDValue
, 128> Elems
;
2262 for (uint8_t C
: Data
)
2263 Elems
.push_back(DAG
.getConstant(C
, dl
, MVT::i8
));
2264 MVT VecTy
= MVT::getVectorVT(MVT::i8
, Data
.size());
2265 SDValue BV
= DAG
.getBuildVector(VecTy
, dl
, Elems
);
2266 SDValue LV
= Lower
.LowerOperation(BV
, DAG
);
2267 DAG
.RemoveDeadNode(BV
.getNode());
2268 return DAG
.getNode(HexagonISD::ISEL
, dl
, VecTy
, LV
);
2271 void HvxSelector::selectShuffle(SDNode
*N
) {
2272 DEBUG_WITH_TYPE("isel", {
2273 dbgs() << "Starting " << __func__
<< " on node:\n";
2276 MVT ResTy
= N
->getValueType(0).getSimpleVT();
2277 // Assume that vector shuffles operate on vectors of bytes.
2278 assert(ResTy
.isVector() && ResTy
.getVectorElementType() == MVT::i8
);
2280 auto *SN
= cast
<ShuffleVectorSDNode
>(N
);
2281 std::vector
<int> Mask(SN
->getMask().begin(), SN
->getMask().end());
2282 // This shouldn't really be necessary. Is it?
2283 for (int &Idx
: Mask
)
2284 if (Idx
!= -1 && Idx
< 0)
2287 unsigned VecLen
= Mask
.size();
2288 bool HavePairs
= (2*HwLen
== VecLen
);
2289 assert(ResTy
.getSizeInBits() / 8 == VecLen
);
2291 // Vd = vector_shuffle Va, Vb, Mask
2294 bool UseLeft
= false, UseRight
= false;
2295 for (unsigned I
= 0; I
!= VecLen
; ++I
) {
2298 unsigned Idx
= Mask
[I
];
2299 assert(Idx
< 2*VecLen
);
2306 DEBUG_WITH_TYPE("isel", {
2307 dbgs() << "VecLen=" << VecLen
<< " HwLen=" << HwLen
<< " UseLeft="
2308 << UseLeft
<< " UseRight=" << UseRight
<< " HavePairs="
2309 << HavePairs
<< '\n';
2311 // If the mask is all -1's, generate "undef".
2312 if (!UseLeft
&& !UseRight
) {
2313 ISel
.ReplaceNode(N
, ISel
.selectUndef(SDLoc(SN
), ResTy
).getNode());
2317 SDValue Vec0
= N
->getOperand(0);
2318 SDValue Vec1
= N
->getOperand(1);
2319 ResultStack
Results(SN
);
2320 Results
.push(TargetOpcode::COPY
, ResTy
, {Vec0
});
2321 Results
.push(TargetOpcode::COPY
, ResTy
, {Vec1
});
2322 OpRef Va
= OpRef::res(Results
.top()-1);
2323 OpRef Vb
= OpRef::res(Results
.top());
2325 OpRef Res
= !HavePairs
? shuffs2(ShuffleMask(Mask
), Va
, Vb
, Results
)
2326 : shuffp2(ShuffleMask(Mask
), Va
, Vb
, Results
);
2328 bool Done
= Res
.isValid();
2330 // Make sure that Res is on the stack before materializing.
2331 Results
.push(TargetOpcode::COPY
, ResTy
, {Res
});
2332 materialize(Results
);
2334 Done
= scalarizeShuffle(Mask
, SDLoc(N
), ResTy
, Vec0
, Vec1
, N
);
2339 dbgs() << "Unhandled shuffle:\n";
2342 llvm_unreachable("Failed to select vector shuffle");
2346 void HvxSelector::selectRor(SDNode
*N
) {
2347 // If this is a rotation by less than 8, use V6_valignbi.
2348 MVT Ty
= N
->getValueType(0).getSimpleVT();
2350 SDValue VecV
= N
->getOperand(0);
2351 SDValue RotV
= N
->getOperand(1);
2352 SDNode
*NewN
= nullptr;
2354 if (auto *CN
= dyn_cast
<ConstantSDNode
>(RotV
.getNode())) {
2355 unsigned S
= CN
->getZExtValue() % HST
.getVectorLength();
2357 NewN
= VecV
.getNode();
2358 } else if (isUInt
<3>(S
)) {
2359 NewN
= DAG
.getMachineNode(Hexagon::V6_valignbi
, dl
, Ty
,
2360 {VecV
, VecV
, getConst32(S
, dl
)});
2365 NewN
= DAG
.getMachineNode(Hexagon::V6_vror
, dl
, Ty
, {VecV
, RotV
});
2367 ISel
.ReplaceNode(N
, NewN
);
2370 void HvxSelector::selectVAlign(SDNode
*N
) {
2371 SDValue Vv
= N
->getOperand(0);
2372 SDValue Vu
= N
->getOperand(1);
2373 SDValue Rt
= N
->getOperand(2);
2374 SDNode
*NewN
= DAG
.getMachineNode(Hexagon::V6_valignb
, SDLoc(N
),
2375 N
->getValueType(0), {Vv
, Vu
, Rt
});
2376 ISel
.ReplaceNode(N
, NewN
);
2377 DAG
.RemoveDeadNode(N
);
2380 void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode
*N
) {
2381 HvxSelector(*this, *CurDAG
).selectShuffle(N
);
2384 void HexagonDAGToDAGISel::SelectHvxRor(SDNode
*N
) {
2385 HvxSelector(*this, *CurDAG
).selectRor(N
);
2388 void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode
*N
) {
2389 HvxSelector(*this, *CurDAG
).selectVAlign(N
);
2392 void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode
*N
) {
2394 SDValue Chain
= N
->getOperand(0);
2395 SDValue Address
= N
->getOperand(2);
2396 SDValue Predicate
= N
->getOperand(3);
2397 SDValue Base
= N
->getOperand(4);
2398 SDValue Modifier
= N
->getOperand(5);
2399 SDValue Offset
= N
->getOperand(6);
2402 unsigned IntNo
= cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue();
2405 llvm_unreachable("Unexpected HVX gather intrinsic.");
2406 case Intrinsic::hexagon_V6_vgathermhq
:
2407 case Intrinsic::hexagon_V6_vgathermhq_128B
:
2408 Opcode
= Hexagon::V6_vgathermhq_pseudo
;
2410 case Intrinsic::hexagon_V6_vgathermwq
:
2411 case Intrinsic::hexagon_V6_vgathermwq_128B
:
2412 Opcode
= Hexagon::V6_vgathermwq_pseudo
;
2414 case Intrinsic::hexagon_V6_vgathermhwq
:
2415 case Intrinsic::hexagon_V6_vgathermhwq_128B
:
2416 Opcode
= Hexagon::V6_vgathermhwq_pseudo
;
2420 SDVTList VTs
= CurDAG
->getVTList(MVT::Other
);
2421 SDValue Ops
[] = { Address
, Predicate
, Base
, Modifier
, Offset
, Chain
};
2422 SDNode
*Result
= CurDAG
->getMachineNode(Opcode
, dl
, VTs
, Ops
);
2424 MachineMemOperand
*MemOp
= cast
<MemIntrinsicSDNode
>(N
)->getMemOperand();
2425 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(Result
), {MemOp
});
2427 ReplaceNode(N
, Result
);
2430 void HexagonDAGToDAGISel::SelectV65Gather(SDNode
*N
) {
2432 SDValue Chain
= N
->getOperand(0);
2433 SDValue Address
= N
->getOperand(2);
2434 SDValue Base
= N
->getOperand(3);
2435 SDValue Modifier
= N
->getOperand(4);
2436 SDValue Offset
= N
->getOperand(5);
2439 unsigned IntNo
= cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue();
2442 llvm_unreachable("Unexpected HVX gather intrinsic.");
2443 case Intrinsic::hexagon_V6_vgathermh
:
2444 case Intrinsic::hexagon_V6_vgathermh_128B
:
2445 Opcode
= Hexagon::V6_vgathermh_pseudo
;
2447 case Intrinsic::hexagon_V6_vgathermw
:
2448 case Intrinsic::hexagon_V6_vgathermw_128B
:
2449 Opcode
= Hexagon::V6_vgathermw_pseudo
;
2451 case Intrinsic::hexagon_V6_vgathermhw
:
2452 case Intrinsic::hexagon_V6_vgathermhw_128B
:
2453 Opcode
= Hexagon::V6_vgathermhw_pseudo
;
2457 SDVTList VTs
= CurDAG
->getVTList(MVT::Other
);
2458 SDValue Ops
[] = { Address
, Base
, Modifier
, Offset
, Chain
};
2459 SDNode
*Result
= CurDAG
->getMachineNode(Opcode
, dl
, VTs
, Ops
);
2461 MachineMemOperand
*MemOp
= cast
<MemIntrinsicSDNode
>(N
)->getMemOperand();
2462 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(Result
), {MemOp
});
2464 ReplaceNode(N
, Result
);
2467 void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode
*N
) {
2468 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
2471 case Intrinsic::hexagon_V6_vaddcarry
: {
2472 std::array
<SDValue
, 3> Ops
= {
2473 {N
->getOperand(1), N
->getOperand(2), N
->getOperand(3)}};
2474 SDVTList VTs
= CurDAG
->getVTList(MVT::v16i32
, MVT::v64i1
);
2475 Result
= CurDAG
->getMachineNode(Hexagon::V6_vaddcarry
, SDLoc(N
), VTs
, Ops
);
2478 case Intrinsic::hexagon_V6_vaddcarry_128B
: {
2479 std::array
<SDValue
, 3> Ops
= {
2480 {N
->getOperand(1), N
->getOperand(2), N
->getOperand(3)}};
2481 SDVTList VTs
= CurDAG
->getVTList(MVT::v32i32
, MVT::v128i1
);
2482 Result
= CurDAG
->getMachineNode(Hexagon::V6_vaddcarry
, SDLoc(N
), VTs
, Ops
);
2485 case Intrinsic::hexagon_V6_vsubcarry
: {
2486 std::array
<SDValue
, 3> Ops
= {
2487 {N
->getOperand(1), N
->getOperand(2), N
->getOperand(3)}};
2488 SDVTList VTs
= CurDAG
->getVTList(MVT::v16i32
, MVT::v64i1
);
2489 Result
= CurDAG
->getMachineNode(Hexagon::V6_vsubcarry
, SDLoc(N
), VTs
, Ops
);
2492 case Intrinsic::hexagon_V6_vsubcarry_128B
: {
2493 std::array
<SDValue
, 3> Ops
= {
2494 {N
->getOperand(1), N
->getOperand(2), N
->getOperand(3)}};
2495 SDVTList VTs
= CurDAG
->getVTList(MVT::v32i32
, MVT::v128i1
);
2496 Result
= CurDAG
->getMachineNode(Hexagon::V6_vsubcarry
, SDLoc(N
), VTs
, Ops
);
2500 llvm_unreachable("Unexpected HVX dual output intrinsic.");
2502 ReplaceUses(N
, Result
);
2503 ReplaceUses(SDValue(N
, 0), SDValue(Result
, 0));
2504 ReplaceUses(SDValue(N
, 1), SDValue(Result
, 1));
2505 CurDAG
->RemoveDeadNode(N
);