1 //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file computes an optimal sequence of instructions for doing all shuffles
10 // of two 4-element vectors. With a release build and when configured to emit
11 // an altivec instruction table, this takes about 30s to run on a 2.7Ghz
14 //===----------------------------------------------------------------------===//
23 #define GENERATE_NEON_INS
27 // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
28 // an element from that value of the input vectors. A value of 8 means the
29 // entry is undefined.
31 // Mask manipulation functions.
32 static inline unsigned short MakeMask(unsigned V0
, unsigned V1
,
33 unsigned V2
, unsigned V3
) {
34 return (V0
<< (3*4)) | (V1
<< (2*4)) | (V2
<< (1*4)) | (V3
<< (0*4));
37 /// getMaskElt - Return element N of the specified mask.
38 static unsigned getMaskElt(unsigned Mask
, unsigned Elt
) {
39 return (Mask
>> ((3-Elt
)*4)) & 0xF;
42 static unsigned setMaskElt(unsigned Mask
, unsigned Elt
, unsigned NewVal
) {
43 unsigned FieldShift
= ((3-Elt
)*4);
44 return (Mask
& ~(0xF << FieldShift
)) | (NewVal
<< FieldShift
);
47 // Reject elements where the values are 9-15.
48 static bool isValidMask(unsigned short Mask
) {
49 unsigned short UndefBits
= Mask
& 0x8888;
50 return (Mask
& ((UndefBits
>> 1)|(UndefBits
>>2)|(UndefBits
>>3))) == 0;
53 /// hasUndefElements - Return true if any of the elements in the mask are undefs
55 static bool hasUndefElements(unsigned short Mask
) {
56 return (Mask
& 0x8888) != 0;
59 /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
60 /// including undef values..
61 static bool isOnlyLHSMask(unsigned short Mask
) {
62 return (Mask
& 0x4444) == 0;
65 /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
66 /// refer to the LHS only (for when one argument value is passed into the same
69 static unsigned short getLHSOnlyMask(unsigned short Mask
) {
70 return Mask
& 0xBBBB; // Keep only LHS and Undefs.
74 /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
75 /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
76 static unsigned getCompressedMask(unsigned short Mask
) {
77 return getMaskElt(Mask
, 0)*9*9*9 + getMaskElt(Mask
, 1)*9*9 +
78 getMaskElt(Mask
, 2)*9 + getMaskElt(Mask
, 3);
81 static void PrintMask(unsigned i
, std::ostream
&OS
) {
82 OS
<< "<" << (char)(getMaskElt(i
, 0) == 8 ? 'u' : ('0'+getMaskElt(i
, 0)))
83 << "," << (char)(getMaskElt(i
, 1) == 8 ? 'u' : ('0'+getMaskElt(i
, 1)))
84 << "," << (char)(getMaskElt(i
, 2) == 8 ? 'u' : ('0'+getMaskElt(i
, 2)))
85 << "," << (char)(getMaskElt(i
, 3) == 8 ? 'u' : ('0'+getMaskElt(i
, 3)))
89 /// ShuffleVal - This represents a shufflevector operation.
91 Operator
*Op
; // The Operation used to generate this value.
92 unsigned Cost
; // Number of instrs used to generate this value.
93 unsigned short Arg0
, Arg1
; // Input operands for this value.
95 ShuffleVal() : Cost(1000000) {}
99 /// ShufTab - This is the actual shuffle table that we are trying to generate.
101 static ShuffleVal ShufTab
[65536];
103 /// TheOperators - All of the operators that this target supports.
104 static std::vector
<Operator
*> TheOperators
;
106 /// Operator - This is a vector operation that is available for use.
109 unsigned short ShuffleMask
;
110 unsigned short OpNum
;
113 Operator(unsigned short shufflemask
, const char *name
, unsigned opnum
,
115 : Name(name
), ShuffleMask(shufflemask
), OpNum(opnum
),Cost(cost
) {
116 TheOperators
.push_back(this);
119 assert(TheOperators
.back() == this);
120 TheOperators
.pop_back();
123 bool isOnlyLHSOperator() const {
124 return isOnlyLHSMask(ShuffleMask
);
127 const char *getName() const { return Name
; }
128 unsigned getCost() const { return Cost
; }
130 unsigned short getTransformedMask(unsigned short LHSMask
, unsigned RHSMask
) {
131 // Extract the elements from LHSMask and RHSMask, as appropriate.
133 for (unsigned i
= 0; i
!= 4; ++i
) {
134 unsigned SrcElt
= (ShuffleMask
>> (4*i
)) & 0xF;
137 ResElt
= getMaskElt(LHSMask
, SrcElt
);
139 ResElt
= getMaskElt(RHSMask
, SrcElt
-4);
141 assert(SrcElt
== 8 && "Bad src elt!");
144 Result
|= ResElt
<< (4*i
);
150 #ifdef GENERATE_NEON_INS
151 // Special case "insert" op identifier used below
152 static Operator
InsOp(0, "ins", 15, 1);
155 static const char *getZeroCostOpName(unsigned short Op
) {
156 if (ShufTab
[Op
].Arg0
== 0x0123)
158 else if (ShufTab
[Op
].Arg0
== 0x4567)
161 assert(0 && "bad zero cost operation");
166 static void PrintOperation(unsigned ValNo
, unsigned short Vals
[]) {
167 unsigned short ThisOp
= Vals
[ValNo
];
168 std::cerr
<< "t" << ValNo
;
169 PrintMask(ThisOp
, std::cerr
);
170 std::cerr
<< " = " << ShufTab
[ThisOp
].Op
->getName() << "(";
172 if (ShufTab
[ShufTab
[ThisOp
].Arg0
].Cost
== 0) {
173 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg0
);
174 PrintMask(ShufTab
[ThisOp
].Arg0
, std::cerr
);
176 // Figure out what tmp # it is.
177 for (unsigned i
= 0; ; ++i
)
178 if (Vals
[i
] == ShufTab
[ThisOp
].Arg0
) {
179 std::cerr
<< "t" << i
;
184 #ifdef GENERATE_NEON_INS
185 if (ShufTab
[ThisOp
].Op
== &InsOp
) {
186 std::cerr
<< ", lane " << ShufTab
[ThisOp
].Arg1
;
189 if (!ShufTab
[Vals
[ValNo
]].Op
->isOnlyLHSOperator()) {
191 if (ShufTab
[ShufTab
[ThisOp
].Arg1
].Cost
== 0) {
192 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg1
);
193 PrintMask(ShufTab
[ThisOp
].Arg1
, std::cerr
);
195 // Figure out what tmp # it is.
196 for (unsigned i
= 0; ; ++i
)
197 if (Vals
[i
] == ShufTab
[ThisOp
].Arg1
) {
198 std::cerr
<< "t" << i
;
206 static unsigned getNumEntered() {
208 for (unsigned i
= 0; i
!= 65536; ++i
)
209 Count
+= ShufTab
[i
].Cost
< 100;
213 static void EvaluateOps(unsigned short Elt
, unsigned short Vals
[],
215 if (ShufTab
[Elt
].Cost
== 0) return;
216 #ifdef GENERATE_NEON_INS
217 if (ShufTab
[Elt
].Op
== &InsOp
) {
218 EvaluateOps(ShufTab
[Elt
].Arg0
, Vals
, NumVals
);
219 Vals
[NumVals
++] = Elt
;
224 // If this value has already been evaluated, it is free. FIXME: match undefs.
225 for (unsigned i
= 0, e
= NumVals
; i
!= e
; ++i
)
226 if (Vals
[i
] == Elt
) return;
228 // Otherwise, get the operands of the value, then add it.
229 unsigned Arg0
= ShufTab
[Elt
].Arg0
, Arg1
= ShufTab
[Elt
].Arg1
;
230 if (ShufTab
[Arg0
].Cost
)
231 EvaluateOps(Arg0
, Vals
, NumVals
);
232 if (Arg0
!= Arg1
&& ShufTab
[Arg1
].Cost
)
233 EvaluateOps(Arg1
, Vals
, NumVals
);
235 Vals
[NumVals
++] = Elt
;
240 // Seed the table with accesses to the LHS and RHS.
241 ShufTab
[0x0123].Cost
= 0;
242 ShufTab
[0x0123].Op
= nullptr;
243 ShufTab
[0x0123].Arg0
= 0x0123;
244 ShufTab
[0x4567].Cost
= 0;
245 ShufTab
[0x4567].Op
= nullptr;
246 ShufTab
[0x4567].Arg0
= 0x4567;
248 // Seed the first-level of shuffles, shuffles whose inputs are the input to
249 // the vectorshuffle operation.
250 bool MadeChange
= true;
251 unsigned OpCount
= 0;
255 std::cerr
<< "Starting iteration #" << OpCount
<< " with "
256 << getNumEntered() << " entries established.\n";
258 // Scan the table for two reasons: First, compute the maximum cost of any
259 // operation left in the table. Second, make sure that values with undefs
260 // have the cheapest alternative that they match.
261 unsigned MaxCost
= ShufTab
[0].Cost
;
262 for (unsigned i
= 1; i
!= 0x8889; ++i
) {
263 if (!isValidMask(i
)) continue;
264 if (ShufTab
[i
].Cost
> MaxCost
)
265 MaxCost
= ShufTab
[i
].Cost
;
267 // If this value has an undef, make it be computed the cheapest possible
268 // way of any of the things that it matches.
269 if (hasUndefElements(i
)) {
270 // This code is a little bit tricky, so here's the idea: consider some
271 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
272 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
273 // there are 3 undefs, the number rises to 729 entries we have to scan,
274 // and for the 4 undef case, we have to scan the whole table.
276 // Instead of doing this huge amount of scanning, we process the table
277 // entries *in order*, and use the fact that 'u' is 8, larger than any
278 // valid index. Given an entry like 7u4u then, we only need to scan
279 // 7[0-7]4u - 8 entries. We can get away with this, because we already
280 // know that each of 704u, 714u, 724u, etc contain the minimum value of
281 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
295 unsigned MinCost
= ShufTab
[i
].Cost
;
297 // Scan the 8 entries.
298 for (unsigned j
= 0; j
!= 8; ++j
) {
299 unsigned NewElt
= setMaskElt(i
, UndefIdx
, j
);
300 if (ShufTab
[NewElt
].Cost
< MinCost
) {
301 MinCost
= ShufTab
[NewElt
].Cost
;
306 // If we found something cheaper than what was here before, use it.
309 ShufTab
[i
] = ShufTab
[MinVal
];
312 #ifdef GENERATE_NEON_INS
314 // Similarly, if we take the mask (eg 3,6,1,0) and take the cost with
315 // undef for each lane (eg u,6,1,0 or 3,u,1,0 etc), we can use a single
316 // lane insert to fixup the result.
317 for (unsigned LaneIdx
= 0; LaneIdx
< 4; LaneIdx
++) {
318 if (getMaskElt(i
, LaneIdx
) == 8)
320 unsigned NewElt
= setMaskElt(i
, LaneIdx
, 8);
321 if (ShufTab
[NewElt
].Cost
+ 1 < ShufTab
[i
].Cost
) {
323 ShufTab
[i
].Cost
= ShufTab
[NewElt
].Cost
+ 1;
324 ShufTab
[i
].Op
= &InsOp
;
325 ShufTab
[i
].Arg0
= NewElt
;
326 ShufTab
[i
].Arg1
= LaneIdx
;
330 // Similar idea for using a D register mov, masking out 2 lanes to undef
331 for (unsigned LaneIdx
= 0; LaneIdx
< 4; LaneIdx
+= 2) {
332 unsigned Ln0
= getMaskElt(i
, LaneIdx
);
333 unsigned Ln1
= getMaskElt(i
, LaneIdx
+ 1);
334 if ((Ln0
== 0 && Ln1
== 1) || (Ln0
== 2 && Ln1
== 3) ||
335 (Ln0
== 4 && Ln1
== 5) || (Ln0
== 6 && Ln1
== 7)) {
336 unsigned NewElt
= setMaskElt(i
, LaneIdx
, 8);
337 NewElt
= setMaskElt(NewElt
, LaneIdx
+ 1, 8);
338 if (ShufTab
[NewElt
].Cost
+ 1 < ShufTab
[i
].Cost
) {
340 ShufTab
[i
].Cost
= ShufTab
[NewElt
].Cost
+ 1;
341 ShufTab
[i
].Op
= &InsOp
;
342 ShufTab
[i
].Arg0
= NewElt
;
343 ShufTab
[i
].Arg1
= (LaneIdx
>> 1) | 0x4;
351 for (unsigned LHS
= 0; LHS
!= 0x8889; ++LHS
) {
352 if (!isValidMask(LHS
)) continue;
353 if (ShufTab
[LHS
].Cost
> 1000) continue;
355 // If nothing involving this operand could possibly be cheaper than what
356 // we already have, don't consider it.
357 if (ShufTab
[LHS
].Cost
+ 1 >= MaxCost
)
360 for (unsigned opnum
= 0, e
= TheOperators
.size(); opnum
!= e
; ++opnum
) {
361 Operator
*Op
= TheOperators
[opnum
];
362 #ifdef GENERATE_NEON_INS
367 // Evaluate op(LHS,LHS)
368 unsigned ResultMask
= Op
->getTransformedMask(LHS
, LHS
);
370 unsigned Cost
= ShufTab
[LHS
].Cost
+ Op
->getCost();
371 if (Cost
< ShufTab
[ResultMask
].Cost
) {
372 ShufTab
[ResultMask
].Cost
= Cost
;
373 ShufTab
[ResultMask
].Op
= Op
;
374 ShufTab
[ResultMask
].Arg0
= LHS
;
375 ShufTab
[ResultMask
].Arg1
= LHS
;
379 // If this is a two input instruction, include the op(x,y) cases. If
380 // this is a one input instruction, skip this.
381 if (Op
->isOnlyLHSOperator()) continue;
383 for (unsigned RHS
= 0; RHS
!= 0x8889; ++RHS
) {
384 if (!isValidMask(RHS
)) continue;
385 if (ShufTab
[RHS
].Cost
> 1000) continue;
387 // If nothing involving this operand could possibly be cheaper than
388 // what we already have, don't consider it.
389 if (ShufTab
[RHS
].Cost
+ 1 >= MaxCost
)
393 // Evaluate op(LHS,RHS)
394 unsigned ResultMask
= Op
->getTransformedMask(LHS
, RHS
);
396 if (ShufTab
[ResultMask
].Cost
<= OpCount
||
397 ShufTab
[ResultMask
].Cost
<= ShufTab
[LHS
].Cost
||
398 ShufTab
[ResultMask
].Cost
<= ShufTab
[RHS
].Cost
)
401 // Figure out the cost to evaluate this, knowing that CSE's only need
402 // to be evaluated once.
403 unsigned short Vals
[30];
404 unsigned NumVals
= 0;
405 EvaluateOps(LHS
, Vals
, NumVals
);
406 EvaluateOps(RHS
, Vals
, NumVals
);
408 unsigned Cost
= NumVals
+ Op
->getCost();
409 if (Cost
< ShufTab
[ResultMask
].Cost
) {
410 ShufTab
[ResultMask
].Cost
= Cost
;
411 ShufTab
[ResultMask
].Op
= Op
;
412 ShufTab
[ResultMask
].Arg0
= LHS
;
413 ShufTab
[ResultMask
].Arg1
= RHS
;
421 std::cerr
<< "Finished Table has " << getNumEntered()
422 << " entries established.\n";
424 unsigned CostArray
[10] = { 0 };
426 // Compute a cost histogram.
427 for (unsigned i
= 0; i
!= 65536; ++i
) {
428 if (!isValidMask(i
)) continue;
429 if (ShufTab
[i
].Cost
> 9)
432 ++CostArray
[ShufTab
[i
].Cost
];
435 for (unsigned i
= 0; i
!= 9; ++i
)
437 std::cout
<< "// " << CostArray
[i
] << " entries have cost " << i
<< "\n";
439 std::cout
<< "// " << CostArray
[9] << " entries have higher cost!\n";
442 // Build up the table to emit.
443 std::cout
<< "\n// This table is 6561*4 = 26244 bytes in size.\n";
444 std::cout
<< "static const unsigned PerfectShuffleTable[6561+1] = {\n";
446 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
447 if (!isValidMask(i
)) continue;
449 // CostSat - The cost of this operation saturated to two bits.
450 unsigned CostSat
= ShufTab
[i
].Cost
;
451 if (CostSat
> 4) CostSat
= 4;
452 if (CostSat
== 0) CostSat
= 1;
453 --CostSat
; // Cost is now between 0-3.
455 unsigned OpNum
= ShufTab
[i
].Op
? ShufTab
[i
].Op
->OpNum
: 0;
456 assert(OpNum
< 16 && "Too few bits to encode operation!");
458 unsigned LHS
= getCompressedMask(ShufTab
[i
].Arg0
);
459 unsigned RHS
= getCompressedMask(ShufTab
[i
].Arg1
);
461 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
462 // LHS, and 13 bits of RHS = 32 bits.
463 unsigned Val
= (CostSat
<< 30) | (OpNum
<< 26) | (LHS
<< 13) | RHS
;
465 std::cout
<< " " << std::setw(10) << Val
<< "U, // ";
466 PrintMask(i
, std::cout
);
467 std::cout
<< ": Cost " << ShufTab
[i
].Cost
;
468 std::cout
<< " " << (ShufTab
[i
].Op
? ShufTab
[i
].Op
->getName() : "copy");
470 if (ShufTab
[ShufTab
[i
].Arg0
].Cost
== 0) {
471 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg0
);
473 PrintMask(ShufTab
[i
].Arg0
, std::cout
);
476 if (ShufTab
[i
].Op
&& !ShufTab
[i
].Op
->isOnlyLHSOperator()) {
478 if (ShufTab
[ShufTab
[i
].Arg1
].Cost
== 0) {
479 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg1
);
481 PrintMask(ShufTab
[i
].Arg1
, std::cout
);
484 #ifdef GENERATE_NEON_INS
485 else if (ShufTab
[i
].Op
== &InsOp
) {
486 std::cout
<< ", lane " << ShufTab
[i
].Arg1
;
492 std::cout
<< " 0\n};\n";
495 // Print out the table.
496 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
497 if (!isValidMask(i
)) continue;
498 if (ShufTab
[i
].Cost
< 1000) {
499 PrintMask(i
, std::cerr
);
500 std::cerr
<< " - Cost " << ShufTab
[i
].Cost
<< " - ";
502 unsigned short Vals
[30];
503 unsigned NumVals
= 0;
504 EvaluateOps(i
, Vals
, NumVals
);
506 for (unsigned j
= 0, e
= NumVals
; j
!= e
; ++j
)
507 PrintOperation(j
, Vals
);
515 #ifdef GENERATE_ALTIVEC
517 ///===---------------------------------------------------------------------===//
518 /// The altivec instruction definitions. This is the altivec-specific part of
520 ///===---------------------------------------------------------------------===//
522 // Note that the opcode numbers here must match those in the PPC backend.
524 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
536 struct vmrghw
: public Operator
{
537 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW
) {}
540 struct vmrglw
: public Operator
{
541 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW
) {}
544 template<unsigned Elt
>
545 struct vspltisw
: public Operator
{
546 vspltisw(const char *N
, unsigned Opc
)
547 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
550 vspltisw
<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0
);
551 vspltisw
<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1
);
552 vspltisw
<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2
);
553 vspltisw
<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3
);
556 struct vsldoi
: public Operator
{
557 vsldoi(const char *Name
, unsigned Opc
)
558 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
562 vsldoi
<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4
);
563 vsldoi
<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8
);
564 vsldoi
<3> the_vsldoi3("vsldoi12", OP_VSLDOI12
);
570 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
579 OP_VUZPL
, // VUZP, left result
580 OP_VUZPR
, // VUZP, right result
581 OP_VZIPL
, // VZIP, left result
582 OP_VZIPR
, // VZIP, right result
583 OP_VTRNL
, // VTRN, left result
584 OP_VTRNR
// VTRN, right result
587 struct vrev
: public Operator
{
588 vrev() : Operator(0x1032, "vrev", OP_VREV
) {}
591 template<unsigned Elt
>
592 struct vdup
: public Operator
{
593 vdup(const char *N
, unsigned Opc
)
594 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
597 vdup
<0> the_vdup0("vdup0", OP_VDUP0
);
598 vdup
<1> the_vdup1("vdup1", OP_VDUP1
);
599 vdup
<2> the_vdup2("vdup2", OP_VDUP2
);
600 vdup
<3> the_vdup3("vdup3", OP_VDUP3
);
603 struct vext
: public Operator
{
604 vext(const char *Name
, unsigned Opc
)
605 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
609 vext
<1> the_vext1("vext1", OP_VEXT1
);
610 vext
<2> the_vext2("vext2", OP_VEXT2
);
611 vext
<3> the_vext3("vext3", OP_VEXT3
);
613 struct vuzpl
: public Operator
{
614 vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL
, 1) {}
617 struct vuzpr
: public Operator
{
618 vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR
, 1) {}
621 struct vzipl
: public Operator
{
622 vzipl() : Operator(0x0415, "vzipl", OP_VZIPL
, 1) {}
625 struct vzipr
: public Operator
{
626 vzipr() : Operator(0x2637, "vzipr", OP_VZIPR
, 1) {}
629 struct vtrnl
: public Operator
{
630 vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL
, 1) {}
633 struct vtrnr
: public Operator
{
634 vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR
, 1) {}