1 //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file computes an optimal sequence of instructions for doing all shuffles
10 // of two 4-element vectors. With a release build and when configured to emit
11 // an altivec instruction table, this takes about 30s to run on a 2.7Ghz
14 //===----------------------------------------------------------------------===//
23 // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
24 // an element from that value of the input vectors. A value of 8 means the
25 // entry is undefined.
27 // Mask manipulation functions.
28 static inline unsigned short MakeMask(unsigned V0
, unsigned V1
,
29 unsigned V2
, unsigned V3
) {
30 return (V0
<< (3*4)) | (V1
<< (2*4)) | (V2
<< (1*4)) | (V3
<< (0*4));
33 /// getMaskElt - Return element N of the specified mask.
34 static unsigned getMaskElt(unsigned Mask
, unsigned Elt
) {
35 return (Mask
>> ((3-Elt
)*4)) & 0xF;
38 static unsigned setMaskElt(unsigned Mask
, unsigned Elt
, unsigned NewVal
) {
39 unsigned FieldShift
= ((3-Elt
)*4);
40 return (Mask
& ~(0xF << FieldShift
)) | (NewVal
<< FieldShift
);
43 // Reject elements where the values are 9-15.
44 static bool isValidMask(unsigned short Mask
) {
45 unsigned short UndefBits
= Mask
& 0x8888;
46 return (Mask
& ((UndefBits
>> 1)|(UndefBits
>>2)|(UndefBits
>>3))) == 0;
49 /// hasUndefElements - Return true if any of the elements in the mask are undefs
51 static bool hasUndefElements(unsigned short Mask
) {
52 return (Mask
& 0x8888) != 0;
55 /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
56 /// including undef values..
57 static bool isOnlyLHSMask(unsigned short Mask
) {
58 return (Mask
& 0x4444) == 0;
61 /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
62 /// refer to the LHS only (for when one argument value is passed into the same
65 static unsigned short getLHSOnlyMask(unsigned short Mask
) {
66 return Mask
& 0xBBBB; // Keep only LHS and Undefs.
70 /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
71 /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
72 static unsigned getCompressedMask(unsigned short Mask
) {
73 return getMaskElt(Mask
, 0)*9*9*9 + getMaskElt(Mask
, 1)*9*9 +
74 getMaskElt(Mask
, 2)*9 + getMaskElt(Mask
, 3);
77 static void PrintMask(unsigned i
, std::ostream
&OS
) {
78 OS
<< "<" << (char)(getMaskElt(i
, 0) == 8 ? 'u' : ('0'+getMaskElt(i
, 0)))
79 << "," << (char)(getMaskElt(i
, 1) == 8 ? 'u' : ('0'+getMaskElt(i
, 1)))
80 << "," << (char)(getMaskElt(i
, 2) == 8 ? 'u' : ('0'+getMaskElt(i
, 2)))
81 << "," << (char)(getMaskElt(i
, 3) == 8 ? 'u' : ('0'+getMaskElt(i
, 3)))
85 /// ShuffleVal - This represents a shufflevector operation.
87 Operator
*Op
; // The Operation used to generate this value.
88 unsigned Cost
; // Number of instrs used to generate this value.
89 unsigned short Arg0
, Arg1
; // Input operands for this value.
91 ShuffleVal() : Cost(1000000) {}
95 /// ShufTab - This is the actual shuffle table that we are trying to generate.
97 static ShuffleVal ShufTab
[65536];
99 /// TheOperators - All of the operators that this target supports.
100 static std::vector
<Operator
*> TheOperators
;
102 /// Operator - This is a vector operation that is available for use.
105 unsigned short ShuffleMask
;
106 unsigned short OpNum
;
109 Operator(unsigned short shufflemask
, const char *name
, unsigned opnum
,
111 : Name(name
), ShuffleMask(shufflemask
), OpNum(opnum
),Cost(cost
) {
112 TheOperators
.push_back(this);
115 assert(TheOperators
.back() == this);
116 TheOperators
.pop_back();
119 bool isOnlyLHSOperator() const {
120 return isOnlyLHSMask(ShuffleMask
);
123 const char *getName() const { return Name
; }
124 unsigned getCost() const { return Cost
; }
126 unsigned short getTransformedMask(unsigned short LHSMask
, unsigned RHSMask
) {
127 // Extract the elements from LHSMask and RHSMask, as appropriate.
129 for (unsigned i
= 0; i
!= 4; ++i
) {
130 unsigned SrcElt
= (ShuffleMask
>> (4*i
)) & 0xF;
133 ResElt
= getMaskElt(LHSMask
, SrcElt
);
135 ResElt
= getMaskElt(RHSMask
, SrcElt
-4);
137 assert(SrcElt
== 8 && "Bad src elt!");
140 Result
|= ResElt
<< (4*i
);
146 static const char *getZeroCostOpName(unsigned short Op
) {
147 if (ShufTab
[Op
].Arg0
== 0x0123)
149 else if (ShufTab
[Op
].Arg0
== 0x4567)
152 assert(0 && "bad zero cost operation");
157 static void PrintOperation(unsigned ValNo
, unsigned short Vals
[]) {
158 unsigned short ThisOp
= Vals
[ValNo
];
159 std::cerr
<< "t" << ValNo
;
160 PrintMask(ThisOp
, std::cerr
);
161 std::cerr
<< " = " << ShufTab
[ThisOp
].Op
->getName() << "(";
163 if (ShufTab
[ShufTab
[ThisOp
].Arg0
].Cost
== 0) {
164 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg0
);
165 PrintMask(ShufTab
[ThisOp
].Arg0
, std::cerr
);
167 // Figure out what tmp # it is.
168 for (unsigned i
= 0; ; ++i
)
169 if (Vals
[i
] == ShufTab
[ThisOp
].Arg0
) {
170 std::cerr
<< "t" << i
;
175 if (!ShufTab
[Vals
[ValNo
]].Op
->isOnlyLHSOperator()) {
177 if (ShufTab
[ShufTab
[ThisOp
].Arg1
].Cost
== 0) {
178 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg1
);
179 PrintMask(ShufTab
[ThisOp
].Arg1
, std::cerr
);
181 // Figure out what tmp # it is.
182 for (unsigned i
= 0; ; ++i
)
183 if (Vals
[i
] == ShufTab
[ThisOp
].Arg1
) {
184 std::cerr
<< "t" << i
;
192 static unsigned getNumEntered() {
194 for (unsigned i
= 0; i
!= 65536; ++i
)
195 Count
+= ShufTab
[i
].Cost
< 100;
199 static void EvaluateOps(unsigned short Elt
, unsigned short Vals
[],
201 if (ShufTab
[Elt
].Cost
== 0) return;
203 // If this value has already been evaluated, it is free. FIXME: match undefs.
204 for (unsigned i
= 0, e
= NumVals
; i
!= e
; ++i
)
205 if (Vals
[i
] == Elt
) return;
207 // Otherwise, get the operands of the value, then add it.
208 unsigned Arg0
= ShufTab
[Elt
].Arg0
, Arg1
= ShufTab
[Elt
].Arg1
;
209 if (ShufTab
[Arg0
].Cost
)
210 EvaluateOps(Arg0
, Vals
, NumVals
);
211 if (Arg0
!= Arg1
&& ShufTab
[Arg1
].Cost
)
212 EvaluateOps(Arg1
, Vals
, NumVals
);
214 Vals
[NumVals
++] = Elt
;
219 // Seed the table with accesses to the LHS and RHS.
220 ShufTab
[0x0123].Cost
= 0;
221 ShufTab
[0x0123].Op
= nullptr;
222 ShufTab
[0x0123].Arg0
= 0x0123;
223 ShufTab
[0x4567].Cost
= 0;
224 ShufTab
[0x4567].Op
= nullptr;
225 ShufTab
[0x4567].Arg0
= 0x4567;
227 // Seed the first-level of shuffles, shuffles whose inputs are the input to
228 // the vectorshuffle operation.
229 bool MadeChange
= true;
230 unsigned OpCount
= 0;
234 std::cerr
<< "Starting iteration #" << OpCount
<< " with "
235 << getNumEntered() << " entries established.\n";
237 // Scan the table for two reasons: First, compute the maximum cost of any
238 // operation left in the table. Second, make sure that values with undefs
239 // have the cheapest alternative that they match.
240 unsigned MaxCost
= ShufTab
[0].Cost
;
241 for (unsigned i
= 1; i
!= 0x8889; ++i
) {
242 if (!isValidMask(i
)) continue;
243 if (ShufTab
[i
].Cost
> MaxCost
)
244 MaxCost
= ShufTab
[i
].Cost
;
246 // If this value has an undef, make it be computed the cheapest possible
247 // way of any of the things that it matches.
248 if (hasUndefElements(i
)) {
249 // This code is a little bit tricky, so here's the idea: consider some
250 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
251 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
252 // there are 3 undefs, the number rises to 729 entries we have to scan,
253 // and for the 4 undef case, we have to scan the whole table.
255 // Instead of doing this huge amount of scanning, we process the table
256 // entries *in order*, and use the fact that 'u' is 8, larger than any
257 // valid index. Given an entry like 7u4u then, we only need to scan
258 // 7[0-7]4u - 8 entries. We can get away with this, because we already
259 // know that each of 704u, 714u, 724u, etc contain the minimum value of
260 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
274 unsigned MinCost
= ShufTab
[i
].Cost
;
276 // Scan the 8 entries.
277 for (unsigned j
= 0; j
!= 8; ++j
) {
278 unsigned NewElt
= setMaskElt(i
, UndefIdx
, j
);
279 if (ShufTab
[NewElt
].Cost
< MinCost
) {
280 MinCost
= ShufTab
[NewElt
].Cost
;
285 // If we found something cheaper than what was here before, use it.
288 ShufTab
[i
] = ShufTab
[MinVal
];
293 for (unsigned LHS
= 0; LHS
!= 0x8889; ++LHS
) {
294 if (!isValidMask(LHS
)) continue;
295 if (ShufTab
[LHS
].Cost
> 1000) continue;
297 // If nothing involving this operand could possibly be cheaper than what
298 // we already have, don't consider it.
299 if (ShufTab
[LHS
].Cost
+ 1 >= MaxCost
)
302 for (unsigned opnum
= 0, e
= TheOperators
.size(); opnum
!= e
; ++opnum
) {
303 Operator
*Op
= TheOperators
[opnum
];
305 // Evaluate op(LHS,LHS)
306 unsigned ResultMask
= Op
->getTransformedMask(LHS
, LHS
);
308 unsigned Cost
= ShufTab
[LHS
].Cost
+ Op
->getCost();
309 if (Cost
< ShufTab
[ResultMask
].Cost
) {
310 ShufTab
[ResultMask
].Cost
= Cost
;
311 ShufTab
[ResultMask
].Op
= Op
;
312 ShufTab
[ResultMask
].Arg0
= LHS
;
313 ShufTab
[ResultMask
].Arg1
= LHS
;
317 // If this is a two input instruction, include the op(x,y) cases. If
318 // this is a one input instruction, skip this.
319 if (Op
->isOnlyLHSOperator()) continue;
321 for (unsigned RHS
= 0; RHS
!= 0x8889; ++RHS
) {
322 if (!isValidMask(RHS
)) continue;
323 if (ShufTab
[RHS
].Cost
> 1000) continue;
325 // If nothing involving this operand could possibly be cheaper than
326 // what we already have, don't consider it.
327 if (ShufTab
[RHS
].Cost
+ 1 >= MaxCost
)
331 // Evaluate op(LHS,RHS)
332 unsigned ResultMask
= Op
->getTransformedMask(LHS
, RHS
);
334 if (ShufTab
[ResultMask
].Cost
<= OpCount
||
335 ShufTab
[ResultMask
].Cost
<= ShufTab
[LHS
].Cost
||
336 ShufTab
[ResultMask
].Cost
<= ShufTab
[RHS
].Cost
)
339 // Figure out the cost to evaluate this, knowing that CSE's only need
340 // to be evaluated once.
341 unsigned short Vals
[30];
342 unsigned NumVals
= 0;
343 EvaluateOps(LHS
, Vals
, NumVals
);
344 EvaluateOps(RHS
, Vals
, NumVals
);
346 unsigned Cost
= NumVals
+ Op
->getCost();
347 if (Cost
< ShufTab
[ResultMask
].Cost
) {
348 ShufTab
[ResultMask
].Cost
= Cost
;
349 ShufTab
[ResultMask
].Op
= Op
;
350 ShufTab
[ResultMask
].Arg0
= LHS
;
351 ShufTab
[ResultMask
].Arg1
= RHS
;
359 std::cerr
<< "Finished Table has " << getNumEntered()
360 << " entries established.\n";
362 unsigned CostArray
[10] = { 0 };
364 // Compute a cost histogram.
365 for (unsigned i
= 0; i
!= 65536; ++i
) {
366 if (!isValidMask(i
)) continue;
367 if (ShufTab
[i
].Cost
> 9)
370 ++CostArray
[ShufTab
[i
].Cost
];
373 for (unsigned i
= 0; i
!= 9; ++i
)
375 std::cout
<< "// " << CostArray
[i
] << " entries have cost " << i
<< "\n";
377 std::cout
<< "// " << CostArray
[9] << " entries have higher cost!\n";
380 // Build up the table to emit.
381 std::cout
<< "\n// This table is 6561*4 = 26244 bytes in size.\n";
382 std::cout
<< "static const unsigned PerfectShuffleTable[6561+1] = {\n";
384 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
385 if (!isValidMask(i
)) continue;
387 // CostSat - The cost of this operation saturated to two bits.
388 unsigned CostSat
= ShufTab
[i
].Cost
;
389 if (CostSat
> 4) CostSat
= 4;
390 if (CostSat
== 0) CostSat
= 1;
391 --CostSat
; // Cost is now between 0-3.
393 unsigned OpNum
= ShufTab
[i
].Op
? ShufTab
[i
].Op
->OpNum
: 0;
394 assert(OpNum
< 16 && "Too few bits to encode operation!");
396 unsigned LHS
= getCompressedMask(ShufTab
[i
].Arg0
);
397 unsigned RHS
= getCompressedMask(ShufTab
[i
].Arg1
);
399 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
400 // LHS, and 13 bits of RHS = 32 bits.
401 unsigned Val
= (CostSat
<< 30) | (OpNum
<< 26) | (LHS
<< 13) | RHS
;
403 std::cout
<< " " << std::setw(10) << Val
<< "U, // ";
404 PrintMask(i
, std::cout
);
405 std::cout
<< ": Cost " << ShufTab
[i
].Cost
;
406 std::cout
<< " " << (ShufTab
[i
].Op
? ShufTab
[i
].Op
->getName() : "copy");
408 if (ShufTab
[ShufTab
[i
].Arg0
].Cost
== 0) {
409 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg0
);
411 PrintMask(ShufTab
[i
].Arg0
, std::cout
);
414 if (ShufTab
[i
].Op
&& !ShufTab
[i
].Op
->isOnlyLHSOperator()) {
416 if (ShufTab
[ShufTab
[i
].Arg1
].Cost
== 0) {
417 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg1
);
419 PrintMask(ShufTab
[i
].Arg1
, std::cout
);
424 std::cout
<< " 0\n};\n";
427 // Print out the table.
428 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
429 if (!isValidMask(i
)) continue;
430 if (ShufTab
[i
].Cost
< 1000) {
431 PrintMask(i
, std::cerr
);
432 std::cerr
<< " - Cost " << ShufTab
[i
].Cost
<< " - ";
434 unsigned short Vals
[30];
435 unsigned NumVals
= 0;
436 EvaluateOps(i
, Vals
, NumVals
);
438 for (unsigned j
= 0, e
= NumVals
; j
!= e
; ++j
)
439 PrintOperation(j
, Vals
);
447 #ifdef GENERATE_ALTIVEC
449 ///===---------------------------------------------------------------------===//
450 /// The altivec instruction definitions. This is the altivec-specific part of
452 ///===---------------------------------------------------------------------===//
454 // Note that the opcode numbers here must match those in the PPC backend.
456 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
468 struct vmrghw
: public Operator
{
469 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW
) {}
472 struct vmrglw
: public Operator
{
473 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW
) {}
476 template<unsigned Elt
>
477 struct vspltisw
: public Operator
{
478 vspltisw(const char *N
, unsigned Opc
)
479 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
482 vspltisw
<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0
);
483 vspltisw
<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1
);
484 vspltisw
<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2
);
485 vspltisw
<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3
);
488 struct vsldoi
: public Operator
{
489 vsldoi(const char *Name
, unsigned Opc
)
490 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
494 vsldoi
<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4
);
495 vsldoi
<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8
);
496 vsldoi
<3> the_vsldoi3("vsldoi12", OP_VSLDOI12
);
500 #define GENERATE_NEON
504 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
513 OP_VUZPL
, // VUZP, left result
514 OP_VUZPR
, // VUZP, right result
515 OP_VZIPL
, // VZIP, left result
516 OP_VZIPR
, // VZIP, right result
517 OP_VTRNL
, // VTRN, left result
518 OP_VTRNR
// VTRN, right result
521 struct vrev
: public Operator
{
522 vrev() : Operator(0x1032, "vrev", OP_VREV
) {}
525 template<unsigned Elt
>
526 struct vdup
: public Operator
{
527 vdup(const char *N
, unsigned Opc
)
528 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
531 vdup
<0> the_vdup0("vdup0", OP_VDUP0
);
532 vdup
<1> the_vdup1("vdup1", OP_VDUP1
);
533 vdup
<2> the_vdup2("vdup2", OP_VDUP2
);
534 vdup
<3> the_vdup3("vdup3", OP_VDUP3
);
537 struct vext
: public Operator
{
538 vext(const char *Name
, unsigned Opc
)
539 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
543 vext
<1> the_vext1("vext1", OP_VEXT1
);
544 vext
<2> the_vext2("vext2", OP_VEXT2
);
545 vext
<3> the_vext3("vext3", OP_VEXT3
);
547 struct vuzpl
: public Operator
{
548 vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL
, 2) {}
551 struct vuzpr
: public Operator
{
552 vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR
, 2) {}
555 struct vzipl
: public Operator
{
556 vzipl() : Operator(0x0415, "vzipl", OP_VZIPL
, 2) {}
559 struct vzipr
: public Operator
{
560 vzipr() : Operator(0x2637, "vzipr", OP_VZIPR
, 2) {}
563 struct vtrnl
: public Operator
{
564 vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL
, 2) {}
567 struct vtrnr
: public Operator
{
568 vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR
, 2) {}