1 //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file computes an optimal sequence of instructions for doing all shuffles
11 // of two 4-element vectors. With a release build and when configured to emit
12 // an altivec instruction table, this takes about 30s to run on a 2.7Ghz
15 //===----------------------------------------------------------------------===//
24 // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
25 // an element from that value of the input vectors. A value of 8 means the
26 // entry is undefined.
28 // Mask manipulation functions.
29 static inline unsigned short MakeMask(unsigned V0
, unsigned V1
,
30 unsigned V2
, unsigned V3
) {
31 return (V0
<< (3*4)) | (V1
<< (2*4)) | (V2
<< (1*4)) | (V3
<< (0*4));
34 /// getMaskElt - Return element N of the specified mask.
35 static unsigned getMaskElt(unsigned Mask
, unsigned Elt
) {
36 return (Mask
>> ((3-Elt
)*4)) & 0xF;
39 static unsigned setMaskElt(unsigned Mask
, unsigned Elt
, unsigned NewVal
) {
40 unsigned FieldShift
= ((3-Elt
)*4);
41 return (Mask
& ~(0xF << FieldShift
)) | (NewVal
<< FieldShift
);
44 // Reject elements where the values are 9-15.
45 static bool isValidMask(unsigned short Mask
) {
46 unsigned short UndefBits
= Mask
& 0x8888;
47 return (Mask
& ((UndefBits
>> 1)|(UndefBits
>>2)|(UndefBits
>>3))) == 0;
50 /// hasUndefElements - Return true if any of the elements in the mask are undefs
52 static bool hasUndefElements(unsigned short Mask
) {
53 return (Mask
& 0x8888) != 0;
56 /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
57 /// including undef values..
58 static bool isOnlyLHSMask(unsigned short Mask
) {
59 return (Mask
& 0x4444) == 0;
62 /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
63 /// refer to the LHS only (for when one argument value is passed into the same
66 static unsigned short getLHSOnlyMask(unsigned short Mask
) {
67 return Mask
& 0xBBBB; // Keep only LHS and Undefs.
71 /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
72 /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
73 static unsigned getCompressedMask(unsigned short Mask
) {
74 return getMaskElt(Mask
, 0)*9*9*9 + getMaskElt(Mask
, 1)*9*9 +
75 getMaskElt(Mask
, 2)*9 + getMaskElt(Mask
, 3);
78 static void PrintMask(unsigned i
, std::ostream
&OS
) {
79 OS
<< "<" << (char)(getMaskElt(i
, 0) == 8 ? 'u' : ('0'+getMaskElt(i
, 0)))
80 << "," << (char)(getMaskElt(i
, 1) == 8 ? 'u' : ('0'+getMaskElt(i
, 1)))
81 << "," << (char)(getMaskElt(i
, 2) == 8 ? 'u' : ('0'+getMaskElt(i
, 2)))
82 << "," << (char)(getMaskElt(i
, 3) == 8 ? 'u' : ('0'+getMaskElt(i
, 3)))
86 /// ShuffleVal - This represents a shufflevector operation.
88 unsigned Cost
; // Number of instrs used to generate this value.
89 Operator
*Op
; // The Operation used to generate this value.
90 unsigned short Arg0
, Arg1
; // Input operands for this value.
92 ShuffleVal() : Cost(1000000) {}
96 /// ShufTab - This is the actual shuffle table that we are trying to generate.
98 static ShuffleVal ShufTab
[65536];
100 /// TheOperators - All of the operators that this target supports.
101 static std::vector
<Operator
*> TheOperators
;
103 /// Operator - This is a vector operation that is available for use.
105 unsigned short ShuffleMask
;
106 unsigned short OpNum
;
110 Operator(unsigned short shufflemask
, const char *name
, unsigned opnum
,
112 : ShuffleMask(shufflemask
), OpNum(opnum
), Name(name
), Cost(cost
) {
113 TheOperators
.push_back(this);
116 assert(TheOperators
.back() == this);
117 TheOperators
.pop_back();
120 bool isOnlyLHSOperator() const {
121 return isOnlyLHSMask(ShuffleMask
);
124 const char *getName() const { return Name
; }
125 unsigned getCost() const { return Cost
; }
127 unsigned short getTransformedMask(unsigned short LHSMask
, unsigned RHSMask
) {
128 // Extract the elements from LHSMask and RHSMask, as appropriate.
130 for (unsigned i
= 0; i
!= 4; ++i
) {
131 unsigned SrcElt
= (ShuffleMask
>> (4*i
)) & 0xF;
134 ResElt
= getMaskElt(LHSMask
, SrcElt
);
136 ResElt
= getMaskElt(RHSMask
, SrcElt
-4);
138 assert(SrcElt
== 8 && "Bad src elt!");
141 Result
|= ResElt
<< (4*i
);
147 static const char *getZeroCostOpName(unsigned short Op
) {
148 if (ShufTab
[Op
].Arg0
== 0x0123)
150 else if (ShufTab
[Op
].Arg0
== 0x4567)
153 assert(0 && "bad zero cost operation");
158 static void PrintOperation(unsigned ValNo
, unsigned short Vals
[]) {
159 unsigned short ThisOp
= Vals
[ValNo
];
160 std::cerr
<< "t" << ValNo
;
161 PrintMask(ThisOp
, std::cerr
);
162 std::cerr
<< " = " << ShufTab
[ThisOp
].Op
->getName() << "(";
164 if (ShufTab
[ShufTab
[ThisOp
].Arg0
].Cost
== 0) {
165 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg0
);
166 PrintMask(ShufTab
[ThisOp
].Arg0
, std::cerr
);
168 // Figure out what tmp # it is.
169 for (unsigned i
= 0; ; ++i
)
170 if (Vals
[i
] == ShufTab
[ThisOp
].Arg0
) {
171 std::cerr
<< "t" << i
;
176 if (!ShufTab
[Vals
[ValNo
]].Op
->isOnlyLHSOperator()) {
178 if (ShufTab
[ShufTab
[ThisOp
].Arg1
].Cost
== 0) {
179 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg1
);
180 PrintMask(ShufTab
[ThisOp
].Arg1
, std::cerr
);
182 // Figure out what tmp # it is.
183 for (unsigned i
= 0; ; ++i
)
184 if (Vals
[i
] == ShufTab
[ThisOp
].Arg1
) {
185 std::cerr
<< "t" << i
;
193 static unsigned getNumEntered() {
195 for (unsigned i
= 0; i
!= 65536; ++i
)
196 Count
+= ShufTab
[i
].Cost
< 100;
200 static void EvaluateOps(unsigned short Elt
, unsigned short Vals
[],
202 if (ShufTab
[Elt
].Cost
== 0) return;
204 // If this value has already been evaluated, it is free. FIXME: match undefs.
205 for (unsigned i
= 0, e
= NumVals
; i
!= e
; ++i
)
206 if (Vals
[i
] == Elt
) return;
208 // Otherwise, get the operands of the value, then add it.
209 unsigned Arg0
= ShufTab
[Elt
].Arg0
, Arg1
= ShufTab
[Elt
].Arg1
;
210 if (ShufTab
[Arg0
].Cost
)
211 EvaluateOps(Arg0
, Vals
, NumVals
);
212 if (Arg0
!= Arg1
&& ShufTab
[Arg1
].Cost
)
213 EvaluateOps(Arg1
, Vals
, NumVals
);
215 Vals
[NumVals
++] = Elt
;
220 // Seed the table with accesses to the LHS and RHS.
221 ShufTab
[0x0123].Cost
= 0;
222 ShufTab
[0x0123].Op
= 0;
223 ShufTab
[0x0123].Arg0
= 0x0123;
224 ShufTab
[0x4567].Cost
= 0;
225 ShufTab
[0x4567].Op
= 0;
226 ShufTab
[0x4567].Arg0
= 0x4567;
228 // Seed the first-level of shuffles, shuffles whose inputs are the input to
229 // the vectorshuffle operation.
230 bool MadeChange
= true;
231 unsigned OpCount
= 0;
235 std::cerr
<< "Starting iteration #" << OpCount
<< " with "
236 << getNumEntered() << " entries established.\n";
238 // Scan the table for two reasons: First, compute the maximum cost of any
239 // operation left in the table. Second, make sure that values with undefs
240 // have the cheapest alternative that they match.
241 unsigned MaxCost
= ShufTab
[0].Cost
;
242 for (unsigned i
= 1; i
!= 0x8889; ++i
) {
243 if (!isValidMask(i
)) continue;
244 if (ShufTab
[i
].Cost
> MaxCost
)
245 MaxCost
= ShufTab
[i
].Cost
;
247 // If this value has an undef, make it be computed the cheapest possible
248 // way of any of the things that it matches.
249 if (hasUndefElements(i
)) {
250 // This code is a little bit tricky, so here's the idea: consider some
251 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
252 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
253 // there are 3 undefs, the number rises to 729 entries we have to scan,
254 // and for the 4 undef case, we have to scan the whole table.
256 // Instead of doing this huge amount of scanning, we process the table
257 // entries *in order*, and use the fact that 'u' is 8, larger than any
258 // valid index. Given an entry like 7u4u then, we only need to scan
259 // 7[0-7]4u - 8 entries. We can get away with this, because we already
260 // know that each of 704u, 714u, 724u, etc contain the minimum value of
261 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
275 unsigned MinCost
= ShufTab
[i
].Cost
;
277 // Scan the 8 entries.
278 for (unsigned j
= 0; j
!= 8; ++j
) {
279 unsigned NewElt
= setMaskElt(i
, UndefIdx
, j
);
280 if (ShufTab
[NewElt
].Cost
< MinCost
) {
281 MinCost
= ShufTab
[NewElt
].Cost
;
286 // If we found something cheaper than what was here before, use it.
289 ShufTab
[i
] = ShufTab
[MinVal
];
294 for (unsigned LHS
= 0; LHS
!= 0x8889; ++LHS
) {
295 if (!isValidMask(LHS
)) continue;
296 if (ShufTab
[LHS
].Cost
> 1000) continue;
298 // If nothing involving this operand could possibly be cheaper than what
299 // we already have, don't consider it.
300 if (ShufTab
[LHS
].Cost
+ 1 >= MaxCost
)
303 for (unsigned opnum
= 0, e
= TheOperators
.size(); opnum
!= e
; ++opnum
) {
304 Operator
*Op
= TheOperators
[opnum
];
306 // Evaluate op(LHS,LHS)
307 unsigned ResultMask
= Op
->getTransformedMask(LHS
, LHS
);
309 unsigned Cost
= ShufTab
[LHS
].Cost
+ Op
->getCost();
310 if (Cost
< ShufTab
[ResultMask
].Cost
) {
311 ShufTab
[ResultMask
].Cost
= Cost
;
312 ShufTab
[ResultMask
].Op
= Op
;
313 ShufTab
[ResultMask
].Arg0
= LHS
;
314 ShufTab
[ResultMask
].Arg1
= LHS
;
318 // If this is a two input instruction, include the op(x,y) cases. If
319 // this is a one input instruction, skip this.
320 if (Op
->isOnlyLHSOperator()) continue;
322 for (unsigned RHS
= 0; RHS
!= 0x8889; ++RHS
) {
323 if (!isValidMask(RHS
)) continue;
324 if (ShufTab
[RHS
].Cost
> 1000) continue;
326 // If nothing involving this operand could possibly be cheaper than
327 // what we already have, don't consider it.
328 if (ShufTab
[RHS
].Cost
+ 1 >= MaxCost
)
332 // Evaluate op(LHS,RHS)
333 unsigned ResultMask
= Op
->getTransformedMask(LHS
, RHS
);
335 if (ShufTab
[ResultMask
].Cost
<= OpCount
||
336 ShufTab
[ResultMask
].Cost
<= ShufTab
[LHS
].Cost
||
337 ShufTab
[ResultMask
].Cost
<= ShufTab
[RHS
].Cost
)
340 // Figure out the cost to evaluate this, knowing that CSE's only need
341 // to be evaluated once.
342 unsigned short Vals
[30];
343 unsigned NumVals
= 0;
344 EvaluateOps(LHS
, Vals
, NumVals
);
345 EvaluateOps(RHS
, Vals
, NumVals
);
347 unsigned Cost
= NumVals
+ Op
->getCost();
348 if (Cost
< ShufTab
[ResultMask
].Cost
) {
349 ShufTab
[ResultMask
].Cost
= Cost
;
350 ShufTab
[ResultMask
].Op
= Op
;
351 ShufTab
[ResultMask
].Arg0
= LHS
;
352 ShufTab
[ResultMask
].Arg1
= RHS
;
360 std::cerr
<< "Finished Table has " << getNumEntered()
361 << " entries established.\n";
363 unsigned CostArray
[10] = { 0 };
365 // Compute a cost histogram.
366 for (unsigned i
= 0; i
!= 65536; ++i
) {
367 if (!isValidMask(i
)) continue;
368 if (ShufTab
[i
].Cost
> 9)
371 ++CostArray
[ShufTab
[i
].Cost
];
374 for (unsigned i
= 0; i
!= 9; ++i
)
376 std::cout
<< "// " << CostArray
[i
] << " entries have cost " << i
<< "\n";
378 std::cout
<< "// " << CostArray
[9] << " entries have higher cost!\n";
381 // Build up the table to emit.
382 std::cout
<< "\n// This table is 6561*4 = 26244 bytes in size.\n";
383 std::cout
<< "static const unsigned PerfectShuffleTable[6561+1] = {\n";
385 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
386 if (!isValidMask(i
)) continue;
388 // CostSat - The cost of this operation saturated to two bits.
389 unsigned CostSat
= ShufTab
[i
].Cost
;
390 if (CostSat
> 4) CostSat
= 4;
391 if (CostSat
== 0) CostSat
= 1;
392 --CostSat
; // Cost is now between 0-3.
394 unsigned OpNum
= ShufTab
[i
].Op
? ShufTab
[i
].Op
->OpNum
: 0;
395 assert(OpNum
< 16 && "Too few bits to encode operation!");
397 unsigned LHS
= getCompressedMask(ShufTab
[i
].Arg0
);
398 unsigned RHS
= getCompressedMask(ShufTab
[i
].Arg1
);
400 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
401 // LHS, and 13 bits of RHS = 32 bits.
402 unsigned Val
= (CostSat
<< 30) | (OpNum
<< 26) | (LHS
<< 13) | RHS
;
404 std::cout
<< " " << std::setw(10) << Val
<< "U, // ";
405 PrintMask(i
, std::cout
);
406 std::cout
<< ": Cost " << ShufTab
[i
].Cost
;
407 std::cout
<< " " << (ShufTab
[i
].Op
? ShufTab
[i
].Op
->getName() : "copy");
409 if (ShufTab
[ShufTab
[i
].Arg0
].Cost
== 0) {
410 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg0
);
412 PrintMask(ShufTab
[i
].Arg0
, std::cout
);
415 if (ShufTab
[i
].Op
&& !ShufTab
[i
].Op
->isOnlyLHSOperator()) {
417 if (ShufTab
[ShufTab
[i
].Arg1
].Cost
== 0) {
418 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg1
);
420 PrintMask(ShufTab
[i
].Arg1
, std::cout
);
425 std::cout
<< " 0\n};\n";
428 // Print out the table.
429 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
430 if (!isValidMask(i
)) continue;
431 if (ShufTab
[i
].Cost
< 1000) {
432 PrintMask(i
, std::cerr
);
433 std::cerr
<< " - Cost " << ShufTab
[i
].Cost
<< " - ";
435 unsigned short Vals
[30];
436 unsigned NumVals
= 0;
437 EvaluateOps(i
, Vals
, NumVals
);
439 for (unsigned j
= 0, e
= NumVals
; j
!= e
; ++j
)
440 PrintOperation(j
, Vals
);
448 #ifdef GENERATE_ALTIVEC
450 ///===---------------------------------------------------------------------===//
451 /// The altivec instruction definitions. This is the altivec-specific part of
453 ///===---------------------------------------------------------------------===//
455 // Note that the opcode numbers here must match those in the PPC backend.
457 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
469 struct vmrghw
: public Operator
{
470 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW
) {}
473 struct vmrglw
: public Operator
{
474 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW
) {}
477 template<unsigned Elt
>
478 struct vspltisw
: public Operator
{
479 vspltisw(const char *N
, unsigned Opc
)
480 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
483 vspltisw
<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0
);
484 vspltisw
<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1
);
485 vspltisw
<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2
);
486 vspltisw
<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3
);
489 struct vsldoi
: public Operator
{
490 vsldoi(const char *Name
, unsigned Opc
)
491 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
495 vsldoi
<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4
);
496 vsldoi
<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8
);
497 vsldoi
<3> the_vsldoi3("vsldoi12", OP_VSLDOI12
);
501 #define GENERATE_NEON
505 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
514 OP_VUZPL
, // VUZP, left result
515 OP_VUZPR
, // VUZP, right result
516 OP_VZIPL
, // VZIP, left result
517 OP_VZIPR
, // VZIP, right result
518 OP_VTRNL
, // VTRN, left result
519 OP_VTRNR
// VTRN, right result
522 struct vrev
: public Operator
{
523 vrev() : Operator(0x1032, "vrev", OP_VREV
) {}
526 template<unsigned Elt
>
527 struct vdup
: public Operator
{
528 vdup(const char *N
, unsigned Opc
)
529 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
532 vdup
<0> the_vdup0("vdup0", OP_VDUP0
);
533 vdup
<1> the_vdup1("vdup1", OP_VDUP1
);
534 vdup
<2> the_vdup2("vdup2", OP_VDUP2
);
535 vdup
<3> the_vdup3("vdup3", OP_VDUP3
);
538 struct vext
: public Operator
{
539 vext(const char *Name
, unsigned Opc
)
540 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
544 vext
<1> the_vext1("vext1", OP_VEXT1
);
545 vext
<2> the_vext2("vext2", OP_VEXT2
);
546 vext
<3> the_vext3("vext3", OP_VEXT3
);
548 struct vuzpl
: public Operator
{
549 vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL
, 2) {}
552 struct vuzpr
: public Operator
{
553 vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR
, 2) {}
556 struct vzipl
: public Operator
{
557 vzipl() : Operator(0x0415, "vzipl", OP_VZIPL
, 2) {}
560 struct vzipr
: public Operator
{
561 vzipr() : Operator(0x2637, "vzipr", OP_VZIPR
, 2) {}
564 struct vtrnl
: public Operator
{
565 vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL
, 2) {}
568 struct vtrnr
: public Operator
{
569 vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR
, 2) {}