1 //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file computes an optimal sequence of instructions for doing all shuffles
11 // of two 4-element vectors. With a release build and when configured to emit
12 // an altivec instruction table, this takes about 30s to run on a 2.7Ghz
15 //===----------------------------------------------------------------------===//
23 // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
24 // an element from that value of the input vectors. A value of 8 means the
25 // entry is undefined.
27 // Mask manipulation functions.
28 static inline unsigned short MakeMask(unsigned V0
, unsigned V1
,
29 unsigned V2
, unsigned V3
) {
30 return (V0
<< (3*4)) | (V1
<< (2*4)) | (V2
<< (1*4)) | (V3
<< (0*4));
33 /// getMaskElt - Return element N of the specified mask.
34 static unsigned getMaskElt(unsigned Mask
, unsigned Elt
) {
35 return (Mask
>> ((3-Elt
)*4)) & 0xF;
38 static unsigned setMaskElt(unsigned Mask
, unsigned Elt
, unsigned NewVal
) {
39 unsigned FieldShift
= ((3-Elt
)*4);
40 return (Mask
& ~(0xF << FieldShift
)) | (NewVal
<< FieldShift
);
43 // Reject elements where the values are 9-15.
44 static bool isValidMask(unsigned short Mask
) {
45 unsigned short UndefBits
= Mask
& 0x8888;
46 return (Mask
& ((UndefBits
>> 1)|(UndefBits
>>2)|(UndefBits
>>3))) == 0;
49 /// hasUndefElements - Return true if any of the elements in the mask are undefs
51 static bool hasUndefElements(unsigned short Mask
) {
52 return (Mask
& 0x8888) != 0;
55 /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
56 /// including undef values..
57 static bool isOnlyLHSMask(unsigned short Mask
) {
58 return (Mask
& 0x4444) == 0;
61 /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
62 /// refer to the LHS only (for when one argument value is passed into the same
65 static unsigned short getLHSOnlyMask(unsigned short Mask
) {
66 return Mask
& 0xBBBB; // Keep only LHS and Undefs.
70 /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
71 /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
72 static unsigned getCompressedMask(unsigned short Mask
) {
73 return getMaskElt(Mask
, 0)*9*9*9 + getMaskElt(Mask
, 1)*9*9 +
74 getMaskElt(Mask
, 2)*9 + getMaskElt(Mask
, 3);
77 static void PrintMask(unsigned i
, std::ostream
&OS
) {
78 OS
<< "<" << (char)(getMaskElt(i
, 0) == 8 ? 'u' : ('0'+getMaskElt(i
, 0)))
79 << "," << (char)(getMaskElt(i
, 1) == 8 ? 'u' : ('0'+getMaskElt(i
, 1)))
80 << "," << (char)(getMaskElt(i
, 2) == 8 ? 'u' : ('0'+getMaskElt(i
, 2)))
81 << "," << (char)(getMaskElt(i
, 3) == 8 ? 'u' : ('0'+getMaskElt(i
, 3)))
85 /// ShuffleVal - This represents a shufflevector operation.
87 unsigned Cost
; // Number of instrs used to generate this value.
88 Operator
*Op
; // The Operation used to generate this value.
89 unsigned short Arg0
, Arg1
; // Input operands for this value.
91 ShuffleVal() : Cost(1000000) {}
95 /// ShufTab - This is the actual shuffle table that we are trying to generate.
97 static ShuffleVal ShufTab
[65536];
99 /// TheOperators - All of the operators that this target supports.
100 static std::vector
<Operator
*> TheOperators
;
102 /// Operator - This is a vector operation that is available for use.
104 unsigned short ShuffleMask
;
105 unsigned short OpNum
;
108 Operator(unsigned short shufflemask
, const char *name
, unsigned opnum
)
109 : ShuffleMask(shufflemask
), OpNum(opnum
), Name(name
) {
110 TheOperators
.push_back(this);
113 assert(TheOperators
.back() == this);
114 TheOperators
.pop_back();
117 bool isOnlyLHSOperator() const {
118 return isOnlyLHSMask(ShuffleMask
);
121 const char *getName() const { return Name
; }
123 unsigned short getTransformedMask(unsigned short LHSMask
, unsigned RHSMask
) {
124 // Extract the elements from LHSMask and RHSMask, as appropriate.
126 for (unsigned i
= 0; i
!= 4; ++i
) {
127 unsigned SrcElt
= (ShuffleMask
>> (4*i
)) & 0xF;
130 ResElt
= getMaskElt(LHSMask
, SrcElt
);
132 ResElt
= getMaskElt(RHSMask
, SrcElt
-4);
134 assert(SrcElt
== 8 && "Bad src elt!");
137 Result
|= ResElt
<< (4*i
);
143 static const char *getZeroCostOpName(unsigned short Op
) {
144 if (ShufTab
[Op
].Arg0
== 0x0123)
146 else if (ShufTab
[Op
].Arg0
== 0x4567)
149 assert(0 && "bad zero cost operation");
154 static void PrintOperation(unsigned ValNo
, unsigned short Vals
[]) {
155 unsigned short ThisOp
= Vals
[ValNo
];
156 std::cerr
<< "t" << ValNo
;
157 PrintMask(ThisOp
, std::cerr
);
158 std::cerr
<< " = " << ShufTab
[ThisOp
].Op
->getName() << "(";
160 if (ShufTab
[ShufTab
[ThisOp
].Arg0
].Cost
== 0) {
161 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg0
);
162 PrintMask(ShufTab
[ThisOp
].Arg0
, std::cerr
);
164 // Figure out what tmp # it is.
165 for (unsigned i
= 0; ; ++i
)
166 if (Vals
[i
] == ShufTab
[ThisOp
].Arg0
) {
167 std::cerr
<< "t" << i
;
172 if (!ShufTab
[Vals
[ValNo
]].Op
->isOnlyLHSOperator()) {
174 if (ShufTab
[ShufTab
[ThisOp
].Arg1
].Cost
== 0) {
175 std::cerr
<< getZeroCostOpName(ShufTab
[ThisOp
].Arg1
);
176 PrintMask(ShufTab
[ThisOp
].Arg1
, std::cerr
);
178 // Figure out what tmp # it is.
179 for (unsigned i
= 0; ; ++i
)
180 if (Vals
[i
] == ShufTab
[ThisOp
].Arg1
) {
181 std::cerr
<< "t" << i
;
189 static unsigned getNumEntered() {
191 for (unsigned i
= 0; i
!= 65536; ++i
)
192 Count
+= ShufTab
[i
].Cost
< 100;
196 static void EvaluateOps(unsigned short Elt
, unsigned short Vals
[],
198 if (ShufTab
[Elt
].Cost
== 0) return;
200 // If this value has already been evaluated, it is free. FIXME: match undefs.
201 for (unsigned i
= 0, e
= NumVals
; i
!= e
; ++i
)
202 if (Vals
[i
] == Elt
) return;
204 // Otherwise, get the operands of the value, then add it.
205 unsigned Arg0
= ShufTab
[Elt
].Arg0
, Arg1
= ShufTab
[Elt
].Arg1
;
206 if (ShufTab
[Arg0
].Cost
)
207 EvaluateOps(Arg0
, Vals
, NumVals
);
208 if (Arg0
!= Arg1
&& ShufTab
[Arg1
].Cost
)
209 EvaluateOps(Arg1
, Vals
, NumVals
);
211 Vals
[NumVals
++] = Elt
;
216 // Seed the table with accesses to the LHS and RHS.
217 ShufTab
[0x0123].Cost
= 0;
218 ShufTab
[0x0123].Op
= 0;
219 ShufTab
[0x0123].Arg0
= 0x0123;
220 ShufTab
[0x4567].Cost
= 0;
221 ShufTab
[0x4567].Op
= 0;
222 ShufTab
[0x4567].Arg0
= 0x4567;
224 // Seed the first-level of shuffles, shuffles whose inputs are the input to
225 // the vectorshuffle operation.
226 bool MadeChange
= true;
227 unsigned OpCount
= 0;
231 std::cerr
<< "Starting iteration #" << OpCount
<< " with "
232 << getNumEntered() << " entries established.\n";
234 // Scan the table for two reasons: First, compute the maximum cost of any
235 // operation left in the table. Second, make sure that values with undefs
236 // have the cheapest alternative that they match.
237 unsigned MaxCost
= ShufTab
[0].Cost
;
238 for (unsigned i
= 1; i
!= 0x8889; ++i
) {
239 if (!isValidMask(i
)) continue;
240 if (ShufTab
[i
].Cost
> MaxCost
)
241 MaxCost
= ShufTab
[i
].Cost
;
243 // If this value has an undef, make it be computed the cheapest possible
244 // way of any of the things that it matches.
245 if (hasUndefElements(i
)) {
246 // This code is a little bit tricky, so here's the idea: consider some
247 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
248 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
249 // there are 3 undefs, the number rises to 729 entries we have to scan,
250 // and for the 4 undef case, we have to scan the whole table.
252 // Instead of doing this huge amount of scanning, we process the table
253 // entries *in order*, and use the fact that 'u' is 8, larger than any
254 // valid index. Given an entry like 7u4u then, we only need to scan
255 // 7[0-7]4u - 8 entries. We can get away with this, because we already
256 // know that each of 704u, 714u, 724u, etc contain the minimum value of
257 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
271 unsigned MinCost
= ShufTab
[i
].Cost
;
273 // Scan the 8 entries.
274 for (unsigned j
= 0; j
!= 8; ++j
) {
275 unsigned NewElt
= setMaskElt(i
, UndefIdx
, j
);
276 if (ShufTab
[NewElt
].Cost
< MinCost
) {
277 MinCost
= ShufTab
[NewElt
].Cost
;
282 // If we found something cheaper than what was here before, use it.
285 ShufTab
[i
] = ShufTab
[MinVal
];
290 for (unsigned LHS
= 0; LHS
!= 0x8889; ++LHS
) {
291 if (!isValidMask(LHS
)) continue;
292 if (ShufTab
[LHS
].Cost
> 1000) continue;
294 // If nothing involving this operand could possibly be cheaper than what
295 // we already have, don't consider it.
296 if (ShufTab
[LHS
].Cost
+ 1 >= MaxCost
)
299 for (unsigned opnum
= 0, e
= TheOperators
.size(); opnum
!= e
; ++opnum
) {
300 Operator
*Op
= TheOperators
[opnum
];
302 // Evaluate op(LHS,LHS)
303 unsigned ResultMask
= Op
->getTransformedMask(LHS
, LHS
);
305 unsigned Cost
= ShufTab
[LHS
].Cost
+ 1;
306 if (Cost
< ShufTab
[ResultMask
].Cost
) {
307 ShufTab
[ResultMask
].Cost
= Cost
;
308 ShufTab
[ResultMask
].Op
= Op
;
309 ShufTab
[ResultMask
].Arg0
= LHS
;
310 ShufTab
[ResultMask
].Arg1
= LHS
;
314 // If this is a two input instruction, include the op(x,y) cases. If
315 // this is a one input instruction, skip this.
316 if (Op
->isOnlyLHSOperator()) continue;
318 for (unsigned RHS
= 0; RHS
!= 0x8889; ++RHS
) {
319 if (!isValidMask(RHS
)) continue;
320 if (ShufTab
[RHS
].Cost
> 1000) continue;
322 // If nothing involving this operand could possibly be cheaper than
323 // what we already have, don't consider it.
324 if (ShufTab
[RHS
].Cost
+ 1 >= MaxCost
)
328 // Evaluate op(LHS,RHS)
329 unsigned ResultMask
= Op
->getTransformedMask(LHS
, RHS
);
331 if (ShufTab
[ResultMask
].Cost
<= OpCount
||
332 ShufTab
[ResultMask
].Cost
<= ShufTab
[LHS
].Cost
||
333 ShufTab
[ResultMask
].Cost
<= ShufTab
[RHS
].Cost
)
336 // Figure out the cost to evaluate this, knowing that CSE's only need
337 // to be evaluated once.
338 unsigned short Vals
[30];
339 unsigned NumVals
= 0;
340 EvaluateOps(LHS
, Vals
, NumVals
);
341 EvaluateOps(RHS
, Vals
, NumVals
);
343 unsigned Cost
= NumVals
+ 1;
344 if (Cost
< ShufTab
[ResultMask
].Cost
) {
345 ShufTab
[ResultMask
].Cost
= Cost
;
346 ShufTab
[ResultMask
].Op
= Op
;
347 ShufTab
[ResultMask
].Arg0
= LHS
;
348 ShufTab
[ResultMask
].Arg1
= RHS
;
356 std::cerr
<< "Finished Table has " << getNumEntered()
357 << " entries established.\n";
359 unsigned CostArray
[10] = { 0 };
361 // Compute a cost histogram.
362 for (unsigned i
= 0; i
!= 65536; ++i
) {
363 if (!isValidMask(i
)) continue;
364 if (ShufTab
[i
].Cost
> 9)
367 ++CostArray
[ShufTab
[i
].Cost
];
370 for (unsigned i
= 0; i
!= 9; ++i
)
372 std::cout
<< "// " << CostArray
[i
] << " entries have cost " << i
<< "\n";
374 std::cout
<< "// " << CostArray
[9] << " entries have higher cost!\n";
377 // Build up the table to emit.
378 std::cout
<< "\n// This table is 6561*4 = 26244 bytes in size.\n";
379 std::cout
<< "static const unsigned PerfectShuffleTable[6561+1] = {\n";
381 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
382 if (!isValidMask(i
)) continue;
384 // CostSat - The cost of this operation saturated to two bits.
385 unsigned CostSat
= ShufTab
[i
].Cost
;
386 if (CostSat
> 4) CostSat
= 4;
387 if (CostSat
== 0) CostSat
= 1;
388 --CostSat
; // Cost is now between 0-3.
390 unsigned OpNum
= ShufTab
[i
].Op
? ShufTab
[i
].Op
->OpNum
: 0;
391 assert(OpNum
< 16 && "Too few bits to encode operation!");
393 unsigned LHS
= getCompressedMask(ShufTab
[i
].Arg0
);
394 unsigned RHS
= getCompressedMask(ShufTab
[i
].Arg1
);
396 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
397 // LHS, and 13 bits of RHS = 32 bits.
398 unsigned Val
= (CostSat
<< 30) | (OpNum
<< 26) | (LHS
<< 13) | RHS
;
400 std::cout
<< " " << Val
<< "U,\t// ";
401 PrintMask(i
, std::cout
);
402 std::cout
<< ": Cost " << ShufTab
[i
].Cost
;
403 std::cout
<< " " << (ShufTab
[i
].Op
? ShufTab
[i
].Op
->getName() : "copy");
405 if (ShufTab
[ShufTab
[i
].Arg0
].Cost
== 0) {
406 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg0
);
408 PrintMask(ShufTab
[i
].Arg0
, std::cout
);
411 if (ShufTab
[i
].Op
&& !ShufTab
[i
].Op
->isOnlyLHSOperator()) {
413 if (ShufTab
[ShufTab
[i
].Arg1
].Cost
== 0) {
414 std::cout
<< getZeroCostOpName(ShufTab
[i
].Arg1
);
416 PrintMask(ShufTab
[i
].Arg1
, std::cout
);
421 std::cout
<< " 0\n};\n";
424 // Print out the table.
425 for (unsigned i
= 0; i
!= 0x8889; ++i
) {
426 if (!isValidMask(i
)) continue;
427 if (ShufTab
[i
].Cost
< 1000) {
428 PrintMask(i
, std::cerr
);
429 std::cerr
<< " - Cost " << ShufTab
[i
].Cost
<< " - ";
431 unsigned short Vals
[30];
432 unsigned NumVals
= 0;
433 EvaluateOps(i
, Vals
, NumVals
);
435 for (unsigned j
= 0, e
= NumVals
; j
!= e
; ++j
)
436 PrintOperation(j
, Vals
);
444 #define GENERATE_ALTIVEC
446 #ifdef GENERATE_ALTIVEC
448 ///===---------------------------------------------------------------------===//
449 /// The altivec instruction definitions. This is the altivec-specific part of
451 ///===---------------------------------------------------------------------===//
453 // Note that the opcode numbers here must match those in the PPC backend.
455 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
467 struct vmrghw
: public Operator
{
468 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW
) {}
471 struct vmrglw
: public Operator
{
472 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW
) {}
475 template<unsigned Elt
>
476 struct vspltisw
: public Operator
{
477 vspltisw(const char *N
, unsigned Opc
)
478 : Operator(MakeMask(Elt
, Elt
, Elt
, Elt
), N
, Opc
) {}
481 vspltisw
<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0
);
482 vspltisw
<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1
);
483 vspltisw
<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2
);
484 vspltisw
<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3
);
487 struct vsldoi
: public Operator
{
488 vsldoi(const char *Name
, unsigned Opc
)
489 : Operator(MakeMask(N
&7, (N
+1)&7, (N
+2)&7, (N
+3)&7), Name
, Opc
) {
493 vsldoi
<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4
);
494 vsldoi
<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8
);
495 vsldoi
<3> the_vsldoi3("vsldoi12", OP_VSLDOI12
);