[ARM] MVE predicate store patterns
[llvm-complete.git] / utils / PerfectShuffle / PerfectShuffle.cpp
blob0f2e67298404b9fc6f5f652be2979cc342e3628c
1 //===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file computes an optimal sequence of instructions for doing all shuffles
10 // of two 4-element vectors. With a release build and when configured to emit
11 // an altivec instruction table, this takes about 30s to run on a 2.7Ghz
12 // PowerPC G5.
14 //===----------------------------------------------------------------------===//
16 #include <cassert>
17 #include <cstdlib>
18 #include <iomanip>
19 #include <iostream>
20 #include <vector>
21 struct Operator;
23 // Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
24 // an element from that value of the input vectors. A value of 8 means the
25 // entry is undefined.
27 // Mask manipulation functions.
28 static inline unsigned short MakeMask(unsigned V0, unsigned V1,
29 unsigned V2, unsigned V3) {
30 return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
33 /// getMaskElt - Return element N of the specified mask.
34 static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
35 return (Mask >> ((3-Elt)*4)) & 0xF;
38 static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
39 unsigned FieldShift = ((3-Elt)*4);
40 return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
43 // Reject elements where the values are 9-15.
44 static bool isValidMask(unsigned short Mask) {
45 unsigned short UndefBits = Mask & 0x8888;
46 return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
49 /// hasUndefElements - Return true if any of the elements in the mask are undefs
50 ///
51 static bool hasUndefElements(unsigned short Mask) {
52 return (Mask & 0x8888) != 0;
55 /// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
56 /// including undef values..
57 static bool isOnlyLHSMask(unsigned short Mask) {
58 return (Mask & 0x4444) == 0;
61 /// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
62 /// refer to the LHS only (for when one argument value is passed into the same
63 /// function twice).
64 #if 0
65 static unsigned short getLHSOnlyMask(unsigned short Mask) {
66 return Mask & 0xBBBB; // Keep only LHS and Undefs.
68 #endif
70 /// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
71 /// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
72 static unsigned getCompressedMask(unsigned short Mask) {
73 return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
74 getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
77 static void PrintMask(unsigned i, std::ostream &OS) {
78 OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
79 << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
80 << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
81 << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
82 << ">";
85 /// ShuffleVal - This represents a shufflevector operation.
86 struct ShuffleVal {
87 Operator *Op; // The Operation used to generate this value.
88 unsigned Cost; // Number of instrs used to generate this value.
89 unsigned short Arg0, Arg1; // Input operands for this value.
91 ShuffleVal() : Cost(1000000) {}
95 /// ShufTab - This is the actual shuffle table that we are trying to generate.
96 ///
97 static ShuffleVal ShufTab[65536];
99 /// TheOperators - All of the operators that this target supports.
100 static std::vector<Operator*> TheOperators;
102 /// Operator - This is a vector operation that is available for use.
103 struct Operator {
104 const char *Name;
105 unsigned short ShuffleMask;
106 unsigned short OpNum;
107 unsigned Cost;
109 Operator(unsigned short shufflemask, const char *name, unsigned opnum,
110 unsigned cost = 1)
111 : Name(name), ShuffleMask(shufflemask), OpNum(opnum),Cost(cost) {
112 TheOperators.push_back(this);
114 ~Operator() {
115 assert(TheOperators.back() == this);
116 TheOperators.pop_back();
119 bool isOnlyLHSOperator() const {
120 return isOnlyLHSMask(ShuffleMask);
123 const char *getName() const { return Name; }
124 unsigned getCost() const { return Cost; }
126 unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
127 // Extract the elements from LHSMask and RHSMask, as appropriate.
128 unsigned Result = 0;
129 for (unsigned i = 0; i != 4; ++i) {
130 unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
131 unsigned ResElt;
132 if (SrcElt < 4)
133 ResElt = getMaskElt(LHSMask, SrcElt);
134 else if (SrcElt < 8)
135 ResElt = getMaskElt(RHSMask, SrcElt-4);
136 else {
137 assert(SrcElt == 8 && "Bad src elt!");
138 ResElt = 8;
140 Result |= ResElt << (4*i);
142 return Result;
146 static const char *getZeroCostOpName(unsigned short Op) {
147 if (ShufTab[Op].Arg0 == 0x0123)
148 return "LHS";
149 else if (ShufTab[Op].Arg0 == 0x4567)
150 return "RHS";
151 else {
152 assert(0 && "bad zero cost operation");
153 abort();
157 static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
158 unsigned short ThisOp = Vals[ValNo];
159 std::cerr << "t" << ValNo;
160 PrintMask(ThisOp, std::cerr);
161 std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
163 if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
164 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
165 PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
166 } else {
167 // Figure out what tmp # it is.
168 for (unsigned i = 0; ; ++i)
169 if (Vals[i] == ShufTab[ThisOp].Arg0) {
170 std::cerr << "t" << i;
171 break;
175 if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
176 std::cerr << ", ";
177 if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
178 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
179 PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
180 } else {
181 // Figure out what tmp # it is.
182 for (unsigned i = 0; ; ++i)
183 if (Vals[i] == ShufTab[ThisOp].Arg1) {
184 std::cerr << "t" << i;
185 break;
189 std::cerr << ") ";
192 static unsigned getNumEntered() {
193 unsigned Count = 0;
194 for (unsigned i = 0; i != 65536; ++i)
195 Count += ShufTab[i].Cost < 100;
196 return Count;
199 static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
200 unsigned &NumVals) {
201 if (ShufTab[Elt].Cost == 0) return;
203 // If this value has already been evaluated, it is free. FIXME: match undefs.
204 for (unsigned i = 0, e = NumVals; i != e; ++i)
205 if (Vals[i] == Elt) return;
207 // Otherwise, get the operands of the value, then add it.
208 unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
209 if (ShufTab[Arg0].Cost)
210 EvaluateOps(Arg0, Vals, NumVals);
211 if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
212 EvaluateOps(Arg1, Vals, NumVals);
214 Vals[NumVals++] = Elt;
218 int main() {
219 // Seed the table with accesses to the LHS and RHS.
220 ShufTab[0x0123].Cost = 0;
221 ShufTab[0x0123].Op = nullptr;
222 ShufTab[0x0123].Arg0 = 0x0123;
223 ShufTab[0x4567].Cost = 0;
224 ShufTab[0x4567].Op = nullptr;
225 ShufTab[0x4567].Arg0 = 0x4567;
227 // Seed the first-level of shuffles, shuffles whose inputs are the input to
228 // the vectorshuffle operation.
229 bool MadeChange = true;
230 unsigned OpCount = 0;
231 while (MadeChange) {
232 MadeChange = false;
233 ++OpCount;
234 std::cerr << "Starting iteration #" << OpCount << " with "
235 << getNumEntered() << " entries established.\n";
237 // Scan the table for two reasons: First, compute the maximum cost of any
238 // operation left in the table. Second, make sure that values with undefs
239 // have the cheapest alternative that they match.
240 unsigned MaxCost = ShufTab[0].Cost;
241 for (unsigned i = 1; i != 0x8889; ++i) {
242 if (!isValidMask(i)) continue;
243 if (ShufTab[i].Cost > MaxCost)
244 MaxCost = ShufTab[i].Cost;
246 // If this value has an undef, make it be computed the cheapest possible
247 // way of any of the things that it matches.
248 if (hasUndefElements(i)) {
249 // This code is a little bit tricky, so here's the idea: consider some
250 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we
251 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
252 // there are 3 undefs, the number rises to 729 entries we have to scan,
253 // and for the 4 undef case, we have to scan the whole table.
255 // Instead of doing this huge amount of scanning, we process the table
256 // entries *in order*, and use the fact that 'u' is 8, larger than any
257 // valid index. Given an entry like 7u4u then, we only need to scan
258 // 7[0-7]4u - 8 entries. We can get away with this, because we already
259 // know that each of 704u, 714u, 724u, etc contain the minimum value of
260 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
261 unsigned UndefIdx;
262 if (i & 0x8000)
263 UndefIdx = 0;
264 else if (i & 0x0800)
265 UndefIdx = 1;
266 else if (i & 0x0080)
267 UndefIdx = 2;
268 else if (i & 0x0008)
269 UndefIdx = 3;
270 else
271 abort();
273 unsigned MinVal = i;
274 unsigned MinCost = ShufTab[i].Cost;
276 // Scan the 8 entries.
277 for (unsigned j = 0; j != 8; ++j) {
278 unsigned NewElt = setMaskElt(i, UndefIdx, j);
279 if (ShufTab[NewElt].Cost < MinCost) {
280 MinCost = ShufTab[NewElt].Cost;
281 MinVal = NewElt;
285 // If we found something cheaper than what was here before, use it.
286 if (i != MinVal) {
287 MadeChange = true;
288 ShufTab[i] = ShufTab[MinVal];
293 for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
294 if (!isValidMask(LHS)) continue;
295 if (ShufTab[LHS].Cost > 1000) continue;
297 // If nothing involving this operand could possibly be cheaper than what
298 // we already have, don't consider it.
299 if (ShufTab[LHS].Cost + 1 >= MaxCost)
300 continue;
302 for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
303 Operator *Op = TheOperators[opnum];
305 // Evaluate op(LHS,LHS)
306 unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
308 unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
309 if (Cost < ShufTab[ResultMask].Cost) {
310 ShufTab[ResultMask].Cost = Cost;
311 ShufTab[ResultMask].Op = Op;
312 ShufTab[ResultMask].Arg0 = LHS;
313 ShufTab[ResultMask].Arg1 = LHS;
314 MadeChange = true;
317 // If this is a two input instruction, include the op(x,y) cases. If
318 // this is a one input instruction, skip this.
319 if (Op->isOnlyLHSOperator()) continue;
321 for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
322 if (!isValidMask(RHS)) continue;
323 if (ShufTab[RHS].Cost > 1000) continue;
325 // If nothing involving this operand could possibly be cheaper than
326 // what we already have, don't consider it.
327 if (ShufTab[RHS].Cost + 1 >= MaxCost)
328 continue;
331 // Evaluate op(LHS,RHS)
332 unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
334 if (ShufTab[ResultMask].Cost <= OpCount ||
335 ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
336 ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
337 continue;
339 // Figure out the cost to evaluate this, knowing that CSE's only need
340 // to be evaluated once.
341 unsigned short Vals[30];
342 unsigned NumVals = 0;
343 EvaluateOps(LHS, Vals, NumVals);
344 EvaluateOps(RHS, Vals, NumVals);
346 unsigned Cost = NumVals + Op->getCost();
347 if (Cost < ShufTab[ResultMask].Cost) {
348 ShufTab[ResultMask].Cost = Cost;
349 ShufTab[ResultMask].Op = Op;
350 ShufTab[ResultMask].Arg0 = LHS;
351 ShufTab[ResultMask].Arg1 = RHS;
352 MadeChange = true;
359 std::cerr << "Finished Table has " << getNumEntered()
360 << " entries established.\n";
362 unsigned CostArray[10] = { 0 };
364 // Compute a cost histogram.
365 for (unsigned i = 0; i != 65536; ++i) {
366 if (!isValidMask(i)) continue;
367 if (ShufTab[i].Cost > 9)
368 ++CostArray[9];
369 else
370 ++CostArray[ShufTab[i].Cost];
373 for (unsigned i = 0; i != 9; ++i)
374 if (CostArray[i])
375 std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
376 if (CostArray[9])
377 std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
380 // Build up the table to emit.
381 std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
382 std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
384 for (unsigned i = 0; i != 0x8889; ++i) {
385 if (!isValidMask(i)) continue;
387 // CostSat - The cost of this operation saturated to two bits.
388 unsigned CostSat = ShufTab[i].Cost;
389 if (CostSat > 4) CostSat = 4;
390 if (CostSat == 0) CostSat = 1;
391 --CostSat; // Cost is now between 0-3.
393 unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
394 assert(OpNum < 16 && "Too few bits to encode operation!");
396 unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
397 unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
399 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
400 // LHS, and 13 bits of RHS = 32 bits.
401 unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
403 std::cout << " " << std::setw(10) << Val << "U, // ";
404 PrintMask(i, std::cout);
405 std::cout << ": Cost " << ShufTab[i].Cost;
406 std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
407 std::cout << " ";
408 if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
409 std::cout << getZeroCostOpName(ShufTab[i].Arg0);
410 } else {
411 PrintMask(ShufTab[i].Arg0, std::cout);
414 if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
415 std::cout << ", ";
416 if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
417 std::cout << getZeroCostOpName(ShufTab[i].Arg1);
418 } else {
419 PrintMask(ShufTab[i].Arg1, std::cout);
422 std::cout << "\n";
424 std::cout << " 0\n};\n";
426 if (0) {
427 // Print out the table.
428 for (unsigned i = 0; i != 0x8889; ++i) {
429 if (!isValidMask(i)) continue;
430 if (ShufTab[i].Cost < 1000) {
431 PrintMask(i, std::cerr);
432 std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
434 unsigned short Vals[30];
435 unsigned NumVals = 0;
436 EvaluateOps(i, Vals, NumVals);
438 for (unsigned j = 0, e = NumVals; j != e; ++j)
439 PrintOperation(j, Vals);
440 std::cerr << "\n";
447 #ifdef GENERATE_ALTIVEC
449 ///===---------------------------------------------------------------------===//
450 /// The altivec instruction definitions. This is the altivec-specific part of
451 /// this file.
452 ///===---------------------------------------------------------------------===//
454 // Note that the opcode numbers here must match those in the PPC backend.
455 enum {
456 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
457 OP_VMRGHW,
458 OP_VMRGLW,
459 OP_VSPLTISW0,
460 OP_VSPLTISW1,
461 OP_VSPLTISW2,
462 OP_VSPLTISW3,
463 OP_VSLDOI4,
464 OP_VSLDOI8,
465 OP_VSLDOI12
468 struct vmrghw : public Operator {
469 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
470 } the_vmrghw;
472 struct vmrglw : public Operator {
473 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
474 } the_vmrglw;
476 template<unsigned Elt>
477 struct vspltisw : public Operator {
478 vspltisw(const char *N, unsigned Opc)
479 : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
482 vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
483 vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
484 vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
485 vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
487 template<unsigned N>
488 struct vsldoi : public Operator {
489 vsldoi(const char *Name, unsigned Opc)
490 : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
494 vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
495 vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
496 vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
498 #endif
500 #define GENERATE_NEON
502 #ifdef GENERATE_NEON
503 enum {
504 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
505 OP_VREV,
506 OP_VDUP0,
507 OP_VDUP1,
508 OP_VDUP2,
509 OP_VDUP3,
510 OP_VEXT1,
511 OP_VEXT2,
512 OP_VEXT3,
513 OP_VUZPL, // VUZP, left result
514 OP_VUZPR, // VUZP, right result
515 OP_VZIPL, // VZIP, left result
516 OP_VZIPR, // VZIP, right result
517 OP_VTRNL, // VTRN, left result
518 OP_VTRNR // VTRN, right result
521 struct vrev : public Operator {
522 vrev() : Operator(0x1032, "vrev", OP_VREV) {}
523 } the_vrev;
525 template<unsigned Elt>
526 struct vdup : public Operator {
527 vdup(const char *N, unsigned Opc)
528 : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
531 vdup<0> the_vdup0("vdup0", OP_VDUP0);
532 vdup<1> the_vdup1("vdup1", OP_VDUP1);
533 vdup<2> the_vdup2("vdup2", OP_VDUP2);
534 vdup<3> the_vdup3("vdup3", OP_VDUP3);
536 template<unsigned N>
537 struct vext : public Operator {
538 vext(const char *Name, unsigned Opc)
539 : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
543 vext<1> the_vext1("vext1", OP_VEXT1);
544 vext<2> the_vext2("vext2", OP_VEXT2);
545 vext<3> the_vext3("vext3", OP_VEXT3);
547 struct vuzpl : public Operator {
548 vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
549 } the_vuzpl;
551 struct vuzpr : public Operator {
552 vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
553 } the_vuzpr;
555 struct vzipl : public Operator {
556 vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
557 } the_vzipl;
559 struct vzipr : public Operator {
560 vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
561 } the_vzipr;
563 struct vtrnl : public Operator {
564 vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
565 } the_vtrnl;
567 struct vtrnr : public Operator {
568 vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}
569 } the_vtrnr;
571 #endif