lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
   7
   8 ===-------------------------------------------------------------------------===
   9
  10 On PPC64, this:
  11
  12 long f2 (long x) { return 0xfffffff000000000UL; }
  13 long f3 (long x) { return 0x1ffffffffUL; }
  14
  15 could compile into:
  16
  17 _f2:
  18         li r3,-1
  19         rldicr r3,r3,0,27
  20         blr
  21 _f3:
  22         li r3,-1
  23         rldicl r3,r3,0,31
  24         blr
  25
  26 we produce:
  27
  28 _f2:
  29         lis r2, 4095
  30         ori r2, r2, 65535
  31         sldi r3, r2, 36
  32         blr
  33 _f3:
  34         li r2, 1
  35         sldi r2, r2, 32
  36         oris r2, r2, 65535
  37         ori r3, r2, 65535
  38         blr
  39
  40 ===-------------------------------------------------------------------------===
  41
  42 This code:
  43
  44 unsigned add32carry(unsigned sum, unsigned x) {
  45  unsigned z = sum + x;
  46  if (sum + x < x)
  47      z++;
  48  return z;
  49 }
  50
  51 Should compile to something like:
  52
  53         addc r3,r3,r4
  54         addze r3,r3
  55
  56 instead we get:
  57
  58         add r3, r4, r3
  59         cmplw cr7, r3, r4
  60         mfcr r4 ; 1
  61         rlwinm r4, r4, 29, 31, 31
  62         add r3, r3, r4
  63
  64 Ick.
  65
  66 ===-------------------------------------------------------------------------===
  67
  68 Support 'update' load/store instructions.  These are cracked on the G5, but are
  69 still a codesize win.
  70
  71 With preinc enabled, this:
  72
  73 long *%test4(long *%X, long *%dest) {
  74         %Y = getelementptr long* %X, int 4
  75         %A = load long* %Y
  76         store long %A, long* %dest
  77         ret long* %Y
  78 }
  79
  80 compiles to:
  81
  82 _test4:
  83         mr r2, r3
  84         lwzu r5, 32(r2)
  85         lwz r3, 36(r3)
  86         stw r5, 0(r4)
  87         stw r3, 4(r4)
  88         mr r3, r2
  89         blr
  90
  91 with -sched=list-burr, I get:
  92
  93 _test4:
  94         lwz r2, 36(r3)
  95         lwzu r5, 32(r3)
  96         stw r2, 4(r4)
  97         stw r5, 0(r4)
  98         blr
  99
 100 ===-------------------------------------------------------------------------===
 101
 102 We compile the hottest inner loop of viterbi to:
 103
 104         li r6, 0
 105         b LBB1_84       ;bb432.i
 106 LBB1_83:        ;bb420.i
 107         lbzx r8, r5, r7
 108         addi r6, r7, 1
 109         stbx r8, r4, r7
 110 LBB1_84:        ;bb432.i
 111         mr r7, r6
 112         cmplwi cr0, r7, 143
 113         bne cr0, LBB1_83        ;bb420.i
 114
 115 The CBE manages to produce:
 116
 117         li r0, 143
 118         mtctr r0
 119 loop:
 120         lbzx r2, r2, r11
 121         stbx r0, r2, r9
 122         addi r2, r2, 1
 123         bdz later
 124         b loop
 125
 126 This could be much better (bdnz instead of bdz) but it still beats us.  If we
 127 produced this with bdnz, the loop would be a single dispatch group.
 128
 129 ===-------------------------------------------------------------------------===
 130
 131 Compile:
 132
 133 void foo(int *P) {
 134  if (P)  *P = 0;
 135 }
 136
 137 into:
 138
 139 _foo:
 140         cmpwi cr0,r3,0
 141         beqlr cr0
 142         li r0,0
 143         stw r0,0(r3)
 144         blr
 145
 146 This is effectively a simple form of predication.
 147
 148 ===-------------------------------------------------------------------------===
 149
 150 Lump the constant pool for each function into ONE pic object, and reference
 151 pieces of it as offsets from the start.  For functions like this (contrived
 152 to have lots of constants obviously):
 153
 154 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
 155
 156 We generate:
 157
 158 _X:
 159         lis r2, ha16(.CPI_X_0)
 160         lfd f0, lo16(.CPI_X_0)(r2)
 161         lis r2, ha16(.CPI_X_1)
 162         lfd f2, lo16(.CPI_X_1)(r2)
 163         fmadd f0, f1, f0, f2
 164         lis r2, ha16(.CPI_X_2)
 165         lfd f1, lo16(.CPI_X_2)(r2)
 166         lis r2, ha16(.CPI_X_3)
 167         lfd f2, lo16(.CPI_X_3)(r2)
 168         fmadd f1, f0, f1, f2
 169         blr
 170
 171 It would be better to materialize .CPI_X into a register, then use immediates
 172 off of the register to avoid the lis's.  This is even more important in PIC
 173 mode.
 174
 175 Note that this (and the static variable version) is discussed here for GCC:
 176 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 177
 178 Here's another example (the sgn function):
 179 double testf(double a) {
 180        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 181 }
 182
 183 it produces a BB like this:
 184 LBB1_1: ; cond_true
 185         lis r2, ha16(LCPI1_0)
 186         lfs f0, lo16(LCPI1_0)(r2)
 187         lis r2, ha16(LCPI1_1)
 188         lis r3, ha16(LCPI1_2)
 189         lfs f2, lo16(LCPI1_2)(r3)
 190         lfs f3, lo16(LCPI1_1)(r2)
 191         fsub f0, f0, f1
 192         fsel f1, f0, f2, f3
 193         blr
 194
 195 ===-------------------------------------------------------------------------===
 196
 197 PIC Code Gen IPO optimization:
 198
 199 Squish small scalar globals together into a single global struct, allowing the
 200 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 201 of the GOT on targets with one).
 202
 203 Note that this is discussed here for GCC:
 204 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 205
 206 ===-------------------------------------------------------------------------===
 207
 208 Implement Newton-Rhapson method for improving estimate instructions to the
 209 correct accuracy, and implementing divide as multiply by reciprocal when it has
 210 more than one use.  Itanium would want this too.
 211
 212 ===-------------------------------------------------------------------------===
 213
 214 Compile offsets from allocas:
 215
 216 int *%test() {
 217         %X = alloca { int, int }
 218         %Y = getelementptr {int,int}* %X, int 0, uint 1
 219         ret int* %Y
 220 }
 221
 222 into a single add, not two:
 223
 224 _test:
 225         addi r2, r1, -8
 226         addi r3, r2, 4
 227         blr
 228
 229 --> important for C++.
 230
 231 ===-------------------------------------------------------------------------===
 232
 233 No loads or stores of the constants should be needed:
 234
 235 struct foo { double X, Y; };
 236 void xxx(struct foo F);
 237 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 238
 239 ===-------------------------------------------------------------------------===
 240
 241 Darwin Stub removal:
 242
 243 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
 244 necessary when building with the Leopard (10.5) or later linker, as stubs are
 245 generated by ld when necessary.  Parameterizing this based on the deployment
 246 target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
 247 its logic.
 248
 249 ===-------------------------------------------------------------------------===
 250
 251 Darwin Stub LICM optimization:
 252
 253 Loops like this:
 254
 255   for (...)  bar();
 256
 257 Have to go through an indirect stub if bar is external or linkonce.  It would
 258 be better to compile it as:
 259
 260      fp = &bar;
 261      for (...)  fp();
 262
 263 which only computes the address of bar once (instead of each time through the
 264 stub).  This is Darwin specific and would have to be done in the code generator.
 265 Probably not a win on x86.
 266
 267 ===-------------------------------------------------------------------------===
 268
 269 Simple IPO for argument passing, change:
 270   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 271
 272 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 273 of arguments get assigned to r3 through r10. That is, if you have a function
 274 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 275 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 276 order for functions we can internalize so that the maximum number of
 277 integers/pointers get passed in regs before you see any of the fp arguments.
 278
 279 Instead of implementing this, it would actually probably be easier to just
 280 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 281 including having this work sanely.
 282
 283 ===-------------------------------------------------------------------------===
 284
 285 Fix Darwin FP-In-Integer Registers ABI
 286
 287 Darwin passes doubles in structures in integer registers, which is very very
 288 bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
 289 percolates these things out of functions.
 290
 291 Check out how horrible this is:
 292 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 293
 294 This is an extension of "interprocedural CC unmunging" that can't be done with
 295 just fastcc.
 296
 297 ===-------------------------------------------------------------------------===
 298
 299 Compile this:
 300
 301 int foo(int a) {
 302   int b = (a < 8);
 303   if (b) {
 304     return b * 3;     // ignore the fact that this is always 3.
 305   } else {
 306     return 2;
 307   }
 308 }
 309
 310 into something not this:
 311
 312 _foo:
 313 1)      cmpwi cr7, r3, 8
 314         mfcr r2, 1
 315         rlwinm r2, r2, 29, 31, 31
 316 1)      cmpwi cr0, r3, 7
 317         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 318 LBB1_1: ; then
 319         rlwinm r2, r2, 0, 31, 31
 320         mulli r3, r2, 3
 321         blr
 322 LBB1_2: ; UnifiedReturnBlock
 323         li r3, 2
 324         blr
 325
 326 In particular, the two compares (marked 1) could be shared by reversing one.
 327 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 328 same operands (but backwards) exists.  In this case, this wouldn't save us
 329 anything though, because the compares still wouldn't be shared.
 330
 331 ===-------------------------------------------------------------------------===
 332
 333 We should custom expand setcc instead of pretending that we have it.  That
 334 would allow us to expose the access of the crbit after the mfcr, allowing
 335 that access to be trivially folded into other ops.  A simple example:
 336
 337 int foo(int a, int b) { return (a < b) << 4; }
 338
 339 compiles into:
 340
 341 _foo:
 342         cmpw cr7, r3, r4
 343         mfcr r2, 1
 344         rlwinm r2, r2, 29, 31, 31
 345         slwi r3, r2, 4
 346         blr
 347
 348 ===-------------------------------------------------------------------------===
 349
 350 Fold add and sub with constant into non-extern, non-weak addresses so this:
 351
 352 static int a;
 353 void bar(int b) { a = b; }
 354 void foo(unsigned char *c) {
 355   *c = a;
 356 }
 357
 358 So that
 359
 360 _foo:
 361         lis r2, ha16(_a)
 362         la r2, lo16(_a)(r2)
 363         lbz r2, 3(r2)
 364         stb r2, 0(r3)
 365         blr
 366
 367 Becomes
 368
 369 _foo:
 370         lis r2, ha16(_a+3)
 371         lbz r2, lo16(_a+3)(r2)
 372         stb r2, 0(r3)
 373         blr
 374
 375 ===-------------------------------------------------------------------------===
 376
 377 We generate really bad code for this:
 378
 379 int f(signed char *a, _Bool b, _Bool c) {
 380    signed char t = 0;
 381   if (b)  t = *a;
 382   if (c)  *a = t;
 383 }
 384
 385 ===-------------------------------------------------------------------------===
 386
 387 This:
 388 int test(unsigned *P) { return *P >> 24; }
 389
 390 Should compile to:
 391
 392 _test:
 393         lbz r3,0(r3)
 394         blr
 395
 396 not:
 397
 398 _test:
 399         lwz r2, 0(r3)
 400         srwi r3, r2, 24
 401         blr
 402
 403 ===-------------------------------------------------------------------------===
 404
 405 On the G5, logical CR operations are more expensive in their three
 406 address form: ops that read/write the same register are half as expensive as
 407 those that read from two registers that are different from their destination.
 408
 409 We should model this with two separate instructions.  The isel should generate
 410 the "two address" form of the instructions.  When the register allocator
 411 detects that it needs to insert a copy due to the two-addresness of the CR
 412 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 413 we can convert to the "three address" instruction, to save code space.
 414
 415 This only matters when we start generating cr logical ops.
 416
 417 ===-------------------------------------------------------------------------===
 418
 419 We should compile these two functions to the same thing:
 420
 421 #include <stdlib.h>
 422 void f(int a, int b, int *P) {
 423   *P = (a-b)>=0?(a-b):(b-a);
 424 }
 425 void g(int a, int b, int *P) {
 426   *P = abs(a-b);
 427 }
 428
 429 Further, they should compile to something better than:
 430
 431 _g:
 432         subf r2, r4, r3
 433         subfic r3, r2, 0
 434         cmpwi cr0, r2, -1
 435         bgt cr0, LBB2_2 ; entry
 436 LBB2_1: ; entry
 437         mr r2, r3
 438 LBB2_2: ; entry
 439         stw r2, 0(r5)
 440         blr
 441
 442 GCC produces:
 443
 444 _g:
 445         subf r4,r4,r3
 446         srawi r2,r4,31
 447         xor r0,r2,r4
 448         subf r0,r2,r0
 449         stw r0,0(r5)
 450         blr
 451
 452 ... which is much nicer.
 453
 454 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 455
 456 ===-------------------------------------------------------------------------===
 457
 458 PR5945: This:
 459 define i32 @clamp0g(i32 %a) {
 460 entry:
 461         %cmp = icmp slt i32 %a, 0
 462         %sel = select i1 %cmp, i32 0, i32 %a
 463         ret i32 %sel
 464 }
 465
 466 Is compile to this with the PowerPC (32-bit) backend:
 467
 468 _clamp0g:
 469         cmpwi cr0, r3, 0
 470         li r2, 0
 471         blt cr0, LBB1_2
 472 ; BB#1:                                                     ; %entry
 473         mr r2, r3
 474 LBB1_2:                                                     ; %entry
 475         mr r3, r2
 476         blr
 477
 478 This could be reduced to the much simpler:
 479
 480 _clamp0g:
 481         srawi r2, r3, 31
 482         andc r3, r3, r2
 483         blr
 484
 485 ===-------------------------------------------------------------------------===
 486
 487 int foo(int N, int ***W, int **TK, int X) {
 488   int t, i;
 489
 490   for (t = 0; t < N; ++t)
 491     for (i = 0; i < 4; ++i)
 492       W[t / X][i][t % X] = TK[i][t];
 493
 494   return 5;
 495 }
 496
 497 We generate relatively atrocious code for this loop compared to gcc.
 498
 499 We could also strength reduce the rem and the div:
 500 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 501
 502 ===-------------------------------------------------------------------------===
 503
 504 float foo(float X) { return (int)(X); }
 505
 506 Currently produces:
 507
 508 _foo:
 509         fctiwz f0, f1
 510         stfd f0, -8(r1)
 511         lwz r2, -4(r1)
 512         extsw r2, r2
 513         std r2, -16(r1)
 514         lfd f0, -16(r1)
 515         fcfid f0, f0
 516         frsp f1, f0
 517         blr
 518
 519 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 520 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 521 win only.
 522
 523 ===-------------------------------------------------------------------------===
 524
 525 We generate ugly code for this:
 526
 527 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 528   unsigned code = 0;
 529   if(dx < -dw) code |= 1;
 530   if(dx > dw)  code |= 2;
 531   if(dy < -dw) code |= 4;
 532   if(dy > dw)  code |= 8;
 533   if(dz < -dw) code |= 16;
 534   if(dz > dw)  code |= 32;
 535   *ret = code;
 536 }
 537
 538 ===-------------------------------------------------------------------------===
 539
 540 Complete the signed i32 to FP conversion code using 64-bit registers
 541 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 542
 543      // FIXME: disable this lowered code.  This generates 64-bit register values,
 544      // and we don't model the fact that the top part is clobbered by calls.  We
 545      // need to flag these together so that the value isn't live across a call.
 546      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 547
 548 Also, if the registers are spilled to the stack, we have to ensure that all
 549 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 550 sounds like we need to get the 64-bit register classes going.
 551
 552 ===-------------------------------------------------------------------------===
 553
 554 %struct.B = type { i8, [3 x i8] }
 555
 556 define void @bar(%struct.B* %b) {
 557 entry:
 558         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
 559         %tmp = load i32* %tmp          ; <uint> [#uses=1]
 560         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
 561         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
 562         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
 563         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
 564         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
 565         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 566         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
 567         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 568         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
 569         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
 570         store i32 %tmp13, i32* %tmp8
 571         ret void
 572 }
 573
 574 We emit:
 575
 576 _foo:
 577         lwz r2, 0(r3)
 578         slwi r4, r2, 1
 579         or r4, r4, r2
 580         rlwimi r2, r4, 0, 0, 0
 581         stw r2, 0(r3)
 582         blr
 583
 584 We could collapse a bunch of those ORs and ANDs and generate the following
 585 equivalent code:
 586
 587 _foo:
 588         lwz r2, 0(r3)
 589         rlwinm r4, r2, 1, 0, 0
 590         or r2, r2, r4
 591         stw r2, 0(r3)
 592         blr
 593
 594 ===-------------------------------------------------------------------------===
 595
 596 We compile:
 597
 598 unsigned test6(unsigned x) {
 599   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 600 }
 601
 602 into:
 603
 604 _test6:
 605         lis r2, 255
 606         rlwinm r3, r3, 16, 0, 31
 607         ori r2, r2, 255
 608         and r3, r3, r2
 609         blr
 610
 611 GCC gets it down to:
 612
 613 _test6:
 614         rlwinm r0,r3,16,8,15
 615         rlwinm r3,r3,16,24,31
 616         or r3,r3,r0
 617         blr
 618
 619
 620 ===-------------------------------------------------------------------------===
 621
 622 Consider a function like this:
 623
 624 float foo(float X) { return X + 1234.4123f; }
 625
 626 The FP constant ends up in the constant pool, so we need to get the LR register.
 627  This ends up producing code like this:
 628
 629 _foo:
 630 .LBB_foo_0:     ; entry
 631         mflr r11
 632 ***     stw r11, 8(r1)
 633         bl "L00000$pb"
 634 "L00000$pb":
 635         mflr r2
 636         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 637         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 638         fadds f1, f1, f0
 639 ***     lwz r11, 8(r1)
 640         mtlr r11
 641         blr
 642
 643 This is functional, but there is no reason to spill the LR register all the way
 644 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 645
 646 Implementing this will require some codegen improvements.  Nate writes:
 647
 648 "So basically what we need to support the "no stack frame save and restore" is a
 649 generalization of the LR optimization to "callee-save regs".
 650
 651 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 652 that it's callee save, and spills it directly to the stack.
 653
 654 Ideally, something like this would happen:
 655
 656 LR would be in a separate register class from the GPRs. The class of LR would be
 657 marked "unspillable".  When the register allocator came across an unspillable
 658 reg, it would ask "what is the best class to copy this into that I *can* spill"
 659 If it gets a class back, which it will in this case (the gprs), it grabs a free
 660 register of that class.  If it is then later necessary to spill that reg, so be
 661 it.
 662
 663 ===-------------------------------------------------------------------------===
 664
 665 We compile this:
 666 int test(_Bool X) {
 667   return X ? 524288 : 0;
 668 }
 669
 670 to:
 671 _test:
 672         cmplwi cr0, r3, 0
 673         lis r2, 8
 674         li r3, 0
 675         beq cr0, LBB1_2 ;entry
 676 LBB1_1: ;entry
 677         mr r3, r2
 678 LBB1_2: ;entry
 679         blr
 680
 681 instead of:
 682 _test:
 683         addic r2,r3,-1
 684         subfe r0,r2,r3
 685         slwi r3,r0,19
 686         blr
 687
 688 This sort of thing occurs a lot due to globalopt.
 689
 690 ===-------------------------------------------------------------------------===
 691
 692 We compile:
 693
 694 define i32 @bar(i32 %x) nounwind readnone ssp {
 695 entry:
 696   %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
 697   %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
 698   ret i32 %neg
 699 }
 700
 701 to:
 702
 703 _bar:
 704         cntlzw r2, r3
 705         slwi r2, r2, 26
 706         srawi r3, r2, 31
 707         blr
 708
 709 it would be better to produce:
 710
 711 _bar:
 712         addic r3,r3,-1
 713         subfe r3,r3,r3
 714         blr
 715
 716 ===-------------------------------------------------------------------------===
 717
 718 We currently compile 32-bit bswap:
 719
 720 declare i32 @llvm.bswap.i32(i32 %A)
 721 define i32 @test(i32 %A) {
 722         %B = call i32 @llvm.bswap.i32(i32 %A)
 723         ret i32 %B
 724 }
 725
 726 to:
 727
 728 _test:
 729         rlwinm r2, r3, 24, 16, 23
 730         slwi r4, r3, 24
 731         rlwimi r2, r3, 8, 24, 31
 732         rlwimi r4, r3, 8, 8, 15
 733         rlwimi r4, r2, 0, 16, 31
 734         mr r3, r4
 735         blr
 736
 737 it would be more efficient to produce:
 738
 739 _foo:   mr r0,r3
 740         rlwinm r3,r3,8,0xffffffff
 741         rlwimi r3,r0,24,0,7
 742         rlwimi r3,r0,24,16,23
 743         blr
 744
 745 ===-------------------------------------------------------------------------===
 746
 747 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
 748
 749 __ZNK4llvm5APInt17countLeadingZerosEv:
 750         ld r2, 0(r3)
 751         cntlzd r2, r2
 752         or r2, r2, r2     <<-- silly.
 753         addi r3, r2, -64
 754         blr
 755
 756 The dead or is a 'truncate' from 64- to 32-bits.
 757
 758 ===-------------------------------------------------------------------------===
 759
 760 We generate horrible ppc code for this:
 761
 762 #define N  2000000
 763 double   a[N],c[N];
 764 void simpleloop() {
 765    int j;
 766    for (j=0; j<N; j++)
 767      c[j] = a[j];
 768 }
 769
 770 LBB1_1: ;bb
 771         lfdx f0, r3, r4
 772         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
 773         stfdx f0, r2, r4
 774         addi r4, r4, 8
 775
 776         xoris r6, r5, 30               ;; This is due to a large immediate.
 777         cmplwi cr0, r6, 33920
 778         bne cr0, LBB1_1
 779
 780 //===---------------------------------------------------------------------===//
 781
 782 This:
 783         #include <algorithm>
 784         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 785         { return std::make_pair(a + b, a + b < a); }
 786         bool no_overflow(unsigned a, unsigned b)
 787         { return !full_add(a, b).second; }
 788
 789 Should compile to:
 790
 791 __Z11no_overflowjj:
 792         add r4,r3,r4
 793         subfc r3,r3,r4
 794         li r3,0
 795         adde r3,r3,r3
 796         blr
 797
 798 (or better) not:
 799
 800 __Z11no_overflowjj:
 801         add r2, r4, r3
 802         cmplw cr7, r2, r3
 803         mfcr r2
 804         rlwinm r2, r2, 29, 31, 31
 805         xori r3, r2, 1
 806         blr
 807
 808 //===---------------------------------------------------------------------===//
 809
 810 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
 811 example:
 812 #include <math.h>
 813 int test(double x, double y) { return islessequal(x, y);}
 814 int test2(double x, double y) {  return islessgreater(x, y);}
 815 int test3(double x, double y) {  return !islessequal(x, y);}
 816
 817 Compiles into (all three are similar, but the bits differ):
 818
 819 _test:
 820         fcmpu cr7, f1, f2
 821         mfcr r2
 822         rlwinm r3, r2, 29, 31, 31
 823         rlwinm r2, r2, 31, 31, 31
 824         or r3, r2, r3
 825         blr
 826
 827 GCC compiles this into:
 828
 829  _test:
 830         fcmpu cr7,f1,f2
 831         cror 30,28,30
 832         mfcr r3
 833         rlwinm r3,r3,31,1
 834         blr
 835
 836 which is more efficient and can use mfocr.  See PR642 for some more context.
 837
 838 //===---------------------------------------------------------------------===//
 839
 840 void foo(float *data, float d) {
 841    long i;
 842    for (i = 0; i < 8000; i++)
 843       data[i] = d;
 844 }
 845 void foo2(float *data, float d) {
 846    long i;
 847    data--;
 848    for (i = 0; i < 8000; i++) {
 849       data[1] = d;
 850       data++;
 851    }
 852 }
 853
 854 These compile to:
 855
 856 _foo:
 857         li r2, 0
 858 LBB1_1: ; bb
 859         addi r4, r2, 4
 860         stfsx f1, r3, r2
 861         cmplwi cr0, r4, 32000
 862         mr r2, r4
 863         bne cr0, LBB1_1 ; bb
 864         blr
 865 _foo2:
 866         li r2, 0
 867 LBB2_1: ; bb
 868         addi r4, r2, 4
 869         stfsx f1, r3, r2
 870         cmplwi cr0, r4, 32000
 871         mr r2, r4
 872         bne cr0, LBB2_1 ; bb
 873         blr
 874
 875 The 'mr' could be eliminated to folding the add into the cmp better.
 876
 877 //===---------------------------------------------------------------------===//
 878 Codegen for the following (low-probability) case deteriorated considerably
 879 when the correctness fixes for unordered comparisons went in (PR 642, 58871).
 880 It should be possible to recover the code quality described in the comments.
 881
 882 ; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
 883 ; This should produce one 'or' or 'cror' instruction per function.
 884
 885 ; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
 886 ; PR2964
 887
 888 define i32 @test(double %x, double %y) nounwind  {
 889 entry:
 890         %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
 891         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 892         ret i32 %tmp345
 893 }
 894
 895 define i32 @test2(double %x, double %y) nounwind  {
 896 entry:
 897         %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
 898         %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
 899         ret i32 %tmp345
 900 }
 901
 902 define i32 @test3(double %x, double %y) nounwind  {
 903 entry:
 904         %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
 905         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
 906         ret i32 %tmp34
 907 }
 908 //===----------------------------------------------------------------------===//
 909 ; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
 910
 911 ; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
 912 ; should not be generated except with -enable-finite-only-fp-math or the like).
 913 ; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
 914 ; recognize a more elaborate tree than a simple SETxx.
 915
 916 define double @test_FNEG_sel(double %A, double %B, double %C) {
 917         %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
 918         %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
 919         %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
 920         ret double %E
 921 }
 922
 923 //===----------------------------------------------------------------------===//
 924 The save/restore sequence for CR in prolog/epilog is terrible:
 925 - Each CR subreg is saved individually, rather than doing one save as a unit.
 926 - On Darwin, the save is done after the decrement of SP, which means the offset
 927 from SP of the save slot can be too big for a store instruction, which means we
 928 need an additional register (currently hacked in 96015+96020; the solution there
 929 is correct, but poor).
 930 - On SVR4 the same thing can happen, and I don't think saving before the SP
 931 decrement is safe on that target, as there is no red zone.  This is currently
 932 broken AFAIK, although it's not a target I can exercise.
 933 The following demonstrates the problem:
 934 extern void bar(char *p);
 935 void foo() {
 936   char x[100000];
 937   bar(x);
 938   __asm__("" ::: "cr2");
 939 }