lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 We should add support for the "movbe" instruction, which does a byte-swapping
   6 copy (3-addr bswap + memory support?)  This is available on Atom processors.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  11 backend knows how to three-addressify this shift, but it appears the register
  12 allocator isn't even asking it to do so in this case.  We should investigate
  13 why this isn't happening, it could have significant impact on other important
  14 cases for X86 as well.
  15
  16 //===---------------------------------------------------------------------===//
  17
  18 This should be one DIV/IDIV instruction, not a libcall:
  19
  20 unsigned test(unsigned long long X, unsigned Y) {
  21         return X/Y;
  22 }
  23
  24 This can be done trivially with a custom legalizer.  What about overflow
  25 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  26
  27 //===---------------------------------------------------------------------===//
  28
  29 Improvements to the multiply -> shift/add algorithm:
  30 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  35 long long foo(int x) { return 1LL << x; }
  36
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  38 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  39 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  40
  41 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  42
  43 One better solution for 1LL << x is:
  44         xorl    %eax, %eax
  45         xorl    %edx, %edx
  46         testb   $32, %cl
  47         sete    %al
  48         setne   %dl
  49         sall    %cl, %eax
  50         sall    %cl, %edx
  51
  52 But that requires good 8-bit subreg support.
  53
  54 Also, this might be better.  It's an extra shift, but it's one instruction
  55 shorter, and doesn't stress 8-bit subreg support.
  56 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  57 but without the unnecessary and.)
  58         movl %ecx, %eax
  59         shrl $5, %eax
  60         movl %eax, %edx
  61         xorl $1, %edx
  62         sall %cl, %eax
  63         sall %cl. %edx
  64
  65 64-bit shifts (in general) expand to really bad code.  Instead of using
  66 cmovs, we should expand to a conditional branch like GCC produces.
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 Some isel ideas:
  71
  72 1. Dynamic programming based approach when compile time if not an
  73    issue.
  74 2. Code duplication (addressing mode) during isel.
  75 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  76    Sequencing of Instructions".
  77 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  78    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  79    and other related papers.
  80    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 Should we promote i16 to i32 to avoid partial register update stalls?
  85
  86 //===---------------------------------------------------------------------===//
  87
  88 Leave any_extend as pseudo instruction and hint to register
  89 allocator. Delay codegen until post register allocation.
  90 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  91 the coalescer how to deal with it though.
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 It appears icc use push for parameter passing. Need to investigate.
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 This:
 100
 101 void foo(void);
 102 void bar(int x, int *P) {
 103   x >>= 2;
 104   if (x)
 105     foo();
 106   *P = x;
 107 }
 108
 109 compiles into:
 110
 111         movq    %rsi, %rbx
 112         movl    %edi, %r14d
 113         sarl    $2, %r14d
 114         testl   %r14d, %r14d
 115         je      LBB0_2
 116
 117 Instead of doing an explicit test, we can use the flags off the sar.  This
 118 occurs in a bigger testcase like this, which is pretty common:
 119
 120 #include <vector>
 121 int test1(std::vector<int> &X) {
 122   int Sum = 0;
 123   for (long i = 0, e = X.size(); i != e; ++i)
 124     X[i] = 0;
 125   return Sum;
 126 }
 127
 128 //===---------------------------------------------------------------------===//
 129
 130 Only use inc/neg/not instructions on processors where they are faster than
 131 add/sub/xor.  They are slower on the P4 due to only updating some processor
 132 flags.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 The instruction selector sometimes misses folding a load into a compare.  The
 137 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 138 commutative, it is not matched with the load on both sides.  The dag combiner
 139 should be made smart enough to cannonicalize the load into the RHS of a compare
 140 when it can invert the result of the compare for free.
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 In many cases, LLVM generates code like this:
 145
 146 _test:
 147         movl 8(%esp), %eax
 148         cmpl %eax, 4(%esp)
 149         setl %al
 150         movzbl %al, %eax
 151         ret
 152
 153 on some processors (which ones?), it is more efficient to do this:
 154
 155 _test:
 156         movl 8(%esp), %ebx
 157         xor  %eax, %eax
 158         cmpl %ebx, 4(%esp)
 159         setl %al
 160         ret
 161
 162 Doing this correctly is tricky though, as the xor clobbers the flags.
 163
 164 //===---------------------------------------------------------------------===//
 165
 166 We should generate bts/btr/etc instructions on targets where they are cheap or
 167 when codesize is important.  e.g., for:
 168
 169 void setbit(int *target, int bit) {
 170     *target |= (1 << bit);
 171 }
 172 void clearbit(int *target, int bit) {
 173     *target &= ~(1 << bit);
 174 }
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 Instead of the following for memset char*, 1, 10:
 179
 180         movl $16843009, 4(%edx)
 181         movl $16843009, (%edx)
 182         movw $257, 8(%edx)
 183
 184 It might be better to generate
 185
 186         movl $16843009, %eax
 187         movl %eax, 4(%edx)
 188         movl %eax, (%edx)
 189         movw al, 8(%edx)
 190
 191 when we can spare a register. It reduces code size.
 192
 193 //===---------------------------------------------------------------------===//
 194
 195 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 196 get this:
 197
 198 define i32 @test1(i32 %X) {
 199     %Y = sdiv i32 %X, 8
 200     ret i32 %Y
 201 }
 202
 203 _test1:
 204         movl 4(%esp), %eax
 205         movl %eax, %ecx
 206         sarl $31, %ecx
 207         shrl $29, %ecx
 208         addl %ecx, %eax
 209         sarl $3, %eax
 210         ret
 211
 212 GCC knows several different ways to codegen it, one of which is this:
 213
 214 _test1:
 215         movl    4(%esp), %eax
 216         cmpl    $-1, %eax
 217         leal    7(%eax), %ecx
 218         cmovle  %ecx, %eax
 219         sarl    $3, %eax
 220         ret
 221
 222 which is probably slower, but it's interesting at least :)
 223
 224 //===---------------------------------------------------------------------===//
 225
 226 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 227 We should leave these as libcalls for everything over a much lower threshold,
 228 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 229 stores, TLB preheating, etc)
 230
 231 //===---------------------------------------------------------------------===//
 232
 233 Optimize this into something reasonable:
 234  x * copysign(1.0, y) * copysign(1.0, z)
 235
 236 //===---------------------------------------------------------------------===//
 237
 238 Optimize copysign(x, *y) to use an integer load from y.
 239
 240 //===---------------------------------------------------------------------===//
 241
 242 The following tests perform worse with LSR:
 243
 244 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 245
 246 //===---------------------------------------------------------------------===//
 247
 248 Adding to the list of cmp / test poor codegen issues:
 249
 250 int test(__m128 *A, __m128 *B) {
 251   if (_mm_comige_ss(*A, *B))
 252     return 3;
 253   else
 254     return 4;
 255 }
 256
 257 _test:
 258         movl 8(%esp), %eax
 259         movaps (%eax), %xmm0
 260         movl 4(%esp), %eax
 261         movaps (%eax), %xmm1
 262         comiss %xmm0, %xmm1
 263         setae %al
 264         movzbl %al, %ecx
 265         movl $3, %eax
 266         movl $4, %edx
 267         cmpl $0, %ecx
 268         cmove %edx, %eax
 269         ret
 270
 271 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 272 are a number of issues. 1) We are introducing a setcc between the result of the
 273 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 274 so a any extend (which becomes a zero extend) is added.
 275
 276 We probably need some kind of target DAG combine hook to fix this.
 277
 278 //===---------------------------------------------------------------------===//
 279
 280 We generate significantly worse code for this than GCC:
 281 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 282 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 283
 284 There is also one case we do worse on PPC.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 For this:
 289
 290 int test(int a)
 291 {
 292   return a * 3;
 293 }
 294
 295 We currently emits
 296         imull $3, 4(%esp), %eax
 297
 298 Perhaps this is what we really should generate is? Is imull three or four
 299 cycles? Note: ICC generates this:
 300         movl    4(%esp), %eax
 301         leal    (%eax,%eax,2), %eax
 302
 303 The current instruction priority is based on pattern complexity. The former is
 304 more "complex" because it folds a load so the latter will not be emitted.
 305
 306 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 307 should always try to match LEA first since the LEA matching code does some
 308 estimate to determine whether the match is profitable.
 309
 310 However, if we care more about code size, then imull is better. It's two bytes
 311 shorter than movl + leal.
 312
 313 On a Pentium M, both variants have the same characteristics with regard
 314 to throughput; however, the multiplication has a latency of four cycles, as
 315 opposed to two cycles for the movl+lea variant.
 316
 317 //===---------------------------------------------------------------------===//
 318
 319 __builtin_ffs codegen is messy.
 320
 321 int ffs_(unsigned X) { return __builtin_ffs(X); }
 322
 323 llvm produces:
 324 ffs_:
 325         movl    4(%esp), %ecx
 326         bsfl    %ecx, %eax
 327         movl    $32, %edx
 328         cmove   %edx, %eax
 329         incl    %eax
 330         xorl    %edx, %edx
 331         testl   %ecx, %ecx
 332         cmove   %edx, %eax
 333         ret
 334
 335 vs gcc:
 336
 337 _ffs_:
 338         movl    $-1, %edx
 339         bsfl    4(%esp), %eax
 340         cmove   %edx, %eax
 341         addl    $1, %eax
 342         ret
 343
 344 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 345
 346 int foo (unsigned long j) {
 347   if (j)
 348     return __builtin_ffs (j) - 1;
 349   else
 350     return 0;
 351 }
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 It appears gcc place string data with linkonce linkage in
 356 .section __TEXT,__const_coal,coalesced instead of
 357 .section __DATA,__const_coal,coalesced.
 358 Take a look at darwin.h, there are other Darwin assembler directives that we
 359 do not make use of.
 360
 361 //===---------------------------------------------------------------------===//
 362
 363 define i32 @foo(i32* %a, i32 %t) {
 364 entry:
 365         br label %cond_true
 366
 367 cond_true:              ; preds = %cond_true, %entry
 368         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 369         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 370         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 371         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 372         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 373         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 374         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 375         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 376         br i1 %tmp, label %bb12, label %cond_true
 377
 378 bb12:           ; preds = %cond_true
 379         ret i32 %tmp7
 380 }
 381 is pessimized by -loop-reduce and -indvars
 382
 383 //===---------------------------------------------------------------------===//
 384
 385 u32 to float conversion improvement:
 386
 387 float uint32_2_float( unsigned u ) {
 388   float fl = (int) (u & 0xffff);
 389   float fh = (int) (u >> 16);
 390   fh *= 0x1.0p16f;
 391   return fh + fl;
 392 }
 393
 394 00000000        subl    $0x04,%esp
 395 00000003        movl    0x08(%esp,1),%eax
 396 00000007        movl    %eax,%ecx
 397 00000009        shrl    $0x10,%ecx
 398 0000000c        cvtsi2ss        %ecx,%xmm0
 399 00000010        andl    $0x0000ffff,%eax
 400 00000015        cvtsi2ss        %eax,%xmm1
 401 00000019        mulss   0x00000078,%xmm0
 402 00000021        addss   %xmm1,%xmm0
 403 00000025        movss   %xmm0,(%esp,1)
 404 0000002a        flds    (%esp,1)
 405 0000002d        addl    $0x04,%esp
 406 00000030        ret
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 When using fastcc abi, align stack slot of argument of type double on 8 byte
 411 boundary to improve performance.
 412
 413 //===---------------------------------------------------------------------===//
 414
 415 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 416 simplifications for integer "x cmp y ? a : b".
 417
 418 //===---------------------------------------------------------------------===//
 419
 420 Consider the expansion of:
 421
 422 define i32 @test3(i32 %X) {
 423         %tmp1 = urem i32 %X, 255
 424         ret i32 %tmp1
 425 }
 426
 427 Currently it compiles to:
 428
 429 ...
 430         movl $2155905153, %ecx
 431         movl 8(%esp), %esi
 432         movl %esi, %eax
 433         mull %ecx
 434 ...
 435
 436 This could be "reassociated" into:
 437
 438         movl $2155905153, %eax
 439         movl 8(%esp), %ecx
 440         mull %ecx
 441
 442 to avoid the copy.  In fact, the existing two-address stuff would do this
 443 except that mul isn't a commutative 2-addr instruction.  I guess this has
 444 to be done at isel time based on the #uses to mul?
 445
 446 //===---------------------------------------------------------------------===//
 447
 448 Make sure the instruction which starts a loop does not cross a cacheline
 449 boundary. This requires knowning the exact length of each machine instruction.
 450 That is somewhat complicated, but doable. Example 256.bzip2:
 451
 452 In the new trace, the hot loop has an instruction which crosses a cacheline
 453 boundary.  In addition to potential cache misses, this can't help decoding as I
 454 imagine there has to be some kind of complicated decoder reset and realignment
 455 to grab the bytes from the next cacheline.
 456
 457 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 458 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 459 937  937 0x3d0a incl     %esi
 460 3    3   0x3d0b cmpb     %bl, %dl
 461 27   27  0x3d0d jnz      0x000062db <main+11707>
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 This could be a single 16-bit load.
 470
 471 int f(char *p) {
 472     if ((p[0] == 1) & (p[1] == 2)) return 1;
 473     return 0;
 474 }
 475
 476 //===---------------------------------------------------------------------===//
 477
 478 We should inline lrintf and probably other libc functions.
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 Use the FLAGS values from arithmetic instructions more.  For example, compile:
 483
 484 int add_zf(int *x, int y, int a, int b) {
 485      if ((*x += y) == 0)
 486           return a;
 487      else
 488           return b;
 489 }
 490
 491 to:
 492        addl    %esi, (%rdi)
 493        movl    %edx, %eax
 494        cmovne  %ecx, %eax
 495        ret
 496 instead of:
 497
 498 _add_zf:
 499         addl (%rdi), %esi
 500         movl %esi, (%rdi)
 501         testl %esi, %esi
 502         cmove %edx, %ecx
 503         movl %ecx, %eax
 504         ret
 505
 506 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
 507 without a test instruction.
 508
 509 //===---------------------------------------------------------------------===//
 510
 511 These two functions have identical effects:
 512
 513 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 514 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 515
 516 We currently compile them to:
 517
 518 _f:
 519         movl 4(%esp), %eax
 520         movl %eax, %ecx
 521         incl %ecx
 522         movl 8(%esp), %edx
 523         cmpl %edx, %ecx
 524         jne LBB1_2      #UnifiedReturnBlock
 525 LBB1_1: #cond_true
 526         addl $2, %eax
 527         ret
 528 LBB1_2: #UnifiedReturnBlock
 529         movl %ecx, %eax
 530         ret
 531 _f2:
 532         movl 4(%esp), %eax
 533         movl %eax, %ecx
 534         incl %ecx
 535         cmpl 8(%esp), %ecx
 536         sete %cl
 537         movzbl %cl, %ecx
 538         leal 1(%ecx,%eax), %eax
 539         ret
 540
 541 both of which are inferior to GCC's:
 542
 543 _f:
 544         movl    4(%esp), %edx
 545         leal    1(%edx), %eax
 546         addl    $2, %edx
 547         cmpl    8(%esp), %eax
 548         cmove   %edx, %eax
 549         ret
 550 _f2:
 551         movl    4(%esp), %eax
 552         addl    $1, %eax
 553         xorl    %edx, %edx
 554         cmpl    8(%esp), %eax
 555         sete    %dl
 556         addl    %edx, %eax
 557         ret
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 This code:
 562
 563 void test(int X) {
 564   if (X) abort();
 565 }
 566
 567 is currently compiled to:
 568
 569 _test:
 570         subl $12, %esp
 571         cmpl $0, 16(%esp)
 572         jne LBB1_1
 573         addl $12, %esp
 574         ret
 575 LBB1_1:
 576         call L_abort$stub
 577
 578 It would be better to produce:
 579
 580 _test:
 581         subl $12, %esp
 582         cmpl $0, 16(%esp)
 583         jne L_abort$stub
 584         addl $12, %esp
 585         ret
 586
 587 This can be applied to any no-return function call that takes no arguments etc.
 588 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 589 something like this:
 590
 591 _test:
 592         cmpl $0, 4(%esp)
 593         jne LBB1_1
 594         ret
 595 LBB1_1:
 596         subl $12, %esp
 597         call L_abort$stub
 598
 599 Both are useful in different situations.  Finally, it could be shrink-wrapped
 600 and tail called, like this:
 601
 602 _test:
 603         cmpl $0, 4(%esp)
 604         jne LBB1_1
 605         ret
 606 LBB1_1:
 607         pop %eax   # realign stack.
 608         call L_abort$stub
 609
 610 Though this probably isn't worth it.
 611
 612 //===---------------------------------------------------------------------===//
 613
 614 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 615 a neg instead of a sub instruction.  Consider:
 616
 617 int test(char X) { return 7-X; }
 618
 619 we currently produce:
 620 _test:
 621         movl $7, %eax
 622         movsbl 4(%esp), %ecx
 623         subl %ecx, %eax
 624         ret
 625
 626 We would use one fewer register if codegen'd as:
 627
 628         movsbl 4(%esp), %eax
 629         neg %eax
 630         add $7, %eax
 631         ret
 632
 633 Note that this isn't beneficial if the load can be folded into the sub.  In
 634 this case, we want a sub:
 635
 636 int test(int X) { return 7-X; }
 637 _test:
 638         movl $7, %eax
 639         subl 4(%esp), %eax
 640         ret
 641
 642 //===---------------------------------------------------------------------===//
 643
 644 Leaf functions that require one 4-byte spill slot have a prolog like this:
 645
 646 _foo:
 647         pushl   %esi
 648         subl    $4, %esp
 649 ...
 650 and an epilog like this:
 651         addl    $4, %esp
 652         popl    %esi
 653         ret
 654
 655 It would be smaller, and potentially faster, to push eax on entry and to
 656 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 657 into any return registers :)
 658
 659 //===---------------------------------------------------------------------===//
 660
 661 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 662 branches.  We generate really poor code for:
 663
 664 double testf(double a) {
 665        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 666 }
 667
 668 For example, the entry BB is:
 669
 670 _testf:
 671         subl    $20, %esp
 672         pxor    %xmm0, %xmm0
 673         movsd   24(%esp), %xmm1
 674         ucomisd %xmm0, %xmm1
 675         setnp   %al
 676         sete    %cl
 677         testb   %cl, %al
 678         jne     LBB1_5  # UnifiedReturnBlock
 679 LBB1_1: # cond_true
 680
 681
 682 it would be better to replace the last four instructions with:
 683
 684         jp LBB1_1
 685         je LBB1_5
 686 LBB1_1:
 687
 688 We also codegen the inner ?: into a diamond:
 689
 690        cvtss2sd        LCPI1_0(%rip), %xmm2
 691         cvtss2sd        LCPI1_1(%rip), %xmm3
 692         ucomisd %xmm1, %xmm0
 693         ja      LBB1_3  # cond_true
 694 LBB1_2: # cond_true
 695         movapd  %xmm3, %xmm2
 696 LBB1_3: # cond_true
 697         movapd  %xmm2, %xmm0
 698         ret
 699
 700 We should sink the load into xmm3 into the LBB1_2 block.  This should
 701 be pretty easy, and will nuke all the copies.
 702
 703 //===---------------------------------------------------------------------===//
 704
 705 This:
 706         #include <algorithm>
 707         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 708         { return std::make_pair(a + b, a + b < a); }
 709         bool no_overflow(unsigned a, unsigned b)
 710         { return !full_add(a, b).second; }
 711
 712 Should compile to:
 713         addl    %esi, %edi
 714         setae   %al
 715         movzbl  %al, %eax
 716         ret
 717
 718 on x86-64, instead of the rather stupid-looking:
 719         addl    %esi, %edi
 720         setb    %al
 721         xorb    $1, %al
 722         movzbl  %al, %eax
 723         ret
 724
 725
 726 //===---------------------------------------------------------------------===//
 727
 728 The following code:
 729
 730 bb114.preheader:                ; preds = %cond_next94
 731         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 732         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 733         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 734         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 735         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 736         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 737         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 738         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 739         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 740         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 741         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 742         br label %bb114
 743
 744 produces:
 745
 746 LBB3_5: # bb114.preheader
 747         movswl  -68(%ebp), %eax
 748         movl    $32, %ecx
 749         movl    %ecx, -80(%ebp)
 750         subl    %eax, -80(%ebp)
 751         movswl  -52(%ebp), %eax
 752         movl    %ecx, -84(%ebp)
 753         subl    %eax, -84(%ebp)
 754         movswl  -70(%ebp), %eax
 755         movl    %ecx, -88(%ebp)
 756         subl    %eax, -88(%ebp)
 757         movswl  -50(%ebp), %eax
 758         subl    %eax, %ecx
 759         movl    %ecx, -76(%ebp)
 760         movswl  -42(%ebp), %eax
 761         movl    %eax, -92(%ebp)
 762         movswl  -66(%ebp), %eax
 763         movl    %eax, -96(%ebp)
 764         movw    $0, -98(%ebp)
 765
 766 This appears to be bad because the RA is not folding the store to the stack
 767 slot into the movl.  The above instructions could be:
 768         movl    $32, -80(%ebp)
 769 ...
 770         movl    $32, -84(%ebp)
 771 ...
 772 This seems like a cross between remat and spill folding.
 773
 774 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 775 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 776 vice-versa).
 777
 778 //===---------------------------------------------------------------------===//
 779
 780 This code:
 781
 782         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 783         br i1 %tmp659, label %cond_true662, label %cond_next715
 784
 785 produces this:
 786
 787         testw   %cx, %cx
 788         movswl  %cx, %esi
 789         jns     LBB4_109        # cond_next715
 790
 791 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 792 suggests using the 32-bit register (which is what ICC uses).
 793
 794 //===---------------------------------------------------------------------===//
 795
 796 We compile this:
 797
 798 void compare (long long foo) {
 799   if (foo < 4294967297LL)
 800     abort();
 801 }
 802
 803 to:
 804
 805 compare:
 806         subl    $4, %esp
 807         cmpl    $0, 8(%esp)
 808         setne   %al
 809         movzbw  %al, %ax
 810         cmpl    $1, 12(%esp)
 811         setg    %cl
 812         movzbw  %cl, %cx
 813         cmove   %ax, %cx
 814         testb   $1, %cl
 815         jne     .LBB1_2 # UnifiedReturnBlock
 816 .LBB1_1:        # ifthen
 817         call    abort
 818 .LBB1_2:        # UnifiedReturnBlock
 819         addl    $4, %esp
 820         ret
 821
 822 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 823 compares.  GCC produces multiple branches, which is much nicer:
 824
 825 compare:
 826         subl    $12, %esp
 827         movl    20(%esp), %edx
 828         movl    16(%esp), %eax
 829         decl    %edx
 830         jle     .L7
 831 .L5:
 832         addl    $12, %esp
 833         ret
 834         .p2align 4,,7
 835 .L7:
 836         jl      .L4
 837         cmpl    $0, %eax
 838         .p2align 4,,8
 839         ja      .L5
 840 .L4:
 841         .p2align 4,,9
 842         call    abort
 843
 844 //===---------------------------------------------------------------------===//
 845
 846 Tail call optimization improvements: Tail call optimization currently
 847 pushes all arguments on the top of the stack (their normal place for
 848 non-tail call optimized calls) that source from the callers arguments
 849 or  that source from a virtual register (also possibly sourcing from
 850 callers arguments).
 851 This is done to prevent overwriting of parameters (see example
 852 below) that might be used later.
 853
 854 example:
 855
 856 int callee(int32, int64);
 857 int caller(int32 arg1, int32 arg2) {
 858   int64 local = arg2 * 2;
 859   return callee(arg2, (int64)local);
 860 }
 861
 862 [arg1]          [!arg2 no longer valid since we moved local onto it]
 863 [arg2]      ->  [(int64)
 864 [RETADDR]        local  ]
 865
 866 Moving arg1 onto the stack slot of callee function would overwrite
 867 arg2 of the caller.
 868
 869 Possible optimizations:
 870
 871
 872  - Analyse the actual parameters of the callee to see which would
 873    overwrite a caller parameter which is used by the callee and only
 874    push them onto the top of the stack.
 875
 876    int callee (int32 arg1, int32 arg2);
 877    int caller (int32 arg1, int32 arg2) {
 878        return callee(arg1,arg2);
 879    }
 880
 881    Here we don't need to write any variables to the top of the stack
 882    since they don't overwrite each other.
 883
 884    int callee (int32 arg1, int32 arg2);
 885    int caller (int32 arg1, int32 arg2) {
 886        return callee(arg2,arg1);
 887    }
 888
 889    Here we need to push the arguments because they overwrite each
 890    other.
 891
 892 //===---------------------------------------------------------------------===//
 893
 894 main ()
 895 {
 896   int i = 0;
 897   unsigned long int z = 0;
 898
 899   do {
 900     z -= 0x00004000;
 901     i++;
 902     if (i > 0x00040000)
 903       abort ();
 904   } while (z > 0);
 905   exit (0);
 906 }
 907
 908 gcc compiles this to:
 909
 910 _main:
 911         subl    $28, %esp
 912         xorl    %eax, %eax
 913         jmp     L2
 914 L3:
 915         cmpl    $262144, %eax
 916         je      L10
 917 L2:
 918         addl    $1, %eax
 919         cmpl    $262145, %eax
 920         jne     L3
 921         call    L_abort$stub
 922 L10:
 923         movl    $0, (%esp)
 924         call    L_exit$stub
 925
 926 llvm:
 927
 928 _main:
 929         subl    $12, %esp
 930         movl    $1, %eax
 931         movl    $16384, %ecx
 932 LBB1_1: # bb
 933         cmpl    $262145, %eax
 934         jge     LBB1_4  # cond_true
 935 LBB1_2: # cond_next
 936         incl    %eax
 937         addl    $4294950912, %ecx
 938         cmpl    $16384, %ecx
 939         jne     LBB1_1  # bb
 940 LBB1_3: # bb11
 941         xorl    %eax, %eax
 942         addl    $12, %esp
 943         ret
 944 LBB1_4: # cond_true
 945         call    L_abort$stub
 946
 947 1. LSR should rewrite the first cmp with induction variable %ecx.
 948 2. DAG combiner should fold
 949         leal    1(%eax), %edx
 950         cmpl    $262145, %edx
 951    =>
 952         cmpl    $262144, %eax
 953
 954 //===---------------------------------------------------------------------===//
 955
 956 define i64 @test(double %X) {
 957         %Y = fptosi double %X to i64
 958         ret i64 %Y
 959 }
 960
 961 compiles to:
 962
 963 _test:
 964         subl    $20, %esp
 965         movsd   24(%esp), %xmm0
 966         movsd   %xmm0, 8(%esp)
 967         fldl    8(%esp)
 968         fisttpll        (%esp)
 969         movl    4(%esp), %edx
 970         movl    (%esp), %eax
 971         addl    $20, %esp
 972         #FP_REG_KILL
 973         ret
 974
 975 This should just fldl directly from the input stack slot.
 976
 977 //===---------------------------------------------------------------------===//
 978
 979 This code:
 980 int foo (int x) { return (x & 65535) | 255; }
 981
 982 Should compile into:
 983
 984 _foo:
 985         movzwl  4(%esp), %eax
 986         orl     $255, %eax
 987         ret
 988
 989 instead of:
 990 _foo:
 991         movl    $65280, %eax
 992         andl    4(%esp), %eax
 993         orl     $255, %eax
 994         ret
 995
 996 //===---------------------------------------------------------------------===//
 997
 998 We're codegen'ing multiply of long longs inefficiently:
 999
1000 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1001   return arg1 *  arg2;
1002 }
1003
1004 We compile to (fomit-frame-pointer):
1005
1006 _LLM:
1007         pushl   %esi
1008         movl    8(%esp), %ecx
1009         movl    16(%esp), %esi
1010         movl    %esi, %eax
1011         mull    %ecx
1012         imull   12(%esp), %esi
1013         addl    %edx, %esi
1014         imull   20(%esp), %ecx
1015         movl    %esi, %edx
1016         addl    %ecx, %edx
1017         popl    %esi
1018         ret
1019
1020 This looks like a scheduling deficiency and lack of remat of the load from
1021 the argument area.  ICC apparently produces:
1022
1023         movl      8(%esp), %ecx
1024         imull     12(%esp), %ecx
1025         movl      16(%esp), %eax
1026         imull     4(%esp), %eax
1027         addl      %eax, %ecx
1028         movl      4(%esp), %eax
1029         mull      12(%esp)
1030         addl      %ecx, %edx
1031         ret
1032
1033 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1034 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1035
1036 //===---------------------------------------------------------------------===//
1037
1038 We can fold a store into "zeroing a reg".  Instead of:
1039
1040 xorl    %eax, %eax
1041 movl    %eax, 124(%esp)
1042
1043 we should get:
1044
1045 movl    $0, 124(%esp)
1046
1047 if the flags of the xor are dead.
1048
1049 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1050 be folded into: shl [mem], 1
1051
1052 //===---------------------------------------------------------------------===//
1053
1054 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1055 or and instruction, for example:
1056
1057         xorpd   LCPI1_0, %xmm2
1058
1059 However, if xmm2 gets spilled, we end up with really ugly code like this:
1060
1061         movsd   (%esp), %xmm0
1062         xorpd   LCPI1_0, %xmm0
1063         movsd   %xmm0, (%esp)
1064
1065 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1066 the neg/abs instruction, turning it into an *integer* operation, like this:
1067
1068         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1069
1070 you could also use xorb, but xorl is less likely to lead to a partial register
1071 stall.  Here is a contrived testcase:
1072
1073 double a, b, c;
1074 void test(double *P) {
1075   double X = *P;
1076   a = X;
1077   bar();
1078   X = -X;
1079   b = X;
1080   bar();
1081   c = X;
1082 }
1083
1084 //===---------------------------------------------------------------------===//
1085
1086 The generated code on x86 for checking for signed overflow on a multiply the
1087 obvious way is much longer than it needs to be.
1088
1089 int x(int a, int b) {
1090   long long prod = (long long)a*b;
1091   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1092 }
1093
1094 See PR2053 for more details.
1095
1096 //===---------------------------------------------------------------------===//
1097
1098 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1099 more aggressively; it should cost the same as a move+shift on any modern
1100 processor, but it's a lot shorter. Downside is that it puts more
1101 pressure on register allocation because it has fixed operands.
1102
1103 Example:
1104 int abs(int x) {return x < 0 ? -x : x;}
1105
1106 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1107 abs:
1108         movl    4(%esp), %eax
1109         cltd
1110         xorl    %edx, %eax
1111         subl    %edx, %eax
1112         ret
1113
1114 //===---------------------------------------------------------------------===//
1115
1116 Take the following code (from
1117 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1118
1119 extern unsigned char first_one[65536];
1120 int FirstOnet(unsigned long long arg1)
1121 {
1122   if (arg1 >> 48)
1123     return (first_one[arg1 >> 48]);
1124   return 0;
1125 }
1126
1127
1128 The following code is currently generated:
1129 FirstOnet:
1130         movl    8(%esp), %eax
1131         cmpl    $65536, %eax
1132         movl    4(%esp), %ecx
1133         jb      .LBB1_2 # UnifiedReturnBlock
1134 .LBB1_1:        # ifthen
1135         shrl    $16, %eax
1136         movzbl  first_one(%eax), %eax
1137         ret
1138 .LBB1_2:        # UnifiedReturnBlock
1139         xorl    %eax, %eax
1140         ret
1141
1142 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1143 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1144
1145 //===---------------------------------------------------------------------===//
1146
1147 We compile this function:
1148
1149 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1150 entry:
1151         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1152         br i1 %tmp2, label %bb7, label %bb
1153
1154 bb:             ; preds = %entry
1155         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1156         ret i32 %tmp6
1157
1158 bb7:            ; preds = %entry
1159         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1160         ret i32 %tmp10
1161 }
1162
1163 to:
1164
1165 foo:                                    # @foo
1166 # BB#0:                                 # %entry
1167         movl    4(%esp), %ecx
1168         cmpb    $0, 16(%esp)
1169         je      .LBB0_2
1170 # BB#1:                                 # %bb
1171         movl    8(%esp), %eax
1172         addl    %ecx, %eax
1173         ret
1174 .LBB0_2:                                # %bb7
1175         movl    12(%esp), %edx
1176         movl    %ecx, %eax
1177         subl    %edx, %eax
1178         ret
1179
1180 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1181 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1182
1183 //===---------------------------------------------------------------------===//
1184
1185 See rdar://4653682.
1186
1187 From flops:
1188
1189 LBB1_15:        # bb310
1190         cvtss2sd        LCPI1_0, %xmm1
1191         addsd   %xmm1, %xmm0
1192         movsd   176(%esp), %xmm2
1193         mulsd   %xmm0, %xmm2
1194         movapd  %xmm2, %xmm3
1195         mulsd   %xmm3, %xmm3
1196         movapd  %xmm3, %xmm4
1197         mulsd   LCPI1_23, %xmm4
1198         addsd   LCPI1_24, %xmm4
1199         mulsd   %xmm3, %xmm4
1200         addsd   LCPI1_25, %xmm4
1201         mulsd   %xmm3, %xmm4
1202         addsd   LCPI1_26, %xmm4
1203         mulsd   %xmm3, %xmm4
1204         addsd   LCPI1_27, %xmm4
1205         mulsd   %xmm3, %xmm4
1206         addsd   LCPI1_28, %xmm4
1207         mulsd   %xmm3, %xmm4
1208         addsd   %xmm1, %xmm4
1209         mulsd   %xmm2, %xmm4
1210         movsd   152(%esp), %xmm1
1211         addsd   %xmm4, %xmm1
1212         movsd   %xmm1, 152(%esp)
1213         incl    %eax
1214         cmpl    %eax, %esi
1215         jge     LBB1_15 # bb310
1216 LBB1_16:        # bb358.loopexit
1217         movsd   152(%esp), %xmm0
1218         addsd   %xmm0, %xmm0
1219         addsd   LCPI1_22, %xmm0
1220         movsd   %xmm0, 152(%esp)
1221
1222 Rather than spilling the result of the last addsd in the loop, we should have
1223 insert a copy to split the interval (one for the duration of the loop, one
1224 extending to the fall through). The register pressure in the loop isn't high
1225 enough to warrant the spill.
1226
1227 Also check why xmm7 is not used at all in the function.
1228
1229 //===---------------------------------------------------------------------===//
1230
1231 Take the following:
1232
1233 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1234 target triple = "i386-apple-darwin8"
1235 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1236 define fastcc void @abort_gzip() noreturn nounwind  {
1237 entry:
1238         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1239         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1240 bb.i:           ; preds = %entry
1241         tail call void @exit( i32 1 ) noreturn nounwind
1242         unreachable
1243 bb4.i:          ; preds = %entry
1244         store i1 true, i1* @in_exit.4870.b
1245         tail call void @exit( i32 1 ) noreturn nounwind
1246         unreachable
1247 }
1248 declare void @exit(i32) noreturn nounwind
1249
1250 This compiles into:
1251 _abort_gzip:                            ## @abort_gzip
1252 ## BB#0:                                ## %entry
1253         subl    $12, %esp
1254         movb    _in_exit.4870.b, %al
1255         cmpb    $1, %al
1256         jne     LBB0_2
1257
1258 We somehow miss folding the movb into the cmpb.
1259
1260 //===---------------------------------------------------------------------===//
1261
1262 We compile:
1263
1264 int test(int x, int y) {
1265   return x-y-1;
1266 }
1267
1268 into (-m64):
1269
1270 _test:
1271         decl    %edi
1272         movl    %edi, %eax
1273         subl    %esi, %eax
1274         ret
1275
1276 it would be better to codegen as: x+~y  (notl+addl)
1277
1278 //===---------------------------------------------------------------------===//
1279
1280 This code:
1281
1282 int foo(const char *str,...)
1283 {
1284  __builtin_va_list a; int x;
1285  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1286  return x;
1287 }
1288
1289 gets compiled into this on x86-64:
1290         subq    $200, %rsp
1291         movaps  %xmm7, 160(%rsp)
1292         movaps  %xmm6, 144(%rsp)
1293         movaps  %xmm5, 128(%rsp)
1294         movaps  %xmm4, 112(%rsp)
1295         movaps  %xmm3, 96(%rsp)
1296         movaps  %xmm2, 80(%rsp)
1297         movaps  %xmm1, 64(%rsp)
1298         movaps  %xmm0, 48(%rsp)
1299         movq    %r9, 40(%rsp)
1300         movq    %r8, 32(%rsp)
1301         movq    %rcx, 24(%rsp)
1302         movq    %rdx, 16(%rsp)
1303         movq    %rsi, 8(%rsp)
1304         leaq    (%rsp), %rax
1305         movq    %rax, 192(%rsp)
1306         leaq    208(%rsp), %rax
1307         movq    %rax, 184(%rsp)
1308         movl    $48, 180(%rsp)
1309         movl    $8, 176(%rsp)
1310         movl    176(%rsp), %eax
1311         cmpl    $47, %eax
1312         jbe     .LBB1_3 # bb
1313 .LBB1_1:        # bb3
1314         movq    184(%rsp), %rcx
1315         leaq    8(%rcx), %rax
1316         movq    %rax, 184(%rsp)
1317 .LBB1_2:        # bb4
1318         movl    (%rcx), %eax
1319         addq    $200, %rsp
1320         ret
1321 .LBB1_3:        # bb
1322         movl    %eax, %ecx
1323         addl    $8, %eax
1324         addq    192(%rsp), %rcx
1325         movl    %eax, 176(%rsp)
1326         jmp     .LBB1_2 # bb4
1327
1328 gcc 4.3 generates:
1329         subq    $96, %rsp
1330 .LCFI0:
1331         leaq    104(%rsp), %rax
1332         movq    %rsi, -80(%rsp)
1333         movl    $8, -120(%rsp)
1334         movq    %rax, -112(%rsp)
1335         leaq    -88(%rsp), %rax
1336         movq    %rax, -104(%rsp)
1337         movl    $8, %eax
1338         cmpl    $48, %eax
1339         jb      .L6
1340         movq    -112(%rsp), %rdx
1341         movl    (%rdx), %eax
1342         addq    $96, %rsp
1343         ret
1344         .p2align 4,,10
1345         .p2align 3
1346 .L6:
1347         mov     %eax, %edx
1348         addq    -104(%rsp), %rdx
1349         addl    $8, %eax
1350         movl    %eax, -120(%rsp)
1351         movl    (%rdx), %eax
1352         addq    $96, %rsp
1353         ret
1354
1355 and it gets compiled into this on x86:
1356         pushl   %ebp
1357         movl    %esp, %ebp
1358         subl    $4, %esp
1359         leal    12(%ebp), %eax
1360         movl    %eax, -4(%ebp)
1361         leal    16(%ebp), %eax
1362         movl    %eax, -4(%ebp)
1363         movl    12(%ebp), %eax
1364         addl    $4, %esp
1365         popl    %ebp
1366         ret
1367
1368 gcc 4.3 generates:
1369         pushl   %ebp
1370         movl    %esp, %ebp
1371         movl    12(%ebp), %eax
1372         popl    %ebp
1373         ret
1374
1375 //===---------------------------------------------------------------------===//
1376
1377 Teach tblgen not to check bitconvert source type in some cases. This allows us
1378 to consolidate the following patterns in X86InstrMMX.td:
1379
1380 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1381                                                   (iPTR 0))))),
1382           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1383 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1384                                                   (iPTR 0))))),
1385           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1386 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1387                                                   (iPTR 0))))),
1388           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1389
1390 There are other cases in various td files.
1391
1392 //===---------------------------------------------------------------------===//
1393
1394 Take something like the following on x86-32:
1395 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1396
1397 We currently generate a libcall, but we really shouldn't: the expansion is
1398 shorter and likely faster than the libcall.  The expected code is something
1399 like the following:
1400
1401         movl    12(%ebp), %eax
1402         movl    16(%ebp), %ecx
1403         xorl    %edx, %edx
1404         divl    %ecx
1405         movl    8(%ebp), %eax
1406         divl    %ecx
1407         movl    %edx, %eax
1408         ret
1409
1410 A similar code sequence works for division.
1411
1412 //===---------------------------------------------------------------------===//
1413
1414 These should compile to the same code, but the later codegen's to useless
1415 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1416
1417 struct s1 { unsigned char a, b; };
1418 unsigned long f1(struct s1 x) {
1419     return x.a + x.b;
1420 }
1421 struct s2 { unsigned a: 8, b: 8; };
1422 unsigned long f2(struct s2 x) {
1423     return x.a + x.b;
1424 }
1425
1426 //===---------------------------------------------------------------------===//
1427
1428 We currently compile this:
1429
1430 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1431 entry:
1432   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1433   %sum = extractvalue {i32, i1} %t, 0
1434   %obit = extractvalue {i32, i1} %t, 1
1435   br i1 %obit, label %overflow, label %normal
1436 normal:
1437   ret i32 %sum
1438 overflow:
1439   call void @llvm.trap()
1440   unreachable
1441 }
1442 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1443 declare void @llvm.trap()
1444
1445 to:
1446
1447 _func1:
1448         movl    4(%esp), %eax
1449         addl    8(%esp), %eax
1450         jo      LBB1_2  ## overflow
1451 LBB1_1: ## normal
1452         ret
1453 LBB1_2: ## overflow
1454         ud2
1455
1456 it would be nice to produce "into" someday.
1457
1458 //===---------------------------------------------------------------------===//
1459
1460 This code:
1461
1462 void vec_mpys1(int y[], const int x[], int scaler) {
1463 int i;
1464 for (i = 0; i < 150; i++)
1465  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1466 }
1467
1468 Compiles to this loop with GCC 3.x:
1469
1470 .L5:
1471         movl    %ebx, %eax
1472         imull   (%edi,%ecx,4)
1473         shrdl   $31, %edx, %eax
1474         addl    %eax, (%esi,%ecx,4)
1475         incl    %ecx
1476         cmpl    $149, %ecx
1477         jle     .L5
1478
1479 llvm-gcc compiles it to the much uglier:
1480
1481 LBB1_1: ## bb1
1482         movl    24(%esp), %eax
1483         movl    (%eax,%edi,4), %ebx
1484         movl    %ebx, %ebp
1485         imull   %esi, %ebp
1486         movl    %ebx, %eax
1487         mull    %ecx
1488         addl    %ebp, %edx
1489         sarl    $31, %ebx
1490         imull   %ecx, %ebx
1491         addl    %edx, %ebx
1492         shldl   $1, %eax, %ebx
1493         movl    20(%esp), %eax
1494         addl    %ebx, (%eax,%edi,4)
1495         incl    %edi
1496         cmpl    $150, %edi
1497         jne     LBB1_1  ## bb1
1498
1499 The issue is that we hoist the cast of "scaler" to long long outside of the
1500 loop, the value comes into the loop as two values, and
1501 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1502 constructed BUILD_PAIR which represents the cast value.
1503
1504 This can be handled by making CodeGenPrepare sink the cast.
1505
1506 //===---------------------------------------------------------------------===//
1507
1508 Test instructions can be eliminated by using EFLAGS values from arithmetic
1509 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1510 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1511 for read-modify-write instructions. It is also current not done if the
1512 OF or CF flags are needed.
1513
1514 The shift operators have the complication that when the shift count is
1515 zero, EFLAGS is not set, so they can only subsume a test instruction if
1516 the shift count is known to be non-zero. Also, using the EFLAGS value
1517 from a shift is apparently very slow on some x86 implementations.
1518
1519 In read-modify-write instructions, the root node in the isel match is
1520 the store, and isel has no way for the use of the EFLAGS result of the
1521 arithmetic to be remapped to the new node.
1522
1523 Add and subtract instructions set OF on signed overflow and CF on unsiged
1524 overflow, while test instructions always clear OF and CF. In order to
1525 replace a test with an add or subtract in a situation where OF or CF is
1526 needed, codegen must be able to prove that the operation cannot see
1527 signed or unsigned overflow, respectively.
1528
1529 //===---------------------------------------------------------------------===//
1530
1531 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1532 define <16 x float> @foo(<16 x float> %A) nounwind {
1533         %tmp = alloca <16 x float>, align 16
1534         %tmp2 = alloca <16 x float>, align 16
1535         store <16 x float> %A, <16 x float>* %tmp
1536         %s = bitcast <16 x float>* %tmp to i8*
1537         %s2 = bitcast <16 x float>* %tmp2 to i8*
1538         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1539         %R = load <16 x float>* %tmp2
1540         ret <16 x float> %R
1541 }
1542
1543 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1544
1545 which compiles to:
1546
1547 _foo:
1548         subl    $140, %esp
1549         movaps  %xmm3, 112(%esp)
1550         movaps  %xmm2, 96(%esp)
1551         movaps  %xmm1, 80(%esp)
1552         movaps  %xmm0, 64(%esp)
1553         movl    60(%esp), %eax
1554         movl    %eax, 124(%esp)
1555         movl    56(%esp), %eax
1556         movl    %eax, 120(%esp)
1557         movl    52(%esp), %eax
1558         <many many more 32-bit copies>
1559         movaps  (%esp), %xmm0
1560         movaps  16(%esp), %xmm1
1561         movaps  32(%esp), %xmm2
1562         movaps  48(%esp), %xmm3
1563         addl    $140, %esp
1564         ret
1565
1566 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1567 fall back to lower-granularity chunks.
1568
1569 //===---------------------------------------------------------------------===//
1570
1571 Implement processor-specific optimizations for parity with GCC on these
1572 processors.  GCC does two optimizations:
1573
1574 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1575    preceeded by a conditional branch or is the target of a jump.
1576 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1577    code contains more than 3 branches.
1578
1579 The first one is done for all AMDs, Core2, and "Generic"
1580 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1581   Core 2, and "Generic"
1582
1583 //===---------------------------------------------------------------------===//
1584
1585 Testcase:
1586 int a(int x) { return (x & 127) > 31; }
1587
1588 Current output:
1589         movl    4(%esp), %eax
1590         andl    $127, %eax
1591         cmpl    $31, %eax
1592         seta    %al
1593         movzbl  %al, %eax
1594         ret
1595
1596 Ideal output:
1597         xorl    %eax, %eax
1598         testl   $96, 4(%esp)
1599         setne   %al
1600         ret
1601
1602 This should definitely be done in instcombine, canonicalizing the range
1603 condition into a != condition.  We get this IR:
1604
1605 define i32 @a(i32 %x) nounwind readnone {
1606 entry:
1607         %0 = and i32 %x, 127            ; <i32> [#uses=1]
1608         %1 = icmp ugt i32 %0, 31                ; <i1> [#uses=1]
1609         %2 = zext i1 %1 to i32          ; <i32> [#uses=1]
1610         ret i32 %2
1611 }
1612
1613 Instcombine prefers to strength reduce relational comparisons to equality
1614 comparisons when possible, this should be another case of that.  This could
1615 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1616 looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1617 be redesigned to use ComputeMaskedBits and friends.
1618
1619
1620 //===---------------------------------------------------------------------===//
1621 Testcase:
1622 int x(int a) { return (a&0xf0)>>4; }
1623
1624 Current output:
1625         movl    4(%esp), %eax
1626         shrl    $4, %eax
1627         andl    $15, %eax
1628         ret
1629
1630 Ideal output:
1631         movzbl  4(%esp), %eax
1632         shrl    $4, %eax
1633         ret
1634
1635 //===---------------------------------------------------------------------===//
1636
1637 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1638 properly.
1639
1640 When the return value is not used (i.e. only care about the value in the
1641 memory), x86 does not have to use add to implement these. Instead, it can use
1642 add, sub, inc, dec instructions with the "lock" prefix.
1643
1644 This is currently implemented using a bit of instruction selection trick. The
1645 issue is the target independent pattern produces one output and a chain and we
1646 want to map it into one that just output a chain. The current trick is to select
1647 it into a MERGE_VALUES with the first definition being an implicit_def. The
1648 proper solution is to add new ISD opcodes for the no-output variant. DAG
1649 combiner can then transform the node before it gets to target node selection.
1650
1651 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1652 fact these instructions are identical to the non-lock versions. We need a way to
1653 add target specific information to target nodes and have this information
1654 carried over to machine instructions. Asm printer (or JIT) can use this
1655 information to add the "lock" prefix.
1656
1657 //===---------------------------------------------------------------------===//
1658
1659 struct B {
1660   unsigned char y0 : 1;
1661 };
1662
1663 int bar(struct B* a) { return a->y0; }
1664
1665 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1666   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1667   %2 = load i8* %1, align 1
1668   %3 = and i8 %2, 1
1669   %4 = zext i8 %3 to i32
1670   ret i32 %4
1671 }
1672
1673 bar:                                    # @bar
1674 # BB#0:
1675         movb    (%rdi), %al
1676         andb    $1, %al
1677         movzbl  %al, %eax
1678         ret
1679
1680 Missed optimization: should be movl+andl.
1681
1682 //===---------------------------------------------------------------------===//
1683
1684 The x86_64 abi says:
1685
1686 Booleans, when stored in a memory object, are stored as single byte objects the
1687 value of which is always 0 (false) or 1 (true).
1688
1689 We are not using this fact:
1690
1691 int bar(_Bool *a) { return *a; }
1692
1693 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1694   %1 = load i8* %a, align 1, !tbaa !0
1695   %tmp = and i8 %1, 1
1696   %2 = zext i8 %tmp to i32
1697   ret i32 %2
1698 }
1699
1700 bar:
1701         movb    (%rdi), %al
1702         andb    $1, %al
1703         movzbl  %al, %eax
1704         ret
1705
1706 GCC produces
1707
1708 bar:
1709         movzbl  (%rdi), %eax
1710         ret
1711
1712 //===---------------------------------------------------------------------===//
1713
1714 Consider the following two functions compiled with clang:
1715 _Bool foo(int *x) { return !(*x & 4); }
1716 unsigned bar(int *x) { return !(*x & 4); }
1717
1718 foo:
1719         movl    4(%esp), %eax
1720         testb   $4, (%eax)
1721         sete    %al
1722         movzbl  %al, %eax
1723         ret
1724
1725 bar:
1726         movl    4(%esp), %eax
1727         movl    (%eax), %eax
1728         shrl    $2, %eax
1729         andl    $1, %eax
1730         xorl    $1, %eax
1731         ret
1732
1733 The second function generates more code even though the two functions are
1734 are functionally identical.
1735
1736 //===---------------------------------------------------------------------===//
1737
1738 Take the following C code:
1739 int x(int y) { return (y & 63) << 14; }
1740
1741 Code produced by gcc:
1742         andl    $63, %edi
1743         sall    $14, %edi
1744         movl    %edi, %eax
1745         ret
1746
1747 Code produced by clang:
1748         shll    $14, %edi
1749         movl    %edi, %eax
1750         andl    $1032192, %eax
1751         ret
1752
1753 The code produced by gcc is 3 bytes shorter.  This sort of construct often
1754 shows up with bitfields.
1755
1756 //===---------------------------------------------------------------------===//
1757
1758 Take the following C code:
1759 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1760
1761 We generate the following IR with clang:
1762 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1763 entry:
1764   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1765   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1766   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1767   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1768   ret i32 %conv5
1769 }
1770
1771 And the following x86 code:
1772         xorl    %esi, %edi
1773         testb   $-1, %dil
1774         sete    %al
1775         movzbl  %al, %eax
1776         ret
1777
1778 A cmpb instead of the xorl+testb would be one instruction shorter.
1779
1780 //===---------------------------------------------------------------------===//
1781
1782 Given the following C code:
1783 int f(int a, int b) { return (signed char)a == (signed char)b; }
1784
1785 We generate the following IR with clang:
1786 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1787 entry:
1788   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1789   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1790   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1791   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1792   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1793   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1794   ret i32 %conv5
1795 }
1796
1797 And the following x86 code:
1798         movsbl  %sil, %eax
1799         movsbl  %dil, %ecx
1800         cmpl    %eax, %ecx
1801         sete    %al
1802         movzbl  %al, %eax
1803         ret
1804
1805
1806 It should be possible to eliminate the sign extensions.
1807
1808 //===---------------------------------------------------------------------===//
1809
1810 LLVM misses a load+store narrowing opportunity in this code:
1811
1812 %struct.bf = type { i64, i16, i16, i32 }
1813
1814 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1815
1816 define void @t1() nounwind ssp {
1817 entry:
1818   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1819   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1820   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1821   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1822   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1823   store i32 %4, i32* %2, align 1
1824   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1825   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1826   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1827   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1828   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1829   store i32 %9, i32* %7, align 1
1830   ret void
1831 }
1832
1833 LLVM currently emits this:
1834
1835   movq  bfi(%rip), %rax
1836   andl  $-65537, 8(%rax)
1837   movq  bfi(%rip), %rax
1838   andl  $-131073, 8(%rax)
1839   ret
1840
1841 It could narrow the loads and stores to emit this:
1842
1843   movq  bfi(%rip), %rax
1844   andb  $-2, 10(%rax)
1845   movq  bfi(%rip), %rax
1846   andb  $-3, 10(%rax)
1847   ret
1848
1849 The trouble is that there is a TokenFactor between the store and the
1850 load, making it non-trivial to determine if there's anything between
1851 the load and the store which would prohibit narrowing.
1852
1853 //===---------------------------------------------------------------------===//
1854
1855 This code:
1856 void foo(unsigned x) {
1857   if (x == 0) bar();
1858   else if (x == 1) qux();
1859 }
1860
1861 currently compiles into:
1862 _foo:
1863         movl    4(%esp), %eax
1864         cmpl    $1, %eax
1865         je      LBB0_3
1866         testl   %eax, %eax
1867         jne     LBB0_4
1868
1869 the testl could be removed:
1870 _foo:
1871         movl    4(%esp), %eax
1872         cmpl    $1, %eax
1873         je      LBB0_3
1874         jb      LBB0_4
1875
1876 0 is the only unsigned number < 1.
1877
1878 //===---------------------------------------------------------------------===//
1879
1880 This code:
1881
1882 %0 = type { i32, i1 }
1883
1884 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1885 entry:
1886   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1887   %cmp = extractvalue %0 %uadd, 1
1888   %inc = zext i1 %cmp to i32
1889   %add = add i32 %x, %sum
1890   %z.0 = add i32 %add, %inc
1891   ret i32 %z.0
1892 }
1893
1894 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1895
1896 compiles to:
1897
1898 _add32carry:                            ## @add32carry
1899         addl    %esi, %edi
1900         sbbl    %ecx, %ecx
1901         movl    %edi, %eax
1902         subl    %ecx, %eax
1903         ret
1904
1905 But it could be:
1906
1907 _add32carry:
1908         leal    (%rsi,%rdi), %eax
1909         cmpl    %esi, %eax
1910         adcl    $0, %eax
1911         ret
1912
1913 //===---------------------------------------------------------------------===//
1914
1915 The hot loop of 256.bzip2 contains code that looks a bit like this:
1916
1917 int foo(char *P, char *Q, int x, int y) {
1918   if (P[0] != Q[0])
1919      return P[0] < Q[0];
1920   if (P[1] != Q[1])
1921      return P[1] < Q[1];
1922   if (P[2] != Q[2])
1923      return P[2] < Q[2];
1924    return P[3] < Q[3];
1925 }
1926
1927 In the real code, we get a lot more wrong than this.  However, even in this
1928 code we generate:
1929
1930 _foo:                                   ## @foo
1931 ## BB#0:                                ## %entry
1932         movb    (%rsi), %al
1933         movb    (%rdi), %cl
1934         cmpb    %al, %cl
1935         je      LBB0_2
1936 LBB0_1:                                 ## %if.then
1937         cmpb    %al, %cl
1938         jmp     LBB0_5
1939 LBB0_2:                                 ## %if.end
1940         movb    1(%rsi), %al
1941         movb    1(%rdi), %cl
1942         cmpb    %al, %cl
1943         jne     LBB0_1
1944 ## BB#3:                                ## %if.end38
1945         movb    2(%rsi), %al
1946         movb    2(%rdi), %cl
1947         cmpb    %al, %cl
1948         jne     LBB0_1
1949 ## BB#4:                                ## %if.end60
1950         movb    3(%rdi), %al
1951         cmpb    3(%rsi), %al
1952 LBB0_5:                                 ## %if.end60
1953         setl    %al
1954         movzbl  %al, %eax
1955         ret
1956
1957 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1958 redundant compare also forces the register values to be live, which prevents
1959 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1960
1961 _foo:
1962         movzbl  (%rsi), %eax
1963         cmpb    %al, (%rdi)
1964         jne     L10
1965 L12:
1966         movzbl  1(%rsi), %eax
1967         cmpb    %al, 1(%rdi)
1968         jne     L10
1969         movzbl  2(%rsi), %eax
1970         cmpb    %al, 2(%rdi)
1971         jne     L10
1972         movzbl  3(%rdi), %eax
1973         cmpb    3(%rsi), %al
1974 L10:
1975         setl    %al
1976         movzbl  %al, %eax
1977         ret
1978
1979 which is "perfect".
1980
1981 //===---------------------------------------------------------------------===//
1982
1983 For the branch in the following code:
1984 int a();
1985 int b(int x, int y) {
1986   if (x & (1<<(y&7)))
1987     return a();
1988   return y;
1989 }
1990
1991 We currently generate:
1992         movb    %sil, %al
1993         andb    $7, %al
1994         movzbl  %al, %eax
1995         btl     %eax, %edi
1996         jae     .LBB0_2
1997
1998 movl+andl would be shorter than the movb+andb+movzbl sequence.
1999
2000 //===---------------------------------------------------------------------===//
2001
2002 For the following:
2003 struct u1 {
2004     float x, y;
2005 };
2006 float foo(struct u1 u) {
2007     return u.x + u.y;
2008 }
2009
2010 We currently generate:
2011         movdqa  %xmm0, %xmm1
2012         pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
2013         addss   %xmm1, %xmm0
2014         ret
2015
2016 We could save an instruction here by commuting the addss.
2017
2018 //===---------------------------------------------------------------------===//