lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  40 like this:
  41
  42   X += y
  43
  44 and the register allocator decides to spill X, it is cheaper to emit this as:
  45
  46 Y += [xslot]
  47 store Y -> [xslot]
  48
  49 than as:
  50
  51 tmp = [xslot]
  52 tmp += y
  53 store tmp -> [xslot]
  54
  55 ..and this uses one fewer register (so this should be done at load folding
  56 time, not at spiller time).  *Note* however that this can only be done
  57 if Y is dead.  Here's a testcase:
  58
  59 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  60 implementation   ; Functions:
  61 declare void %printf(int, ...)
  62 void %main() {
  63 build_tree.exit:
  64         br label %no_exit.i7
  65 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  66         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  67         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  68         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  69         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  70         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  71 Compute_Tree.exit23:            ; preds = %no_exit.i7
  72         tail call void (int, ...)* %printf( int 0 )
  73         store double %tmp.34.i18, double* null
  74         ret void
  75 }
  76
  77 We currently emit:
  78
  79 .BBmain_1:
  80         xorpd %XMM1, %XMM1
  81         addsd %XMM0, %XMM1
  82 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  83 ***     addsd %XMM2, %XMM1
  84 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  85         jmp .BBmain_1   # no_exit.i7
  86
  87 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  88 much sense (e.g. it's an infinite loop). :)
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  93 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  94
  95 double %X(double %Y, double %Z, double %A, double %B) {
  96         %C = setlt double %A, %B
  97         %z = add double %Z, 0.0    ;; select operand is not a load
  98         %D = select bool %C, double %Y, double %z
  99         ret double %D
 100 }
 101
 102 We currently emit:
 103
 104 _X:
 105         subl $12, %esp
 106         xorpd %xmm0, %xmm0
 107         addsd 24(%esp), %xmm0
 108         movsd 32(%esp), %xmm1
 109         movsd 16(%esp), %xmm2
 110         ucomisd 40(%esp), %xmm1
 111         jb LBB_X_2
 112 LBB_X_1:
 113         movsd %xmm0, %xmm2
 114 LBB_X_2:
 115         movsd %xmm2, (%esp)
 116         fldl (%esp)
 117         addl $12, %esp
 118         ret
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 123 registers. The choice may depend on subtarget information. We should do some
 124 more experiments on different x86 machines.
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 129 code:
 130
 131 unsigned int foo(double x) { return x; }
 132
 133 foo:
 134         subl $20, %esp
 135         movsd 24(%esp), %xmm0
 136         movsd %xmm0, 8(%esp)
 137         fldl 8(%esp)
 138         fisttpll (%esp)
 139         movl (%esp), %eax
 140         addl $20, %esp
 141         ret
 142
 143 This will be solved when we go to a dynamic programming based isel.
 144
 145 //===---------------------------------------------------------------------===//
 146
 147 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 148 feasible.
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 153 the reg-reg copy in this example:
 154
 155 float foo(int *x, float *y, unsigned c) {
 156   float res = 0.0;
 157   unsigned i;
 158   for (i = 0; i < c; i++) {
 159     float xx = (float)x[i];
 160     xx = xx * y[i];
 161     xx += res;
 162     res = xx;
 163   }
 164   return res;
 165 }
 166
 167 LBB_foo_3:      # no_exit
 168         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 169         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 170         addss %XMM0, %XMM1
 171         inc %ESI
 172         cmp %ESI, %ECX
 173 ****    movaps %XMM1, %XMM0
 174         jb LBB_foo_3    # no_exit
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 Codegen:
 179   if (copysign(1.0, x) == copysign(1.0, y))
 180 into:
 181   if (x^y & mask)
 182 when using SSE.
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 187 of a v4sf value.
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 192 Perhaps use pxor / xorp* to clear an XMM register first?
 193
 194 //===---------------------------------------------------------------------===//
 195
 196 How to decide when to use the "floating point version" of logical ops? Here are
 197 some code fragments:
 198
 199         movaps LCPI5_5, %xmm2
 200         divps %xmm1, %xmm2
 201         mulps %xmm2, %xmm3
 202         mulps 8656(%ecx), %xmm3
 203         addps 8672(%ecx), %xmm3
 204         andps LCPI5_6, %xmm2
 205         andps LCPI5_1, %xmm3
 206         por %xmm2, %xmm3
 207         movdqa %xmm3, (%edi)
 208
 209         movaps LCPI5_5, %xmm1
 210         divps %xmm0, %xmm1
 211         mulps %xmm1, %xmm3
 212         mulps 8656(%ecx), %xmm3
 213         addps 8672(%ecx), %xmm3
 214         andps LCPI5_6, %xmm1
 215         andps LCPI5_1, %xmm3
 216         orps %xmm1, %xmm3
 217         movaps %xmm3, 112(%esp)
 218         movaps %xmm3, (%ebx)
 219
 220 Due to some minor source change, the latter case ended up using orps and movaps
 221 instead of por and movdqa. Does it matter?
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 226 to choose between movaps, movapd, and movdqa based on types of source and
 227 destination?
 228
 229 How about andps, andpd, and pand? Do we really care about the type of the packed
 230 elements? If not, why not always use the "ps" variants, which are likely to be
 231 shorter?
 232
 233 //===---------------------------------------------------------------------===//
 234
 235 External test Nurbs exposed some problems. Look for
 236 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 237 emits:
 238
 239         movaps    (%edx), %xmm2                                 #59.21
 240         movaps    (%edx), %xmm5                                 #60.21
 241         movaps    (%edx), %xmm4                                 #61.21
 242         movaps    (%edx), %xmm3                                 #62.21
 243         movl      40(%ecx), %ebp                                #69.49
 244         shufps    $0, %xmm2, %xmm5                              #60.21
 245         movl      100(%esp), %ebx                               #69.20
 246         movl      (%ebx), %edi                                  #69.20
 247         imull     %ebp, %edi                                    #69.49
 248         addl      (%eax), %edi                                  #70.33
 249         shufps    $85, %xmm2, %xmm4                             #61.21
 250         shufps    $170, %xmm2, %xmm3                            #62.21
 251         shufps    $255, %xmm2, %xmm2                            #63.21
 252         lea       (%ebp,%ebp,2), %ebx                           #69.49
 253         negl      %ebx                                          #69.49
 254         lea       -3(%edi,%ebx), %ebx                           #70.33
 255         shll      $4, %ebx                                      #68.37
 256         addl      32(%ecx), %ebx                                #68.37
 257         testb     $15, %bl                                      #91.13
 258         jne       L_B1.24       # Prob 5%                       #91.13
 259
 260 This is the llvm code after instruction scheduling:
 261
 262 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 263         %reg1078 = MOV32ri -3
 264         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 265         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 266         %reg1080 = IMUL32rr %reg1079, %reg1037
 267         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 268         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 269         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 270         %reg1082 = SHL32ri %reg1038, 4
 271         %reg1039 = ADD32rr %reg1036, %reg1082
 272         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 273         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 274         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 275         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 276         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 277         %reg1040 = MOV32rr %reg1039
 278         %reg1084 = AND32ri8 %reg1039, 15
 279         CMP32ri8 %reg1084, 0
 280         JE mbb<cond_next204,0xa914d30>
 281
 282 Still ok. After register allocation:
 283
 284 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 285         %EAX = MOV32ri -3
 286         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 287         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 288         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 289         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 290         IMUL32rr %EAX<def&use>, %EDX
 291         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 292         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 293         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 294         %EAX = LEA32r %ESI, 1, %EAX, -3
 295         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 296         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 297         %EDI = MOV32rr %EAX
 298         SHL32ri %EDI<def&use>, 4
 299         ADD32rr %EDI<def&use>, %ESI
 300         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 301         %XMM1 = MOVAPSrr %XMM0
 302         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 303         %XMM2 = MOVAPSrr %XMM0
 304         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 305         %XMM3 = MOVAPSrr %XMM0
 306         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 307         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 308         %EBX = MOV32rr %EDI
 309         AND32ri8 %EBX<def&use>, 15
 310         CMP32ri8 %EBX, 0
 311         JE mbb<cond_next204,0xa914d30>
 312
 313 This looks really bad. The problem is that shufps is a destructive opcode. Since
 314 the same value appears as operand two in more than one shufps op, it results in
 315 a number of copies. Note icc also suffers from the same problem. Either the
 316 instruction selector should select pshufd, or the register allocator can make
 317 the two-address to three-address transformation.
 318
 319 It also exposes some other problems. See MOV32ri -3 and the spills.
 320
 321 //===---------------------------------------------------------------------===//
 322
 323 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 324
 325 LLVM is producing bad code.
 326
 327 LBB_main_4:     # cond_true44
 328         addps %xmm1, %xmm2
 329         subps %xmm3, %xmm2
 330         movaps (%ecx), %xmm4
 331         movaps %xmm2, %xmm1
 332         addps %xmm4, %xmm1
 333         addl $16, %ecx
 334         incl %edx
 335         cmpl $262144, %edx
 336         movaps %xmm3, %xmm2
 337         movaps %xmm4, %xmm3
 338         jne LBB_main_4  # cond_true44
 339
 340 There are two problems. 1) No need for two loop induction variables. We can
 341 compare against 262144 * 16. 2) Known register coalescer issue. We should
 342 be able to eliminate one of the movaps:
 343
 344         addps %xmm2, %xmm1    <=== Commute!
 345         subps %xmm3, %xmm1
 346         movaps (%ecx), %xmm4
 347         movaps %xmm1, %xmm1   <=== Eliminate!
 348         addps %xmm4, %xmm1
 349         addl $16, %ecx
 350         incl %edx
 351         cmpl $262144, %edx
 352         movaps %xmm3, %xmm2
 353         movaps %xmm4, %xmm3
 354         jne LBB_main_4  # cond_true44
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 Consider:
 359
 360 __m128 test(float a) {
 361   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 362 }
 363
 364 This compiles into:
 365
 366 movss 4(%esp), %xmm1
 367 mulss %xmm1, %xmm1
 368 xorps %xmm0, %xmm0
 369 movss %xmm1, %xmm0
 370 ret
 371
 372 Because mulss doesn't modify the top 3 elements, the top elements of
 373 xmm1 are already zero'd.  We could compile this to:
 374
 375 movss 4(%esp), %xmm0
 376 mulss %xmm0, %xmm0
 377 ret
 378
 379 //===---------------------------------------------------------------------===//
 380
 381 Here's a sick and twisted idea.  Consider code like this:
 382
 383 __m128 test(__m128 a) {
 384   float b = *(float*)&A;
 385   ...
 386   return _mm_set_ps(0.0, 0.0, 0.0, b);
 387 }
 388
 389 This might compile to this code:
 390
 391 movaps c(%esp), %xmm1
 392 xorps %xmm0, %xmm0
 393 movss %xmm1, %xmm0
 394 ret
 395
 396 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 397 this code:
 398
 399 movaps c(%esp), %xmm1
 400 movaps %xmm1, c2(%esp)
 401 ...
 402
 403 xorps %xmm0, %xmm0
 404 movaps c2(%esp), %xmm1
 405 movss %xmm1, %xmm0
 406 ret
 407
 408 However, since the reload is only used by these instructions, we could
 409 "fold" it into the uses, producing something like this:
 410
 411 movaps c(%esp), %xmm1
 412 movaps %xmm1, c2(%esp)
 413 ...
 414
 415 movss c2(%esp), %xmm0
 416 ret
 417
 418 ... saving two instructions.
 419
 420 The basic idea is that a reload from a spill slot can, if only one 4-byte
 421 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 422 This can be used to simplify a variety of shuffle operations, where the
 423 elements are fixed zeros.
 424
 425 //===---------------------------------------------------------------------===//
 426
 427 For this:
 428
 429 #include <emmintrin.h>
 430 void test(__m128d *r, __m128d *A, double B) {
 431   *r = _mm_loadl_pd(*A, &B);
 432 }
 433
 434 We generate:
 435
 436         subl $12, %esp
 437         movsd 24(%esp), %xmm0
 438         movsd %xmm0, (%esp)
 439         movl 20(%esp), %eax
 440         movapd (%eax), %xmm0
 441         movlpd (%esp), %xmm0
 442         movl 16(%esp), %eax
 443         movapd %xmm0, (%eax)
 444         addl $12, %esp
 445         ret
 446
 447 icc generates:
 448
 449         movl      4(%esp), %edx                                 #3.6
 450         movl      8(%esp), %eax                                 #3.6
 451         movapd    (%eax), %xmm0                                 #4.22
 452         movlpd    12(%esp), %xmm0                               #4.8
 453         movapd    %xmm0, (%edx)                                 #4.3
 454         ret                                                     #5.1
 455
 456 So icc is smart enough to know that B is in memory so it doesn't load it and
 457 store it back to the stack.
 458
 459 //===---------------------------------------------------------------------===//
 460
 461 __m128d test1( __m128d A, __m128d B) {
 462   return _mm_shuffle_pd(A, B, 0x3);
 463 }
 464
 465 compiles to
 466
 467 shufpd $3, %xmm1, %xmm0
 468
 469 Perhaps it's better to use unpckhpd instead?
 470
 471 unpckhpd %xmm1, %xmm0
 472
 473 Don't know if unpckhpd is faster. But it is shorter.
 474
 475 //===---------------------------------------------------------------------===//
 476
 477 This code generates ugly code, probably due to costs being off or something:
 478
 479 void %test(float* %P, <4 x float>* %P2 ) {
 480         %xFloat0.688 = load float* %P
 481         %loadVector37.712 = load <4 x float>* %P2
 482         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
 483         store <4 x float> %inFloat3.713, <4 x float>* %P2
 484         ret void
 485 }
 486
 487 Generates:
 488
 489 _test:
 490         pxor %xmm0, %xmm0
 491         movd %xmm0, %eax        ;; EAX = 0!
 492         movl 8(%esp), %ecx
 493         movaps (%ecx), %xmm0
 494         pinsrw $6, %eax, %xmm0
 495         shrl $16, %eax          ;; EAX = 0 again!
 496         pinsrw $7, %eax, %xmm0
 497         movaps %xmm0, (%ecx)
 498         ret
 499
 500 It would be better to generate:
 501
 502 _test:
 503         movl 8(%esp), %ecx
 504         movaps (%ecx), %xmm0
 505         xor %eax, %eax
 506         pinsrw $6, %eax, %xmm0
 507         pinsrw $7, %eax, %xmm0
 508         movaps %xmm0, (%ecx)
 509         ret
 510
 511 or use pxor (to make a zero vector) and shuffle (to insert it).
 512
 513 //===---------------------------------------------------------------------===//
 514
 515 Some useful information in the Apple Altivec / SSE Migration Guide:
 516
 517 http://developer.apple.com/documentation/Performance/Conceptual/
 518 Accelerate_sse_migration/index.html
 519
 520 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 521
 522 //===---------------------------------------------------------------------===//
 523
 524 Add hooks to commute some CMPP operations.
 525
 526 //===---------------------------------------------------------------------===//
 527
 528 Apply the same transformation that merged four floats into a single 128-bit load
 529 to loads from constant pool.
 530
 531 //===---------------------------------------------------------------------===//
 532
 533 Floating point max / min are commutable when -enable-unsafe-fp-math is
 534 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 535 nodes which are selected to max / min instructions that are marked commutable.
 536
 537 //===---------------------------------------------------------------------===//
 538
 539 We should compile this:
 540 #include <xmmintrin.h>
 541 typedef union {
 542   int i[4];
 543   float f[4];
 544   __m128 v;
 545 } vector4_t;
 546 void swizzle (const void *a, vector4_t * b, vector4_t * c) {
 547   b->v = _mm_loadl_pi (b->v, (__m64 *) a);
 548   c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
 549 }
 550
 551 to:
 552
 553 _swizzle:
 554         movl    4(%esp), %eax
 555         movl    8(%esp), %edx
 556         movl    12(%esp), %ecx
 557         movlps  (%eax), %xmm0
 558         movlps  %xmm0, (%edx)
 559         movlps  8(%eax), %xmm0
 560         movlps  %xmm0, (%ecx)
 561         ret
 562
 563 not:
 564
 565 swizzle:
 566         movl 8(%esp), %eax
 567         movaps (%eax), %xmm0
 568         movl 4(%esp), %ecx
 569         movlps (%ecx), %xmm0
 570         movaps %xmm0, (%eax)
 571         movl 12(%esp), %eax
 572         movaps (%eax), %xmm0
 573         movlps 8(%ecx), %xmm0
 574         movaps %xmm0, (%eax)
 575         ret
 576
 577 //===---------------------------------------------------------------------===//
 578
 579 This code:
 580
 581 #include <emmintrin.h>
 582 __m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
 583
 584 Should turn into a single 'movq %rdi, %xmm0' instruction.  Instead, we
 585 get this (on x86-64):
 586
 587 _test:
 588         movd %rdi, %xmm1
 589         xorps %xmm0, %xmm0
 590         movsd %xmm1, %xmm0
 591         ret
 592
 593 The LLVM IR is:
 594
 595 target triple = "x86_64-apple-darwin8"
 596 define <2 x i64> @test(i64 %i) {
 597 entry:
 598         %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
 599         %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
 600         ret <2 x i64> %tmp11
 601 }
 602
 603 //===---------------------------------------------------------------------===//
 604
 605 These functions should produce the same code:
 606
 607 #include <emmintrin.h>
 608
 609 typedef long long __m128i __attribute__ ((__vector_size__ (16)));
 610
 611 int foo(__m128i* val) {
 612   return __builtin_ia32_vec_ext_v4si(*val, 1);
 613 }
 614 int bar(__m128i* val) {
 615   union vs {
 616     __m128i *_v;
 617     int* _s;
 618   } v = {val};
 619   return v._s[1];
 620 }
 621
 622 We currently produce (with -m64):
 623
 624 _foo:
 625         pshufd $1, (%rdi), %xmm0
 626         movd %xmm0, %eax
 627         ret
 628 _bar:
 629         movl 4(%rdi), %eax
 630         ret
 631
 632 //===---------------------------------------------------------------------===//
 633
 634 We should materialize vector constants like "all ones" and "signbit" with
 635 code like:
 636
 637      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 638
 639 and:
 640      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 641      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 642
 643 instead of using a load from the constant pool.  The latter is important for
 644 ABS/NEG/copysign etc.
 645
 646 //===---------------------------------------------------------------------===//
 647
 648 "converting 64-bit constant pool entry to 32-bit not necessarily beneficial"
 649 http://llvm.org/PR1264
 650
 651 For this test case:
 652
 653 define double @foo(double %x) {
 654         %y = mul double %x, 5.000000e-01
 655         ret double %y
 656 }
 657
 658 llc -march=x86-64 currently produces a 32-bit constant pool entry and this code:
 659
 660         cvtss2sd .LCPI1_0(%rip), %xmm1
 661         mulsd %xmm1, %xmm0
 662
 663 instead of just using a 64-bit constant pool entry with this:
 664
 665         mulsd .LCPI1_0(%rip), %xmm0
 666
 667 This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that
 668 x86-64 indeed has an instruction to load a 32-bit float from memory and convert
 669 it into a 64-bit float in a register, however it doesn't notice that this isn't
 670 beneficial because it prevents the load from being folded into the multiply.
 671
 672 //===---------------------------------------------------------------------===//
 673
 674 In this loop:
 675
 676 bb49:           ; preds = %bb49, %bb49.preheader
 677         %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ]               ; <i32> [#uses=2]
 678         %dp.089.0.rec = shl i32 %indvar, 3              ; <i32> [#uses=2]
 679         %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec                ; <i32*> [#uses=1]
 680         %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>*         ; <<2 x i64>*> [#uses=1]
 681         store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16
 682         %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4              ; <i32> [#uses=1]
 683         %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105                ; <i32*> [#uses=1]
 684         %tmp5657 = bitcast i32* %tmp56 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
 685         store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16
 686         %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
 687         %exitcond = icmp eq i32 %indvar.next, %tmp98            ; <i1> [#uses=1]
 688         br i1 %exitcond, label %bb72, label %bb49
 689
 690 we get:
 691
 692 LBB5_6: # bb49.preheader
 693         shlw    $2, %si
 694         decw    %si
 695         movzwl  %si, %eax
 696         incl    %eax
 697         xorl    %ecx, %ecx
 698 LBB5_7: # bb49
 699         xorps   %xmm0, %xmm0            # (1)
 700         movaps  %xmm0, (%edx)
 701         movaps  %xmm0, 16(%edx)
 702         addl    $32, %edx
 703         incl    %ecx
 704         cmpl    %eax, %ecx
 705         jne     LBB4_7  # bb47
 706
 707 The instruction at (1) can be moved out of the main body of the loop.
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 These functions:
 712
 713 #include <xmmintrin.h>
 714 __m128i a;
 715 void x(unsigned short n) {
 716   a = _mm_slli_epi32 (a, n);
 717 }
 718 void y(unsigned n) {
 719   a = _mm_slli_epi32 (a, n);
 720 }
 721
 722 compile to ( -O3 -static -fomit-frame-pointer):
 723 _x:
 724         movzwl  4(%esp), %eax
 725         movd    %eax, %xmm0
 726         movaps  _a, %xmm1
 727         pslld   %xmm0, %xmm1
 728         movaps  %xmm1, _a
 729         ret
 730 _y:
 731         movd    4(%esp), %xmm0
 732         movaps  _a, %xmm1
 733         pslld   %xmm0, %xmm1
 734         movaps  %xmm1, _a
 735         ret
 736
 737 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 738 like movd would be sufficient in both cases as the value is already zero
 739 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 740 safe, as a really-signed value would be undefined for pslld.
 741
 742
 743 //===---------------------------------------------------------------------===//