lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 SSE Variable shift can be custom lowered to something like this, which uses a
   8 small table + unaligned load + shuffle instead of going through memory.
   9
  10 __m128i_shift_right:
  11         .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
  12         .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  13
  14 ...
  15 __m128i shift_right(__m128i value, unsigned long offset) {
  16   return _mm_shuffle_epi8(value,
  17                _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
  18 }
  19
  20 //===---------------------------------------------------------------------===//
  21
  22 SSE has instructions for doing operations on complex numbers; we should pattern
  23 match them.  For example, this should turn into a horizontal add:
  24
  25 typedef float __attribute__((vector_size(16))) v4f32;
  26 float f32(v4f32 A) {
  27   return A[0]+A[1]+A[2]+A[3];
  28 }
  29
  30 Instead we get this:
  31
  32 _f32:                                   ## @f32
  33         pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
  34         addss   %xmm0, %xmm1
  35         pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
  36         movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
  37         movaps  %xmm0, %xmm3
  38         addss   %xmm1, %xmm3
  39         movdqa  %xmm2, %xmm0
  40         addss   %xmm3, %xmm0
  41         ret
  42
  43 Also, there are cases where some simple local SLP would improve codegen a bit.
  44 Compiling this:
  45
  46 _Complex float f32(_Complex float A, _Complex float B) {
  47   return A+B;
  48 }
  49
  50 into:
  51
  52 _f32:                                   ## @f32
  53         movdqa  %xmm0, %xmm2
  54         addss   %xmm1, %xmm2
  55         pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
  56         pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
  57         addss   %xmm1, %xmm3
  58         movaps  %xmm2, %xmm0
  59         unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
  60         ret
  61
  62 seems silly when it could just be one addps.
  63
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Expand libm rounding functions inline:  Significant speedups possible.
  68 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  69
  70 //===---------------------------------------------------------------------===//
  71
  72 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  73 other fast SSE modes.
  74
  75 //===---------------------------------------------------------------------===//
  76
  77 Think about doing i64 math in SSE regs on x86-32.
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 This testcase should have no SSE instructions in it, and only one load from
  82 a constant pool:
  83
  84 double %test3(bool %B) {
  85         %C = select bool %B, double 123.412, double 523.01123123
  86         ret double %C
  87 }
  88
  89 Currently, the select is being lowered, which prevents the dag combiner from
  90 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  91
  92 The pattern isel got this one right.
  93
  94 //===---------------------------------------------------------------------===//
  95
  96 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  97 feasible.
  98
  99 //===---------------------------------------------------------------------===//
 100
 101 Codegen:
 102   if (copysign(1.0, x) == copysign(1.0, y))
 103 into:
 104   if ((x^y) & mask)
 105 when using SSE.
 106
 107 //===---------------------------------------------------------------------===//
 108
 109 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 110 of a v4sf value.
 111
 112 //===---------------------------------------------------------------------===//
 113
 114 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 115 Perhaps use pxor / xorp* to clear an XMM register first?
 116
 117 //===---------------------------------------------------------------------===//
 118
 119 External test Nurbs exposed some problems. Look for
 120 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 121 emits:
 122
 123         movaps    (%edx), %xmm2                                 #59.21
 124         movaps    (%edx), %xmm5                                 #60.21
 125         movaps    (%edx), %xmm4                                 #61.21
 126         movaps    (%edx), %xmm3                                 #62.21
 127         movl      40(%ecx), %ebp                                #69.49
 128         shufps    $0, %xmm2, %xmm5                              #60.21
 129         movl      100(%esp), %ebx                               #69.20
 130         movl      (%ebx), %edi                                  #69.20
 131         imull     %ebp, %edi                                    #69.49
 132         addl      (%eax), %edi                                  #70.33
 133         shufps    $85, %xmm2, %xmm4                             #61.21
 134         shufps    $170, %xmm2, %xmm3                            #62.21
 135         shufps    $255, %xmm2, %xmm2                            #63.21
 136         lea       (%ebp,%ebp,2), %ebx                           #69.49
 137         negl      %ebx                                          #69.49
 138         lea       -3(%edi,%ebx), %ebx                           #70.33
 139         shll      $4, %ebx                                      #68.37
 140         addl      32(%ecx), %ebx                                #68.37
 141         testb     $15, %bl                                      #91.13
 142         jne       L_B1.24       # Prob 5%                       #91.13
 143
 144 This is the llvm code after instruction scheduling:
 145
 146 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 147         %reg1078 = MOV32ri -3
 148         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
 149         %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
 150         %reg1080 = IMUL32rr %reg1079, %reg1037
 151         %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
 152         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 153         %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
 154         %reg1082 = SHL32ri %reg1038, 4
 155         %reg1039 = ADD32rr %reg1036, %reg1082
 156         %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
 157         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 158         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 159         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 160         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 161         %reg1040 = MOV32rr %reg1039
 162         %reg1084 = AND32ri8 %reg1039, 15
 163         CMP32ri8 %reg1084, 0
 164         JE mbb<cond_next204,0xa914d30>
 165
 166 Still ok. After register allocation:
 167
 168 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 169         %eax = MOV32ri -3
 170         %edx = MOV32rm %stack.3, 1, %noreg, 0
 171         ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
 172         %edx = MOV32rm %stack.7, 1, %noreg, 0
 173         %edx = MOV32rm %edx, 1, %noreg, 40
 174         IMUL32rr %eax<def&use>, %edx
 175         %esi = MOV32rm %stack.5, 1, %noreg, 0
 176         %esi = MOV32rm %esi, 1, %noreg, 0
 177         MOV32mr %stack.4, 1, %noreg, 0, %esi
 178         %eax = LEA32r %esi, 1, %eax, -3
 179         %esi = MOV32rm %stack.7, 1, %noreg, 0
 180         %esi = MOV32rm %esi, 1, %noreg, 32
 181         %edi = MOV32rr %eax
 182         SHL32ri %edi<def&use>, 4
 183         ADD32rr %edi<def&use>, %esi
 184         %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
 185         %xmm1 = MOVAPSrr %xmm0
 186         SHUFPSrr %xmm1<def&use>, %xmm1, 170
 187         %xmm2 = MOVAPSrr %xmm0
 188         SHUFPSrr %xmm2<def&use>, %xmm2, 0
 189         %xmm3 = MOVAPSrr %xmm0
 190         SHUFPSrr %xmm3<def&use>, %xmm3, 255
 191         SHUFPSrr %xmm0<def&use>, %xmm0, 85
 192         %ebx = MOV32rr %edi
 193         AND32ri8 %ebx<def&use>, 15
 194         CMP32ri8 %ebx, 0
 195         JE mbb<cond_next204,0xa914d30>
 196
 197 This looks really bad. The problem is that shufps is a destructive opcode.
 198 Since the same register appears as operand two in more than one shufps op, a
 199 number of copies were inserted. Note icc also suffers from the same problem.
 200 Either the instruction selector should select pshufd, or the register allocator
 201 should perform the two-address to three-address transformation.
 202
 203 It also exposes some other problems. See MOV32ri -3 and the spills.
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 Consider:
 208
 209 __m128 test(float a) {
 210   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 211 }
 212
 213 This compiles into:
 214
 215 movss 4(%esp), %xmm1
 216 mulss %xmm1, %xmm1
 217 xorps %xmm0, %xmm0
 218 movss %xmm1, %xmm0
 219 ret
 220
 221 Because mulss doesn't modify the top 3 elements, the top elements of
 222 xmm1 are already zero'd.  We could compile this to:
 223
 224 movss 4(%esp), %xmm0
 225 mulss %xmm0, %xmm0
 226 ret
 227
 228 //===---------------------------------------------------------------------===//
 229
 230 Here's a sick and twisted idea.  Consider code like this:
 231
 232 __m128 test(__m128 a) {
 233   float b = *(float*)&a;
 234   ...
 235   return _mm_set_ps(0.0, 0.0, 0.0, b);
 236 }
 237
 238 This might compile to this code:
 239
 240 movaps c(%esp), %xmm1
 241 xorps %xmm0, %xmm0
 242 movss %xmm1, %xmm0
 243 ret
 244
 245 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 246 this code:
 247
 248 movaps c(%esp), %xmm1
 249 movaps %xmm1, c2(%esp)
 250 ...
 251
 252 xorps %xmm0, %xmm0
 253 movaps c2(%esp), %xmm1
 254 movss %xmm1, %xmm0
 255 ret
 256
 257 However, since the reload is only used by these instructions, we could
 258 "fold" it into the uses, producing something like this:
 259
 260 movaps c(%esp), %xmm1
 261 movaps %xmm1, c2(%esp)
 262 ...
 263
 264 movss c2(%esp), %xmm0
 265 ret
 266
 267 ... saving two instructions.
 268
 269 The basic idea is that a reload from a spill slot can, if only one 4-byte
 270 chunk is used, bring in 3 zeros plus the one element instead of 4 elements.
 271 This can be used to simplify a variety of shuffle operations, where the
 272 elements are fixed zeros.
 273
 274 //===---------------------------------------------------------------------===//
 275
 276 This code generates ugly code, probably due to costs being off or something:
 277
 278 define void @test(float* %P, <4 x float>* %P2 ) {
 279         %xFloat0.688 = load float* %P
 280         %tmp = load <4 x float>* %P2
 281         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 282         store <4 x float> %inFloat3.713, <4 x float>* %P2
 283         ret void
 284 }
 285
 286 Generates:
 287
 288 _test:
 289         movl    8(%esp), %eax
 290         movaps  (%eax), %xmm0
 291         pxor    %xmm1, %xmm1
 292         movaps  %xmm0, %xmm2
 293         shufps  $50, %xmm1, %xmm2
 294         shufps  $132, %xmm2, %xmm0
 295         movaps  %xmm0, (%eax)
 296         ret
 297
 298 Would it be better to generate:
 299
 300 _test:
 301         movl 8(%esp), %ecx
 302         movaps (%ecx), %xmm0
 303         xor %eax, %eax
 304         pinsrw $6, %eax, %xmm0
 305         pinsrw $7, %eax, %xmm0
 306         movaps %xmm0, (%ecx)
 307         ret
 308
 309 ?
 310
 311 //===---------------------------------------------------------------------===//
 312
 313 Some useful information in the Apple Altivec / SSE Migration Guide:
 314
 315 http://developer.apple.com/documentation/Performance/Conceptual/
 316 Accelerate_sse_migration/index.html
 317
 318 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 319
 320 //===---------------------------------------------------------------------===//
 321
 322 Add hooks to commute some CMPP operations.
 323
 324 //===---------------------------------------------------------------------===//
 325
 326 Apply the same transformation that merges four float loads into a single
 327 128-bit load to loads from the constant pool.
 328
 329 //===---------------------------------------------------------------------===//
 330
 331 Floating point max / min are commutable when -enable-unsafe-fp-math is
 332 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 333 nodes which are selected to max / min instructions that are marked commutable.
 334
 335 //===---------------------------------------------------------------------===//
 336
 337 We should materialize vector constants like "all ones" and "signbit" with
 338 code like:
 339
 340      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 341
 342 and:
 343      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 344      pslld   xmm1, 31     ; xmm1 = all 100000000000...
 345
 346 instead of using a load from the constant pool.  The latter is important for
 347 ABS/NEG/copysign etc.
 348
 349 //===---------------------------------------------------------------------===//
 350
 351 These functions:
 352
 353 #include <xmmintrin.h>
 354 __m128i a;
 355 void x(unsigned short n) {
 356   a = _mm_slli_epi32 (a, n);
 357 }
 358 void y(unsigned n) {
 359   a = _mm_slli_epi32 (a, n);
 360 }
 361
 362 compile to ( -O3 -static -fomit-frame-pointer):
 363 _x:
 364         movzwl  4(%esp), %eax
 365         movd    %eax, %xmm0
 366         movaps  _a, %xmm1
 367         pslld   %xmm0, %xmm1
 368         movaps  %xmm1, _a
 369         ret
 370 _y:
 371         movd    4(%esp), %xmm0
 372         movaps  _a, %xmm1
 373         pslld   %xmm0, %xmm1
 374         movaps  %xmm1, _a
 375         ret
 376
 377 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 378 like movd would be sufficient in both cases as the value is already zero
 379 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 380 safe, as a really-signed value would be undefined for pslld.
 381
 382
 383 //===---------------------------------------------------------------------===//
 384
 385 #include <math.h>
 386 int t1(double d) { return signbit(d); }
 387
 388 This currently compiles to:
 389         subl    $12, %esp
 390         movsd   16(%esp), %xmm0
 391         movsd   %xmm0, (%esp)
 392         movl    4(%esp), %eax
 393         shrl    $31, %eax
 394         addl    $12, %esp
 395         ret
 396
 397 We should use movmskp{s|d} instead.
 398
 399 //===---------------------------------------------------------------------===//
 400
 401 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 402 (aligned) vector load.  This functionality has a couple of problems.
 403
 404 1. The code to infer alignment from loads of globals is in the X86 backend,
 405    not the dag combiner.  This is because dagcombine2 needs to be able to see
 406    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 407 2. The code for turning 4 x load into a single vector load is target
 408    independent and should be moved to the dag combiner.
 409 3. The code for turning 4 x load into a vector load can only handle a direct
 410    load from a global or a direct load from the stack.  It should be generalized
 411    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 412 4. The alignment inference code cannot handle loads from globals in non-static
 413    mode because it doesn't look through the extra dyld stub load.  If you try
 414    vec_align.ll without -relocation-model=static, you'll see what I mean.
 415
 416 //===---------------------------------------------------------------------===//
 417
 418 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 419 eliminates a constant pool load.  For example, consider:
 420
 421 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 422 entry:
 423  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 424  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 425  ret i64 %tmp20
 426 }
 427 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
 428
 429 This currently compiles to:
 430
 431 LCPI1_0:                                        #  <4 x float>
 432         .long   2147483648      # float -0
 433         .long   2147483648      # float -0
 434         .long   2147483648      # float -0
 435         .long   2147483648      # float -0
 436 _ccosf:
 437         subl    $12, %esp
 438         movss   16(%esp), %xmm0
 439         movss   %xmm0, 4(%esp)
 440         movss   20(%esp), %xmm0
 441         xorps   LCPI1_0, %xmm0
 442         movss   %xmm0, (%esp)
 443         call    L_ccoshf$stub
 444         addl    $12, %esp
 445         ret
 446
 447 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 448 this code computes the pic base and does two loads to do the constant pool
 449 load, so the improvement is much bigger.
 450
 451 The tricky part about this xform is that the argument load/store isn't exposed
 452 until post-legalize, and at that point, the fneg has been custom expanded into
 453 an X86 fxor.  This means that we need to handle this case in the x86 backend
 454 instead of in target independent code.
 455
 456 //===---------------------------------------------------------------------===//
 457
 458 Non-SSE4 insert into 16 x i8 is atrociously bad.
 459
 460 //===---------------------------------------------------------------------===//
 461
 462 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 463 is memory.
 464
 465 //===---------------------------------------------------------------------===//
 466
 467 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 468 any number of 0.0 simultaneously.  Currently we only use it for simple
 469 insertions.
 470
 471 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 472
 473 //===---------------------------------------------------------------------===//
 474
 475 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 476 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 477 legal, it'll just take a few extra patterns written in the .td file.
 478
 479 Note: this is not a code quality issue; the custom lowered code happens to be
 480 right, but we shouldn't have to custom lower anything.  This is probably related
 481 to <2 x i64> ops being so bad.
 482
 483 //===---------------------------------------------------------------------===//
 484
 485 LLVM currently generates stack realignment code when it is not necessarily
 486 needed. The problem is that we need to know about stack alignment too early,
 487 before RA runs.
 488
 489 At that point we don't know whether there will be a vector spill or not.
 490 Stack realignment logic is overly conservative here, but otherwise we can
 491 produce unaligned loads/stores.
 492
 493 Fixing this will require some huge RA changes.
 494
 495 Testcase:
 496 #include <emmintrin.h>
 497
 498 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
 499
 500 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
 501 - 22725, - 12873};;
 502
 503 vSInt16 madd(vSInt16 b)
 504 {
 505     return _mm_madd_epi16(a, b);
 506 }
 507
 508 Generated code (x86-32, linux):
 509 madd:
 510         pushl   %ebp
 511         movl    %esp, %ebp
 512         andl    $-16, %esp
 513         movaps  .LCPI1_0, %xmm1
 514         pmaddwd %xmm1, %xmm0
 515         movl    %ebp, %esp
 516         popl    %ebp
 517         ret
 518
 519 //===---------------------------------------------------------------------===//
 520
 521 Consider:
 522 #include <emmintrin.h>
 523 __m128 foo2 (float x) {
 524  return _mm_set_ps (0, 0, x, 0);
 525 }
 526
 527 In x86-32 mode, we generate this spiffy code:
 528
 529 _foo2:
 530         movss   4(%esp), %xmm0
 531         pshufd  $81, %xmm0, %xmm0
 532         ret
 533
 534 In x86-64 mode, we generate this code, which could be better:
 535
 536 _foo2:
 537         xorps   %xmm1, %xmm1
 538         movss   %xmm0, %xmm1
 539         pshufd  $81, %xmm1, %xmm0
 540         ret
 541
 542 In sse4 mode, we could use insertps to make both better.
 543
 544 Here's another testcase that could use insertps [mem]:
 545
 546 #include <xmmintrin.h>
 547 extern float x2, x3;
 548 __m128 foo1 (float x1, float x4) {
 549  return _mm_set_ps (x2, x1, x3, x4);
 550 }
 551
 552 gcc mainline compiles it to:
 553
 554 foo1:
 555        insertps        $0x10, x2(%rip), %xmm0
 556        insertps        $0x10, x3(%rip), %xmm1
 557        movaps  %xmm1, %xmm2
 558        movlhps %xmm0, %xmm2
 559        movaps  %xmm2, %xmm0
 560        ret
 561
 562 //===---------------------------------------------------------------------===//
 563
 564 We compile vector multiply-by-constant into poor code:
 565
 566 define <4 x i32> @f(<4 x i32> %i) nounwind  {
 567         %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
 568         ret <4 x i32> %A
 569 }
 570
 571 On targets without SSE4.1, this compiles into:
 572
 573 LCPI1_0:                                        ##  <4 x i32>
 574         .long   10
 575         .long   10
 576         .long   10
 577         .long   10
 578         .text
 579         .align  4,0x90
 580         .globl  _f
 581 _f:
 582         pshufd  $3, %xmm0, %xmm1
 583         movd    %xmm1, %eax
 584         imull   LCPI1_0+12, %eax
 585         movd    %eax, %xmm1
 586         pshufd  $1, %xmm0, %xmm2
 587         movd    %xmm2, %eax
 588         imull   LCPI1_0+4, %eax
 589         movd    %eax, %xmm2
 590         punpckldq       %xmm1, %xmm2
 591         movd    %xmm0, %eax
 592         imull   LCPI1_0, %eax
 593         movd    %eax, %xmm1
 594         movhlps %xmm0, %xmm0
 595         movd    %xmm0, %eax
 596         imull   LCPI1_0+8, %eax
 597         movd    %eax, %xmm0
 598         punpckldq       %xmm0, %xmm1
 599         movaps  %xmm1, %xmm0
 600         punpckldq       %xmm2, %xmm0
 601         ret
 602
 603 It would be better to synthesize integer vector multiplication by constants
 604 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
 605 simple cases such as multiplication by powers of two would be better as
 606 vector shifts than as multiplications.
 607
 608 //===---------------------------------------------------------------------===//
 609
 610 We compile this:
 611
 612 __m128i
 613 foo2 (char x)
 614 {
 615   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
 616 }
 617
 618 into:
 619         movl    $1, %eax
 620         xorps   %xmm0, %xmm0
 621         pinsrw  $2, %eax, %xmm0
 622         movzbl  4(%esp), %eax
 623         pinsrw  $3, %eax, %xmm0
 624         movl    $256, %eax
 625         pinsrw  $7, %eax, %xmm0
 626         ret
 627
 628
 629 gcc-4.2:
 630         subl    $12, %esp
 631         movzbl  16(%esp), %eax
 632         movdqa  LC0, %xmm0
 633         pinsrw  $3, %eax, %xmm0
 634         addl    $12, %esp
 635         ret
 636         .const
 637         .align 4
 638 LC0:
 639         .word   0
 640         .word   0
 641         .word   1
 642         .word   0
 643         .word   0
 644         .word   0
 645         .word   0
 646         .word   256
 647
 648 With SSE4, it should be
 649       movdqa  .LC0(%rip), %xmm0
 650       pinsrb  $6, %edi, %xmm0
 651
 652 //===---------------------------------------------------------------------===//
 653
 654 We should transform a shuffle of two vectors of constants into a single vector
 655 of constants. Also, insertelement of a constant into a vector of constants
 656 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
 657
 658 We compiled it to something horrible:
 659
 660         .align  4
 661 LCPI1_1:                                        ##  float
 662         .long   1065353216      ## float 1
 663         .const
 664
 665         .align  4
 666 LCPI1_0:                                        ##  <4 x float>
 667         .space  4
 668         .long   1065353216      ## float 1
 669         .space  4
 670         .long   1065353216      ## float 1
 671         .text
 672         .align  4,0x90
 673         .globl  _t
 674 _t:
 675         xorps   %xmm0, %xmm0
 676         movhps  LCPI1_0, %xmm0
 677         movss   LCPI1_1, %xmm1
 678         movaps  %xmm0, %xmm2
 679         shufps  $2, %xmm1, %xmm2
 680         shufps  $132, %xmm2, %xmm0
 681         movaps  %xmm0, 0
 682
 683 //===---------------------------------------------------------------------===//
 684 rdar://5907648
 685
 686 This function:
 687
 688 float foo(unsigned char x) {
 689   return x;
 690 }
 691
 692 becomes this LLVM IR (x86-32):
 693
 694 define float @foo(i8 zeroext  %x) nounwind  {
 695         %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
 696         ret float %tmp12
 697 }
 698
 699 compiles to:
 700
 701 _foo:
 702         subl    $4, %esp
 703         movzbl  8(%esp), %eax
 704         cvtsi2ss        %eax, %xmm0
 705         movss   %xmm0, (%esp)
 706         flds    (%esp)
 707         addl    $4, %esp
 708         ret
 709
 710 We should be able to use:
 711   cvtsi2ss 8(%esp), %xmm0
 712 since we know the stack slot is already zext'd.
 713
 714 //===---------------------------------------------------------------------===//
 715
 716 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
 717 when code size is critical. movlps is slower than movsd on core2 but it's one
 718 byte shorter.
 719
 720 //===---------------------------------------------------------------------===//
 721
 722 We should use a dynamic programming based approach to tell when using FPStack
 723 operations is cheaper than SSE.  SciMark montecarlo contains code like this
 724 for example:
 725
 726 double MonteCarlo_num_flops(int Num_samples) {
 727     return ((double) Num_samples)* 4.0;
 728 }
 729
 730 In fpstack mode, this compiles into:
 731
 732 LCPI1_0:
 733         .long   1082130432      ## float 4.000000e+00
 734 _MonteCarlo_num_flops:
 735         subl    $4, %esp
 736         movl    8(%esp), %eax
 737         movl    %eax, (%esp)
 738         fildl   (%esp)
 739         fmuls   LCPI1_0
 740         addl    $4, %esp
 741         ret
 742
 743 In SSE mode, it compiles into significantly slower code:
 744
 745 _MonteCarlo_num_flops:
 746         subl    $12, %esp
 747         cvtsi2sd        16(%esp), %xmm0
 748         mulsd   LCPI1_0, %xmm0
 749         movsd   %xmm0, (%esp)
 750         fldl    (%esp)
 751         addl    $12, %esp
 752         ret
 753
 754 There are also other cases in scimark where using fpstack is better: it is
 755 cheaper to do fld1 than load from a constant pool, for example, so
 756 "load, add 1.0, store" is better done in the fp stack, etc.
 757
 758 //===---------------------------------------------------------------------===//
 759
 760 These should compile into the same code (PR6214): Perhaps instcombine should
 761 canonicalize the former into the latter?
 762
 763 define float @foo(float %x) nounwind {
 764   %t = bitcast float %x to i32
 765   %s = and i32 %t, 2147483647
 766   %d = bitcast i32 %s to float
 767   ret float %d
 768 }
 769
 770 declare float @fabsf(float %n)
 771 define float @bar(float %x) nounwind {
 772   %d = call float @fabsf(float %x)
 773   ret float %d
 774 }
 775
 776 //===---------------------------------------------------------------------===//
 777
 778 This IR (from PR6194):
 779
 780 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 781 target triple = "x86_64-apple-darwin10.0.0"
 782
 783 %0 = type { double, double }
 784 %struct.float3 = type { float, float, float }
 785
 786 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
 787 entry:
 788   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
 789   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
 790   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
 791   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
 792   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
 793   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
 794   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 795   store float %tmp12, float* %tmp5
 796   ret void
 797 }
 798
 799 Compiles to:
 800
 801 _test:                                  ## @test
 802         movd    %xmm0, %rax
 803         shrq    $32, %rax
 804         movl    %eax, 4(%rdi)
 805         ret
 806
 807 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
 808 doing a shuffle from v[1] to v[0] then a float store.
 809
 810 //===---------------------------------------------------------------------===//
 811
 812 [UNSAFE FP]
 813
 814 void foo(double, double, double);
 815 void norm(double x, double y, double z) {
 816   double scale = __builtin_sqrt(x*x + y*y + z*z);
 817   foo(x/scale, y/scale, z/scale);
 818 }
 819
 820 We currently generate an sqrtsd and 3 divsd instructions. This is bad: fp div is
 821 slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
 822 and emit 3 mulsd in place of the divs. This can be done as a target-independent
 823 transform.
 824
 825 If we're dealing with floats instead of doubles we could even replace the sqrtss
 826 and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
 827 cost of reduced accuracy.
 828
 829 //===---------------------------------------------------------------------===//