//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//
SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we currently get code like this:

        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
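For reference, the horizontal-add form is easy to write by hand with SSE3
intrinsics; this is only a sketch of what the pattern match could aim for
(the function name is illustrative):

#include <pmmintrin.h>

/* Sketch only: two haddps collapse the four lanes into the sum, versus the
   pshufd/addss chain shown above. */
float hsum_ps(__m128 A) {
  __m128 t = _mm_hadd_ps(A, A);   /* {a0+a1, a2+a3, a0+a1, a2+a3} */
  t = _mm_hadd_ps(t, t);          /* {sum, sum, sum, sum} */
  return _mm_cvtss_f32(t);
}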
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

        pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
        pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//
Expand libm rounding functions inline: Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
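As a concrete illustration (not the GCC patch above), here is a minimal sketch
of inlining floorf with plain SSE1 intrinsics; a real expansion would also have
to pass through inputs too large for an i32 (and NaNs) unchanged:

#include <xmmintrin.h>

/* Sketch only: floor via truncate-and-adjust, valid when |x| fits in an i32. */
static inline float fast_floorf(float x) {
  __m128 v = _mm_set_ss(x);
  __m128 t = _mm_cvtsi32_ss(v, _mm_cvttss_si32(v));      /* t = (float)(int)x */
  __m128 adj = _mm_and_ps(_mm_cmplt_ss(v, t), _mm_set_ss(1.0f));
  return _mm_cvtss_f32(_mm_sub_ss(t, adj));              /* subtract 1 if x < t */
}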
//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
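For reference, setting those MXCSR bits from C looks roughly like this; a sketch
using the standard intrinsic wrappers, not code LLVM emits today:

#include <pmmintrin.h>   /* also pulls in xmmintrin.h */

static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);          /* FTZ: flush denormal results to zero */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  /* DAZ: treat denormal inputs as zero */
}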
//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.
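For example, a 64-bit add can stay entirely in an XMM register via paddq; this
is only an illustration of the idea, not proposed lowering code:

#include <emmintrin.h>

long long add_i64_sse(long long a, long long b) {
  __m128i va = _mm_loadl_epi64((const __m128i *)&a);
  __m128i vb = _mm_loadl_epi64((const __m128i *)&b);
  __m128i vr = _mm_add_epi64(va, vb);       /* paddq */
  long long r;
  _mm_storel_epi64((__m128i *)&r, vr);
  return r;
}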
//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'

The pattern isel got this one right.

//===---------------------------------------------------------------------===//
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
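By way of illustration, a memcpy with a small, known size could be expanded
roughly like this (a sketch, not the actual lowering):

#include <emmintrin.h>

/* Copy 32 bytes with two unaligned 128-bit loads/stores instead of a libc call. */
void copy32(void *dst, const void *src) {
  __m128i lo = _mm_loadu_si128((const __m128i *)src);
  __m128i hi = _mm_loadu_si128((const __m128i *)src + 1);
  _mm_storeu_si128((__m128i *)dst, lo);
  _mm_storeu_si128((__m128i *)dst + 1, hi);
}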
//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.
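With intrinsics, the transformed test amounts to comparing the sign bits that
movmskpd extracts; a sketch of the idea, with illustrative names:

#include <emmintrin.h>

int same_sign(double x, double y) {
  /* movmskpd copies the sign bit of each lane into a GPR bit. */
  return ((_mm_movemask_pd(_mm_set_sd(x)) ^ _mm_movemask_pd(_mm_set_sd(y))) & 1) == 0;
}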
//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value. Also use movlps on the
lower half of a v4sf value.
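In intrinsics form, those two patterns look like this (illustrative only):

#include <xmmintrin.h>

__m128 update_high(__m128 v, const float hi[2]) {
  return _mm_loadh_pi(v, (const __m64 *)hi);   /* movhps: replace upper 64 bits */
}

__m128 update_low(__m128 v, const float lo[2]) {
  return _mm_loadl_pi(v, (const __m64 *)lo);   /* movlps: replace lower 64 bits */
}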
//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:
        movaps    (%edx), %xmm2                         #59.21
        movaps    (%edx), %xmm5                         #60.21
        movaps    (%edx), %xmm4                         #61.21
        movaps    (%edx), %xmm3                         #62.21
        movl      40(%ecx), %ebp                        #69.49
        shufps    $0, %xmm2, %xmm5                      #60.21
        movl      100(%esp), %ebx                       #69.20
        movl      (%ebx), %edi                          #69.20
        imull     %ebp, %edi                            #69.49
        addl      (%eax), %edi                          #70.33
        shufps    $85, %xmm2, %xmm4                     #61.21
        shufps    $170, %xmm2, %xmm3                    #62.21
        shufps    $255, %xmm2, %xmm2                    #63.21
        lea       (%ebp,%ebp,2), %ebx                   #69.49
        lea       -3(%edi,%ebx), %ebx                   #70.33
        addl      32(%ecx), %ebx                        #68.37
        testb     $15, %bl                              #91.13
        jne       L_B1.24       # Prob 5%               #91.13
This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        JE mbb<cond_next204,0xa914d30>
Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        AND32ri8 %ebx<def&use>, 15
        JE mbb<cond_next204,0xa914d30>
This looks really bad. The problem is that shufps is a destructive opcode:
because the same source register appears as operand two of more than one
shufps, a number of copies are needed. Note that icc suffers from the same
problem. Either the instruction selector should select pshufd, or the register
allocator should do the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.
//===---------------------------------------------------------------------===//

Consider a function like this:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This currently compiles into:

_test:
        movss   4(%esp), %xmm1
        mulss   %xmm1, %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd.  We could compile this to:

_test:
        movss   4(%esp), %xmm0
        mulss   %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//
Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        ...

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movaps c2(%esp), %xmm1
        ...

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ...

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//
This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

This compiles into:

        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0

Would it be better to generate:

        pinsrw  $6, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0

//===---------------------------------------------------------------------===//
Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.
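The select idiom it describes maps directly onto intrinsics; a minimal sketch:

#include <xmmintrin.h>

/* Pick a[i] where mask[i] is all-ones, b[i] where it is all-zeros. */
__m128 select_ps(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}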
//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//
Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//
We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1   ; xmm1 = all-ones
        pslld   xmm1, 31     ; xmm1 = all 100000000000...

instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.
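With intrinsics, the same masks can be materialized like this (a sketch of the
idea, not backend code):

#include <emmintrin.h>

static inline __m128 all_ones_ps(void) {
  __m128i z = _mm_setzero_si128();
  return _mm_castsi128_ps(_mm_cmpeq_epi32(z, z));        /* 0xFFFFFFFF per lane */
}

static inline __m128 signbit_ps(void) {
  __m128i z = _mm_setzero_si128();
  __m128i ones = _mm_cmpeq_epi32(z, z);
  return _mm_castsi128_ps(_mm_slli_epi32(ones, 31));     /* 0x80000000 per lane */
}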
//===---------------------------------------------------------------------===//

These two functions should compile to the same code
(-O3 -static -fomit-frame-pointer):

#include <emmintrin.h>

__m128i a;

void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}

void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

Currently "y" looks good, but "x" does silly movzwl stuff to go through a GPR.
It seems like movd would be sufficient in both cases, as the value is already
zero extended in the 32-bit stack slot IIRC.  For signed short, it should also
be safe, as a really-negative value would produce an undefined result for
pslld anyway.

//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//

int t1(double d) { return signbit(d); }

This currently compiles to:

        movsd   16(%esp), %xmm0

We should use movmskp{s|d} instead.
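The movmsk version, written with intrinsics for illustration (hypothetical
helper, not the suggested codegen itself):

#include <emmintrin.h>

int t1_movmsk(double d) {
  /* movmskpd copies the sign bit of the low lane straight into a GPR. */
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}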
//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be
   generalized to handle any load from P, P+4, P+8, P+12, where P can be
   anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//
We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}

declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                        # <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0

        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//
Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//
<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously.  Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//
On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything.  This is probably
related to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//
LLVM currently generates stack realignment code when it is not necessary. The
problem is that we need to know about stack alignment too early.

At that point we don't know whether there will be vector spills or not.
Stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:

#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
                          - 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):

        movaps  .LCPI1_0, %xmm1

//===---------------------------------------------------------------------===//
#include <emmintrin.h>

__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

        pshufd  $81, %xmm0, %xmm0

In x86-64 mode, we generate this code, which could be better:

        pshufd  $81, %xmm1, %xmm0

In sse4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>

extern float x2, x3;

__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1

//===---------------------------------------------------------------------===//
We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                        ## <4 x i32>
        pshufd  $3, %xmm0, %xmm1
        imull   LCPI1_0+12, %eax
        pshufd  $1, %xmm0, %xmm2
        imull   LCPI1_0+4, %eax
        punpckldq %xmm1, %xmm2
        imull   LCPI1_0+8, %eax
        punpckldq %xmm0, %xmm1
        punpckldq %xmm2, %xmm0

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
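For the constant 10 above, the shift-and-add expansion is just two shifts and
an add per vector; a sketch with intrinsics, illustrative only:

#include <emmintrin.h>

/* x * 10 == (x << 3) + (x << 1): two pslld and one paddd. */
__m128i mul_by_10(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}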
//===---------------------------------------------------------------------===//

This:

#include <emmintrin.h>

__m128i foo (char x) {
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

currently compiles to a long sequence of pinsrw instructions:

        pinsrw  $2, %eax, %xmm0
        pinsrw  $3, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0
        movzbl  16(%esp), %eax
        pinsrw  $3, %eax, %xmm0

With SSE4, it should be
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0

//===---------------------------------------------------------------------===//
We should transform a shuffle of two vectors of constants into a single vector
of constants. insertelement of a constant into a vector of constants should
also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .long   1065353216      ## float 1

LCPI1_0:                                        ## <4 x float>
        .long   1065353216      ## float 1
        .long   1065353216      ## float 1

        movhps  LCPI1_0, %xmm0
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0

//===---------------------------------------------------------------------===//
This:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
        %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
        ret float %tmp12
}

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//
Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//
We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        cvtsi2sd 16(%esp), %xmm0

There are also other cases in scimark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.

//===---------------------------------------------------------------------===//
These should compile into the same code (PR6214): Perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)

define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//
This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//
void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs.  This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the
sqrtss and inversion with an rsqrtss instruction, which computes 1/sqrt faster
at the cost of reduced accuracy.
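The fast-math form described above looks like this in source (a sketch under
the stated -ffast-math assumption; the function name is illustrative):

void foo(double, double, double);

void norm_fast(double x, double y, double z) {
  /* One divide (or an rsqrt approximation for floats) and three multiplies. */
  double inv_scale = 1.0 / __builtin_sqrt(x*x + y*y + z*z);
  foo(x*inv_scale, y*inv_scale, z*inv_scale);
}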
//===---------------------------------------------------------------------===//