1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
3 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
4 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
9 ; Tests for SSE2 and below, without SSE3+.
11 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
12 ; X86-SSE-LABEL: test1:
14 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
15 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
16 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
17 ; X86-SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
18 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
21 ; X86-AVX-LABEL: test1:
23 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
24 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
25 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
26 ; X86-AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
27 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
30 ; X64-SSE-LABEL: test1:
32 ; X64-SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
33 ; X64-SSE-NEXT: movapd %xmm0, (%rdi)
36 ; X64-AVX-LABEL: test1:
38 ; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
39 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
41 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
42 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
43 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
44 store <2 x double> %tmp9, <2 x double>* %r, align 16
48 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
49 ; X86-SSE-LABEL: test2:
51 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
52 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
53 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
54 ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
55 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
58 ; X86-AVX-LABEL: test2:
60 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
61 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
62 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
63 ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
64 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
67 ; X64-SSE-LABEL: test2:
69 ; X64-SSE-NEXT: movaps (%rsi), %xmm1
70 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
71 ; X64-SSE-NEXT: movaps %xmm1, (%rdi)
74 ; X64-AVX-LABEL: test2:
76 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm1
77 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
78 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
80 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
81 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
82 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
83 store <2 x double> %tmp9, <2 x double>* %r, align 16
88 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
89 ; X86-SSE-LABEL: test3:
91 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
92 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
93 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
94 ; X86-SSE-NEXT: movaps (%edx), %xmm0
95 ; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
96 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
99 ; X86-AVX-LABEL: test3:
101 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
103 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
104 ; X86-AVX-NEXT: vmovaps (%edx), %xmm0
105 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
106 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
109 ; X64-SSE-LABEL: test3:
111 ; X64-SSE-NEXT: movaps (%rsi), %xmm0
112 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
113 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
116 ; X64-AVX-LABEL: test3:
118 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
119 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
120 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
122 %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2]
123 %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2]
124 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
125 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
126 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
127 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
128 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
129 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
130 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
131 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
132 store <4 x float> %tmp13, <4 x float>* %res
136 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
137 ; X86-SSE-LABEL: test4:
139 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
140 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
141 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
144 ; X86-AVX-LABEL: test4:
146 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
147 ; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
148 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
151 ; X64-SSE-LABEL: test4:
153 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
154 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
157 ; X64-AVX-LABEL: test4:
159 ; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
160 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
162 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
163 store <4 x float> %tmp5, <4 x float>* %res
167 define <4 x i32> @test5(i8** %ptr) nounwind {
168 ; X86-SSE-LABEL: test5:
170 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
171 ; X86-SSE-NEXT: movl (%eax), %eax
172 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
173 ; X86-SSE-NEXT: pxor %xmm0, %xmm0
174 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
175 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
178 ; X86-AVX-LABEL: test5:
180 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
181 ; X86-AVX-NEXT: movl (%eax), %eax
182 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
183 ; X86-AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
184 ; X86-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
185 ; X86-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
188 ; X64-SSE-LABEL: test5:
190 ; X64-SSE-NEXT: movq (%rdi), %rax
191 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
192 ; X64-SSE-NEXT: pxor %xmm0, %xmm0
193 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
194 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
197 ; X64-AVX-LABEL: test5:
199 ; X64-AVX-NEXT: movq (%rdi), %rax
200 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
201 ; X64-AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
202 ; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
203 ; X64-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
205 %tmp = load i8*, i8** %ptr ; <i8*> [#uses=1]
206 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
207 %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; <float> [#uses=1]
208 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
209 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
210 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
211 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
212 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
213 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
214 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
215 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
216 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
220 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
221 ; X86-SSE-LABEL: test6:
223 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
224 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
225 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
226 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
229 ; X86-AVX-LABEL: test6:
231 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
232 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
233 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
234 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
237 ; X64-SSE-LABEL: test6:
239 ; X64-SSE-NEXT: movaps (%rsi), %xmm0
240 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
243 ; X64-AVX-LABEL: test6:
245 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
246 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
248 %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1]
249 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
250 store <4 x float> %tmp2, <4 x float>* %res
254 define void @test7() nounwind {
257 ; SSE-NEXT: xorps %xmm0, %xmm0
258 ; SSE-NEXT: movaps %xmm0, 0
259 ; SSE-NEXT: ret{{[l|q]}}
263 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
264 ; AVX-NEXT: vmovaps %xmm0, 0
265 ; AVX-NEXT: ret{{[l|q]}}
266 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
267 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
268 store <4 x float> %2, <4 x float>* null
272 @x = external global [4 x i32]
274 define <2 x i64> @test8() nounwind {
275 ; X86-SSE-LABEL: test8:
277 ; X86-SSE-NEXT: movups x, %xmm0
280 ; X86-AVX-LABEL: test8:
282 ; X86-AVX-NEXT: vmovups x, %xmm0
285 ; X64-SSE-LABEL: test8:
287 ; X64-SSE-NEXT: movups {{.*}}(%rip), %xmm0
290 ; X64-AVX-LABEL: test8:
292 ; X64-AVX-NEXT: vmovups {{.*}}(%rip), %xmm0
294 %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
295 %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
296 %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
297 %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
298 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
299 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
300 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
301 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
302 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
306 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
307 ; X86-SSE-LABEL: test9:
309 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
312 ; X86-AVX-LABEL: test9:
314 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
317 ; X64-SSE-LABEL: test9:
319 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
320 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
321 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
324 ; X64-AVX-LABEL: test9:
326 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
327 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
328 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
330 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
331 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
332 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
333 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
334 ret <4 x float> %tmp13
337 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
338 ; X86-SSE-LABEL: test10:
340 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
343 ; X86-AVX-LABEL: test10:
345 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
348 ; X64-SSE-LABEL: test10:
350 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
351 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
352 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
355 ; X64-AVX-LABEL: test10:
357 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
358 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
359 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
361 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
362 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
363 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
364 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
365 ret <4 x float> %tmp13
368 define <2 x double> @test11(double %a, double %b) nounwind {
369 ; X86-SSE-LABEL: test11:
371 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
374 ; X86-AVX-LABEL: test11:
376 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
379 ; X64-SSE-LABEL: test11:
381 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
384 ; X64-AVX-LABEL: test11:
386 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
388 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
389 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
390 ret <2 x double> %tmp7
393 define void @test12() nounwind {
396 ; SSE-NEXT: movapd 0, %xmm0
397 ; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
398 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
399 ; SSE-NEXT: xorps %xmm2, %xmm2
400 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
401 ; SSE-NEXT: addps %xmm1, %xmm2
402 ; SSE-NEXT: movaps %xmm2, 0
403 ; SSE-NEXT: ret{{[l|q]}}
405 ; AVX1-LABEL: test12:
407 ; AVX1-NEXT: vmovaps 0, %xmm0
408 ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
409 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
410 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
411 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
412 ; AVX1-NEXT: vmovaps %xmm0, 0
413 ; AVX1-NEXT: ret{{[l|q]}}
415 ; AVX512-LABEL: test12:
417 ; AVX512-NEXT: vmovaps 0, %xmm0
418 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
419 ; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
420 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
421 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
422 ; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
423 ; AVX512-NEXT: vmovaps %xmm0, 0
424 ; AVX512-NEXT: ret{{[l|q]}}
425 %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
426 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
427 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
428 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
429 store <4 x float> %tmp4, <4 x float>* null
433 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
434 ; X86-SSE-LABEL: test13:
436 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
437 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
438 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
439 ; X86-SSE-NEXT: movaps (%edx), %xmm0
440 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
441 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
442 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
445 ; X86-AVX-LABEL: test13:
447 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
448 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
449 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
450 ; X86-AVX-NEXT: vmovaps (%edx), %xmm0
451 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
452 ; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
453 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
456 ; X64-SSE-LABEL: test13:
458 ; X64-SSE-NEXT: movaps (%rdx), %xmm0
459 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
460 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
461 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
464 ; X64-AVX-LABEL: test13:
466 ; X64-AVX-NEXT: vmovaps (%rdx), %xmm0
467 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
468 ; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
469 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
471 %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1]
472 %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1]
473 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
474 store <4 x float> %tmp11, <4 x float>* %res
478 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
479 ; X86-SSE-LABEL: test14:
481 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
482 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
483 ; X86-SSE-NEXT: movaps (%ecx), %xmm1
484 ; X86-SSE-NEXT: movaps (%eax), %xmm2
485 ; X86-SSE-NEXT: movaps %xmm2, %xmm0
486 ; X86-SSE-NEXT: addps %xmm1, %xmm0
487 ; X86-SSE-NEXT: subps %xmm1, %xmm2
488 ; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
491 ; X86-AVX-LABEL: test14:
493 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
494 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
495 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
496 ; X86-AVX-NEXT: vmovaps (%eax), %xmm1
497 ; X86-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2
498 ; X86-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
499 ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
502 ; X64-SSE-LABEL: test14:
504 ; X64-SSE-NEXT: movaps (%rsi), %xmm1
505 ; X64-SSE-NEXT: movaps (%rdi), %xmm2
506 ; X64-SSE-NEXT: movaps %xmm2, %xmm0
507 ; X64-SSE-NEXT: addps %xmm1, %xmm0
508 ; X64-SSE-NEXT: subps %xmm1, %xmm2
509 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
512 ; X64-AVX-LABEL: test14:
514 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
515 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm1
516 ; X64-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2
517 ; X64-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
518 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
520 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
521 %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
522 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
523 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
524 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
525 ret <4 x float> %tmp27
528 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
529 ; X86-SSE-LABEL: test15:
530 ; X86-SSE: # %bb.0: # %entry
531 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
532 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
533 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
534 ; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
537 ; X86-AVX-LABEL: test15:
538 ; X86-AVX: # %bb.0: # %entry
539 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
540 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
541 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
542 ; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
545 ; X64-SSE-LABEL: test15:
546 ; X64-SSE: # %bb.0: # %entry
547 ; X64-SSE-NEXT: movaps (%rdi), %xmm0
548 ; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
551 ; X64-AVX-LABEL: test15:
552 ; X64-AVX: # %bb.0: # %entry
553 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
554 ; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
557 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1]
558 %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1]
559 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
560 ret <4 x float> %tmp4
565 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
566 ; X86-SSE-LABEL: test16:
568 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
569 ; X86-SSE-NEXT: movaps 96(%eax), %xmm0
570 ; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
573 ; X86-AVX-LABEL: test16:
575 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
576 ; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0
577 ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
580 ; X64-SSE-LABEL: test16:
582 ; X64-SSE-NEXT: movaps 96(%rdi), %xmm0
583 ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
586 ; X64-AVX-LABEL: test16:
588 ; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0
589 ; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
591 %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
592 %i6 = load <4 x double>, <4 x double>* %i5, align 32
593 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
598 define fastcc void @test17() nounwind {
599 ; X86-SSE-LABEL: test17:
600 ; X86-SSE: # %bb.0: # %entry
601 ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
602 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
605 ; X86-AVX1-LABEL: test17:
606 ; X86-AVX1: # %bb.0: # %entry
607 ; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
608 ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax)
609 ; X86-AVX1-NEXT: retl
611 ; X86-AVX512-LABEL: test17:
612 ; X86-AVX512: # %bb.0: # %entry
613 ; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
614 ; X86-AVX512-NEXT: vmovaps %xmm0, (%eax)
615 ; X86-AVX512-NEXT: retl
617 ; X64-SSE-LABEL: test17:
618 ; X64-SSE: # %bb.0: # %entry
619 ; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
620 ; X64-SSE-NEXT: movaps %xmm0, (%rax)
623 ; X64-AVX1-LABEL: test17:
624 ; X64-AVX1: # %bb.0: # %entry
625 ; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
626 ; X64-AVX1-NEXT: vmovaps %xmm0, (%rax)
627 ; X64-AVX1-NEXT: retq
629 ; X64-AVX512-LABEL: test17:
630 ; X64-AVX512: # %bb.0: # %entry
631 ; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
632 ; X64-AVX512-NEXT: vmovaps %xmm0, (%rax)
633 ; X64-AVX512-NEXT: retq
635 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
636 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
637 %2 = bitcast <4 x i32> %1 to <4 x float>
638 store <4 x float> %2, <4 x float> * undef
643 define <4 x float> @f(<4 x double>) nounwind {
645 ; SSE: # %bb.0: # %entry
646 ; SSE-NEXT: cvtpd2ps %xmm1, %xmm1
647 ; SSE-NEXT: cvtpd2ps %xmm0, %xmm0
648 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
649 ; SSE-NEXT: ret{{[l|q]}}
652 ; AVX: # %bb.0: # %entry
653 ; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
654 ; AVX-NEXT: vzeroupper
655 ; AVX-NEXT: ret{{[l|q]}}
657 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
658 ret <4 x float> %double2float.i
661 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
662 ; SSE-LABEL: test_insert_64_zext:
664 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
665 ; SSE-NEXT: ret{{[l|q]}}
667 ; AVX-LABEL: test_insert_64_zext:
669 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
670 ; AVX-NEXT: ret{{[l|q]}}
671 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
675 define <4 x i32> @PR19721(<4 x i32> %i) {
676 ; X86-SSE-LABEL: PR19721:
678 ; X86-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
681 ; X86-AVX-LABEL: PR19721:
683 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
684 ; X86-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
687 ; X64-SSE-LABEL: PR19721:
689 ; X64-SSE-NEXT: movq %xmm0, %rax
690 ; X64-SSE-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
691 ; X64-SSE-NEXT: andq %rax, %rcx
692 ; X64-SSE-NEXT: movq %rcx, %xmm1
693 ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
696 ; X64-AVX1-LABEL: PR19721:
698 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
699 ; X64-AVX1-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
700 ; X64-AVX1-NEXT: andq %rax, %rcx
701 ; X64-AVX1-NEXT: vmovq %rcx, %xmm1
702 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
703 ; X64-AVX1-NEXT: retq
705 ; X64-AVX512-LABEL: PR19721:
706 ; X64-AVX512: # %bb.0:
707 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
708 ; X64-AVX512-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
709 ; X64-AVX512-NEXT: andq %rax, %rcx
710 ; X64-AVX512-NEXT: vmovq %rcx, %xmm1
711 ; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
712 ; X64-AVX512-NEXT: retq
713 %bc = bitcast <4 x i32> %i to i128
714 %insert = and i128 %bc, -4294967296
715 %bc2 = bitcast i128 %insert to <4 x i32>
719 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
720 ; SSE-LABEL: test_mul:
722 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
723 ; SSE-NEXT: pmuludq %xmm1, %xmm0
724 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
725 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
726 ; SSE-NEXT: pmuludq %xmm2, %xmm1
727 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
728 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
729 ; SSE-NEXT: ret{{[l|q]}}
731 ; AVX-LABEL: test_mul:
733 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
734 ; AVX-NEXT: ret{{[l|q]}}
735 %m = mul <4 x i32> %x, %y