1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
3 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1,X86-AVX1
4 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512,X86-AVX512
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512
9 ; Tests for SSE2 and below, without SSE3+.
; test1: replace element 0 of a <2 x double> loaded from %A with scalar %B,
; store to %r. Checks movlps (x86) / shufpd (x64 SSE) / vblendps (x64 AVX).
11 define void @test1(ptr %r, ptr %A, double %B) nounwind {
12 ; X86-SSE-LABEL: test1:
14 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
15 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
16 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
17 ; X86-SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
18 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
21 ; X86-AVX-LABEL: test1:
23 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
24 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
25 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
26 ; X86-AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
27 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
30 ; X64-SSE-LABEL: test1:
32 ; X64-SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
33 ; X64-SSE-NEXT: movapd %xmm0, (%rdi)
36 ; X64-AVX-LABEL: test1:
38 ; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
39 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
41 %tmp3 = load <2 x double>, ptr %A, align 16
42 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
43 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
44 store <2 x double> %tmp9, ptr %r, align 16
; test2: replace element 1 of a <2 x double> loaded from %A with scalar %B
; (shuffle mask <0,2>). Checks movhps (x86) / movlhps (x64) lowering.
48 define void @test2(ptr %r, ptr %A, double %B) nounwind {
49 ; X86-SSE-LABEL: test2:
51 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
52 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
53 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
54 ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
55 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
58 ; X86-AVX-LABEL: test2:
60 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
61 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
62 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
63 ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
64 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
67 ; X64-SSE-LABEL: test2:
69 ; X64-SSE-NEXT: movaps (%rsi), %xmm1
70 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
71 ; X64-SSE-NEXT: movaps %xmm1, (%rdi)
74 ; X64-AVX-LABEL: test2:
76 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm1
77 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
78 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
80 %tmp3 = load <2 x double>, ptr %A, align 16
81 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
82 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
83 store <2 x double> %tmp9, ptr %r, align 16
; test3: interleaves elements 0 and 1 of *%A with elements 0 and 1 of *%B,
; built via a scalar extract/insert chain — checks it is recognized and
; lowered to a single unpcklps.
88 define void @test3(ptr %res, ptr %A, ptr %B) nounwind {
89 ; X86-SSE-LABEL: test3:
91 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
92 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
93 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
94 ; X86-SSE-NEXT: movaps (%edx), %xmm0
95 ; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
96 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
99 ; X86-AVX-LABEL: test3:
101 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
103 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
104 ; X86-AVX-NEXT: vmovaps (%edx), %xmm0
105 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
106 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
109 ; X64-SSE-LABEL: test3:
111 ; X64-SSE-NEXT: movaps (%rsi), %xmm0
112 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
113 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
116 ; X64-AVX-LABEL: test3:
118 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
119 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
120 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
122 %tmp = load <4 x float>, ptr %B ; <<4 x float>> [#uses=2]
123 %tmp3 = load <4 x float>, ptr %A ; <<4 x float>> [#uses=2]
124 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
125 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
126 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
127 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
128 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
129 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
130 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
131 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
132 store <4 x float> %tmp13, ptr %res
; test4: single-input shuffle <2,6,3,7> (second operand undef) — checks it
; collapses to one shufps with the undef lanes filled as [2,1,3,3].
136 define void @test4(<4 x float> %X, ptr %res) nounwind {
137 ; X86-SSE-LABEL: test4:
139 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
140 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
141 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
144 ; X86-AVX-LABEL: test4:
146 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
147 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
148 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
151 ; X64-SSE-LABEL: test4:
153 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
154 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
157 ; X64-AVX-LABEL: test4:
159 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
160 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
162 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
163 store <4 x float> %tmp5, ptr %res
; test5: loads a float through a ptr-to-ptr, pads the vector with zeros, then
; interleaves its bytes and words with zero via shufflevector. Checks the
; punpcklbw/punpcklwd (SSE2) vs vpmovzxbw+vpunpcklwd (AVX) lowering.
167 define <4 x i32> @test5(ptr %ptr) nounwind {
168 ; X86-SSE-LABEL: test5:
170 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
171 ; X86-SSE-NEXT: movl (%eax), %eax
172 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
173 ; X86-SSE-NEXT: pxor %xmm0, %xmm0
174 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
175 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
178 ; X86-AVX-LABEL: test5:
180 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
181 ; X86-AVX-NEXT: movl (%eax), %eax
182 ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
183 ; X86-AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
184 ; X86-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
185 ; X86-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
188 ; X64-SSE-LABEL: test5:
190 ; X64-SSE-NEXT: movq (%rdi), %rax
191 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
192 ; X64-SSE-NEXT: pxor %xmm0, %xmm0
193 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
194 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
197 ; X64-AVX-LABEL: test5:
199 ; X64-AVX-NEXT: movq (%rdi), %rax
200 ; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
201 ; X64-AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
202 ; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
203 ; X64-AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
205 %tmp = load ptr, ptr %ptr ; <ptr> [#uses=1]
206 %tmp.upgrd.2 = load float, ptr %tmp ; <float> [#uses=1]
207 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
208 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
209 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
210 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
211 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
212 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
213 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
214 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
215 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle mask <0,5,6,7> with an undef second operand — only lane 0 is
; defined, so the shuffle should fold away to a plain load + store (no shuffle
; instruction in the checked output).
219 define void @test6(ptr %res, ptr %A) nounwind {
220 ; X86-SSE-LABEL: test6:
222 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
223 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
224 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
225 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
228 ; X86-AVX-LABEL: test6:
230 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
231 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
232 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
233 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
236 ; X64-SSE-LABEL: test6:
238 ; X64-SSE-NEXT: movaps (%rsi), %xmm0
239 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
242 ; X64-AVX-LABEL: test6:
244 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
245 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
247 %tmp1 = load <4 x float>, ptr %A ; <<4 x float>> [#uses=1]
248 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
249 store <4 x float> %tmp2, ptr %res
; test7: zero-splat shuffle of an all-zero vector stored to address null —
; checks it constant-folds to xorps + store of xmm0 at absolute address 0.
253 define void @test7() nounwind {
256 ; SSE-NEXT: xorps %xmm0, %xmm0
257 ; SSE-NEXT: movaps %xmm0, 0
258 ; SSE-NEXT: ret{{[l|q]}}
262 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
263 ; AVX-NEXT: vmovaps %xmm0, 0
264 ; AVX-NEXT: ret{{[l|q]}}
265 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
266 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
267 store <4 x float> %2, ptr null
271 @x = external dso_local global [4 x i32]
; test8: four scalar i32 loads from consecutive elements of @x, reassembled
; into a vector — checks they merge into one unaligned vector load (movups).
273 define <2 x i64> @test8() nounwind {
274 ; X86-SSE-LABEL: test8:
276 ; X86-SSE-NEXT: movups x, %xmm0
279 ; X86-AVX-LABEL: test8:
281 ; X86-AVX-NEXT: vmovups x, %xmm0
284 ; X64-SSE-LABEL: test8:
286 ; X64-SSE-NEXT: movups x(%rip), %xmm0
289 ; X64-AVX-LABEL: test8:
291 ; X64-AVX-NEXT: vmovups x(%rip), %xmm0
293 %tmp = load i32, ptr @x ; <i32> [#uses=1]
294 %tmp3 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 1) ; <i32> [#uses=1]
295 %tmp5 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 2) ; <i32> [#uses=1]
296 %tmp7 = load i32, ptr getelementptr ([4 x i32], ptr @x, i32 0, i32 3) ; <i32> [#uses=1]
297 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
298 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
299 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
300 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
301 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
; test9: build a <4 x float> from four scalar float args (after a dummy i32
; arg). On x86 the args live on the stack, so one movups; on x64 they arrive
; in xmm0-xmm3 and are merged with unpcklps/movlhps (SSE) or vinsertps (AVX).
305 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
306 ; X86-SSE-LABEL: test9:
308 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
311 ; X86-AVX-LABEL: test9:
313 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
316 ; X64-SSE-LABEL: test9:
318 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
319 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
320 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
323 ; X64-AVX-LABEL: test9:
325 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
326 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
327 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
329 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
330 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
331 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
332 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
333 ret <4 x float> %tmp13
; test10: same build_vector as test9 but without the leading dummy i32 arg —
; verifies the lowering is identical regardless of the extra integer argument.
336 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
337 ; X86-SSE-LABEL: test10:
339 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
342 ; X86-AVX-LABEL: test10:
344 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
347 ; X64-SSE-LABEL: test10:
349 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
350 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
351 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
354 ; X64-AVX-LABEL: test10:
356 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
357 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
358 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
360 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
361 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
362 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
363 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
364 ret <4 x float> %tmp13
; test11: build a <2 x double> from two scalar doubles — movups from the
; stack on x86, a single movlhps on x64.
367 define <2 x double> @test11(double %a, double %b) nounwind {
368 ; X86-SSE-LABEL: test11:
370 ; X86-SSE-NEXT: movups {{[0-9]+}}(%esp), %xmm0
373 ; X86-AVX-LABEL: test11:
375 ; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
378 ; X64-SSE-LABEL: test11:
380 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
383 ; X64-AVX-LABEL: test11:
385 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
387 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
388 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
389 ret <2 x double> %tmp7
; test12: loads a <4 x float> from null, blends its low half with a 1.0 splat
; (mask <0,1,6,7>), extracts its high half with zero fill (mask <2,3,6,7>),
; adds the two, and stores back to null — checks movsd/movhlps (SSE) vs
; vblendps/vunpckhpd (AVX) selection for the two shuffles.
392 define void @test12() nounwind {
395 ; SSE-NEXT: movapd 0, %xmm0
396 ; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
397 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
398 ; SSE-NEXT: xorps %xmm2, %xmm2
399 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
400 ; SSE-NEXT: addps %xmm1, %xmm2
401 ; SSE-NEXT: movaps %xmm2, 0
402 ; SSE-NEXT: ret{{[l|q]}}
404 ; AVX1-LABEL: test12:
406 ; AVX1-NEXT: vmovaps 0, %xmm0
407 ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
408 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
409 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
410 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
411 ; AVX1-NEXT: vmovaps %xmm0, 0
412 ; AVX1-NEXT: ret{{[l|q]}}
414 ; AVX512-LABEL: test12:
416 ; AVX512-NEXT: vmovaps 0, %xmm0
417 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
418 ; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
419 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
420 ; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
421 ; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
422 ; AVX512-NEXT: vmovaps %xmm0, 0
423 ; AVX512-NEXT: ret{{[l|q]}}
424 %tmp1 = load <4 x float>, ptr null ; <<4 x float>> [#uses=2]
425 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
426 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
427 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
428 store <4 x float> %tmp4, ptr null
; test13: cross-input shuffle <1,4,1,5> of two loaded <4 x float> — checks it
; lowers to two shufps. (%A is unused by the IR body; only %B and %C feed the
; shuffle.)
432 define void @test13(ptr %res, ptr %A, ptr %B, ptr %C) nounwind {
433 ; X86-SSE-LABEL: test13:
435 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
436 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
437 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
438 ; X86-SSE-NEXT: movaps (%edx), %xmm0
439 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
440 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
441 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
444 ; X86-AVX-LABEL: test13:
446 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
447 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
448 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
449 ; X86-AVX-NEXT: vmovaps (%edx), %xmm0
450 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
451 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
452 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
455 ; X64-SSE-LABEL: test13:
457 ; X64-SSE-NEXT: movaps (%rdx), %xmm0
458 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
459 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
460 ; X64-SSE-NEXT: movaps %xmm0, (%rdi)
463 ; X64-AVX-LABEL: test13:
465 ; X64-AVX-NEXT: vmovaps (%rdx), %xmm0
466 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
467 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
468 ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
470 %tmp3 = load <4 x float>, ptr %B ; <<4 x float>> [#uses=1]
471 %tmp5 = load <4 x float>, ptr %C ; <<4 x float>> [#uses=1]
472 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
473 store <4 x float> %tmp11, ptr %res
; test14: computes x+y and x-y on loaded <4 x float>s and concatenates their
; low halves (mask <0,1,4,5>) — checks the combine lowers to a single movlhps.
477 define <4 x float> @test14(ptr %x, ptr %y) nounwind {
478 ; X86-SSE-LABEL: test14:
480 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
481 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
482 ; X86-SSE-NEXT: movaps (%ecx), %xmm1
483 ; X86-SSE-NEXT: movaps (%eax), %xmm2
484 ; X86-SSE-NEXT: movaps %xmm2, %xmm0
485 ; X86-SSE-NEXT: addps %xmm1, %xmm0
486 ; X86-SSE-NEXT: subps %xmm1, %xmm2
487 ; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
490 ; X86-AVX-LABEL: test14:
492 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
493 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
494 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
495 ; X86-AVX-NEXT: vmovaps (%eax), %xmm1
496 ; X86-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2
497 ; X86-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
498 ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
501 ; X64-SSE-LABEL: test14:
503 ; X64-SSE-NEXT: movaps (%rsi), %xmm1
504 ; X64-SSE-NEXT: movaps (%rdi), %xmm2
505 ; X64-SSE-NEXT: movaps %xmm2, %xmm0
506 ; X64-SSE-NEXT: addps %xmm1, %xmm0
507 ; X64-SSE-NEXT: subps %xmm1, %xmm2
508 ; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
511 ; X64-AVX-LABEL: test14:
513 ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0
514 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm1
515 ; X64-AVX-NEXT: vaddps %xmm0, %xmm1, %xmm2
516 ; X64-AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
517 ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
519 %tmp = load <4 x float>, ptr %y ; <<4 x float>> [#uses=2]
520 %tmp5 = load <4 x float>, ptr %x ; <<4 x float>> [#uses=2]
521 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
522 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
523 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
524 ret <4 x float> %tmp27
; test15: concatenates the high halves of two loaded <4 x float>s
; (mask <2,3,6,7>) — checks it lowers to a single unpckhpd with a folded
; memory operand.
527 define <4 x float> @test15(ptr %x, ptr %y) nounwind {
528 ; X86-SSE-LABEL: test15:
529 ; X86-SSE: # %bb.0: # %entry
530 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
531 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
532 ; X86-SSE-NEXT: movaps (%ecx), %xmm0
533 ; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
536 ; X86-AVX-LABEL: test15:
537 ; X86-AVX: # %bb.0: # %entry
538 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
539 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
540 ; X86-AVX-NEXT: vmovaps (%ecx), %xmm0
541 ; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
544 ; X64-SSE-LABEL: test15:
545 ; X64-SSE: # %bb.0: # %entry
546 ; X64-SSE-NEXT: movaps (%rdi), %xmm0
547 ; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
550 ; X64-AVX-LABEL: test15:
551 ; X64-AVX: # %bb.0: # %entry
552 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
553 ; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
556 %tmp = load <4 x float>, ptr %y ; <<4 x float>> [#uses=1]
557 %tmp3 = load <4 x float>, ptr %x ; <<4 x float>> [#uses=1]
558 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
559 ret <4 x float> %tmp4
; test16: loads a <4 x double> at element offset 3 (byte offset 96) and
; extracts its even elements <0,2> — checks the 256-bit load is split and
; lowered to a 128-bit movaps + unpcklpd with a folded memory operand.
564 define <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) {
565 ; X86-SSE-LABEL: test16:
567 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
568 ; X86-SSE-NEXT: movaps 96(%eax), %xmm0
569 ; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
572 ; X86-AVX-LABEL: test16:
574 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
575 ; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0
576 ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
579 ; X64-SSE-LABEL: test16:
581 ; X64-SSE-NEXT: movaps 96(%rdi), %xmm0
582 ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
585 ; X64-AVX-LABEL: test16:
587 ; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0
588 ; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
590 %i5 = getelementptr inbounds <4 x double>, ptr %srcA, i32 3
591 %i6 = load <4 x double>, ptr %i5, align 32
592 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle of the constant <u,u,32768,32768> with an insertelement of
; undef, stored through an undef pointer — checks constant materialization:
; SSE keeps the undef lanes (<u,u,32768,32768>), AVX broadcasts 32768.
597 define fastcc void @test17() nounwind {
598 ; X86-SSE-LABEL: test17:
599 ; X86-SSE: # %bb.0: # %entry
600 ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
601 ; X86-SSE-NEXT: movaps %xmm0, (%eax)
604 ; X86-AVX-LABEL: test17:
605 ; X86-AVX: # %bb.0: # %entry
606 ; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
607 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
610 ; X64-SSE-LABEL: test17:
611 ; X64-SSE: # %bb.0: # %entry
612 ; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
613 ; X64-SSE-NEXT: movaps %xmm0, (%rax)
616 ; X64-AVX-LABEL: test17:
617 ; X64-AVX: # %bb.0: # %entry
618 ; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
619 ; X64-AVX-NEXT: vmovaps %xmm0, (%rax)
622 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
623 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
624 %2 = bitcast <4 x i32> %1 to <4 x float>
625 store <4 x float> %2, ptr undef
; f: fptrunc <4 x double> -> <4 x float>. SSE splits into two cvtpd2ps plus an
; unpcklpd merge; AVX does one ymm vcvtpd2ps followed by vzeroupper.
630 define <4 x float> @f(<4 x double>) nounwind {
632 ; SSE: # %bb.0: # %entry
633 ; SSE-NEXT: cvtpd2ps %xmm1, %xmm1
634 ; SSE-NEXT: cvtpd2ps %xmm0, %xmm0
635 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
636 ; SSE-NEXT: ret{{[l|q]}}
639 ; AVX: # %bb.0: # %entry
640 ; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
641 ; AVX-NEXT: vzeroupper
642 ; AVX-NEXT: ret{{[l|q]}}
644 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
645 ret <4 x float> %double2float.i
; test_insert_64_zext: shuffle <0,2> against <0, undef> keeps element 0 and
; zeroes element 1 — checks it lowers to a single (v)movq.
648 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
649 ; SSE-LABEL: test_insert_64_zext:
651 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
652 ; SSE-NEXT: ret{{[l|q]}}
654 ; AVX-LABEL: test_insert_64_zext:
656 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
657 ; AVX-NEXT: ret{{[l|q]}}
658 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
; PR19721: and of an i128-bitcast vector with -4294967296 clears the low
; 32 bits (element 0) — checks SSE uses a constant-pool andps while AVX
; recognizes it as a zero-blend of element 0.
662 define <4 x i32> @PR19721(<4 x i32> %i) {
663 ; X86-SSE-LABEL: PR19721:
665 ; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
668 ; AVX-LABEL: PR19721:
670 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
671 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
672 ; AVX-NEXT: ret{{[l|q]}}
674 ; X64-SSE-LABEL: PR19721:
676 ; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
678 %bc = bitcast <4 x i32> %i to i128
679 %insert = and i128 %bc, -4294967296
680 %bc2 = bitcast i128 %insert to <4 x i32>
; test_mul: <4 x i32> multiply. SSE2 has no 32-bit element multiply, so the
; checked lowering expands to pmuludq on even/odd lanes plus shuffles; AVX
; uses a single vpmulld.
684 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
685 ; SSE-LABEL: test_mul:
687 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
688 ; SSE-NEXT: pmuludq %xmm1, %xmm0
689 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
690 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
691 ; SSE-NEXT: pmuludq %xmm2, %xmm1
692 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
693 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
694 ; SSE-NEXT: ret{{[l|q]}}
696 ; AVX-LABEL: test_mul:
698 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
699 ; AVX-NEXT: ret{{[l|q]}}
700 %m = mul <4 x i32> %x, %y
703 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: