1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
3 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1
4 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512
5 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
6 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1
7 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512
9 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c
; Blend written as a shufflevector on <8 x i16>: mask indices 9, 11 and 13 take
; lanes 1, 3 and 5 from %a1, every other lane comes from %a0. The checks expect
; a single pblendw/vpblendw.
11 define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
12 ; SSE-LABEL: test_mm_blend_epi16:
14 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
15 ; SSE-NEXT: ret{{[l|q]}}
17 ; AVX-LABEL: test_mm_blend_epi16:
19 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
20 ; AVX-NEXT: ret{{[l|q]}}
21 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
22 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
23 %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
24 %res = bitcast <8 x i16> %shuf to <2 x i64>
; Two-lane blend via shufflevector mask <0, 3>: lane 0 from %a0, lane 1 from %a1.
; The checks expect blendps/vblendps covering the same 64-bit halves.
28 define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
29 ; SSE-LABEL: test_mm_blend_pd:
31 ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
32 ; SSE-NEXT: ret{{[l|q]}}
34 ; AVX-LABEL: test_mm_blend_pd:
36 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
37 ; AVX-NEXT: ret{{[l|q]}}
38 %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
; Four-lane float blend via shufflevector mask <0, 5, 6, 3>: lanes 1 and 2 come
; from %a1, lanes 0 and 3 from %a0. The checks expect blendps/vblendps.
42 define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
43 ; SSE-LABEL: test_mm_blend_ps:
45 ; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
46 ; SSE-NEXT: ret{{[l|q]}}
48 ; AVX-LABEL: test_mm_blend_ps:
50 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
51 ; AVX-NEXT: ret{{[l|q]}}
52 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
; Byte blend through @llvm.x86.sse41.pblendvb with %a2 as the third operand.
; The SSE checks show %a2 copied into xmm0 before pblendvb and the result copied
; back into xmm0; the AVX checks expect one vpblendvb.
56 define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
57 ; SSE-LABEL: test_mm_blendv_epi8:
59 ; SSE-NEXT: movdqa %xmm0, %xmm3
60 ; SSE-NEXT: movaps %xmm2, %xmm0
61 ; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm3
62 ; SSE-NEXT: movdqa %xmm3, %xmm0
63 ; SSE-NEXT: ret{{[l|q]}}
65 ; AVX-LABEL: test_mm_blendv_epi8:
67 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
68 ; AVX-NEXT: ret{{[l|q]}}
69 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
70 %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
71 %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
72 %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
73 %res = bitcast <16 x i8> %call to <2 x i64>
; Intrinsic called by @test_mm_blendv_epi8.
76 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
; Double blend through @llvm.x86.sse41.blendvpd with %a2 as the third operand.
; The SSE checks show %a2 copied into xmm0 before blendvpd and the result copied
; back into xmm0; the AVX checks expect one vblendvpd.
78 define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
79 ; SSE-LABEL: test_mm_blendv_pd:
81 ; SSE-NEXT: movapd %xmm0, %xmm3
82 ; SSE-NEXT: movaps %xmm2, %xmm0
83 ; SSE-NEXT: blendvpd %xmm0, %xmm1, %xmm3
84 ; SSE-NEXT: movapd %xmm3, %xmm0
85 ; SSE-NEXT: ret{{[l|q]}}
87 ; AVX-LABEL: test_mm_blendv_pd:
89 ; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
90 ; AVX-NEXT: ret{{[l|q]}}
91 %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
; Intrinsic called by @test_mm_blendv_pd.
94 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; Float blend through @llvm.x86.sse41.blendvps with %a2 as the third operand.
; The SSE checks show %a2 copied into xmm0 before blendvps and the result copied
; back into xmm0; the AVX checks expect one vblendvps.
96 define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
97 ; SSE-LABEL: test_mm_blendv_ps:
99 ; SSE-NEXT: movaps %xmm0, %xmm3
100 ; SSE-NEXT: movaps %xmm2, %xmm0
101 ; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3
102 ; SSE-NEXT: movaps %xmm3, %xmm0
103 ; SSE-NEXT: ret{{[l|q]}}
105 ; AVX-LABEL: test_mm_blendv_ps:
107 ; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
108 ; AVX-NEXT: ret{{[l|q]}}
109 %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
; Intrinsic called by @test_mm_blendv_ps.
112 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; Calls @llvm.x86.sse41.round.pd with immediate 2; the checks expect
; roundpd/vroundpd $2.
114 define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
115 ; SSE-LABEL: test_mm_ceil_pd:
117 ; SSE-NEXT: roundpd $2, %xmm0, %xmm0
118 ; SSE-NEXT: ret{{[l|q]}}
120 ; AVX-LABEL: test_mm_ceil_pd:
122 ; AVX-NEXT: vroundpd $2, %xmm0, %xmm0
123 ; AVX-NEXT: ret{{[l|q]}}
124 %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
125 ret <2 x double> %res
; Shared by @test_mm_ceil_pd (imm 2), @test_mm_floor_pd (imm 1) and
; @test_mm_round_pd (imm 4).
127 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
; Calls @llvm.x86.sse41.round.ps with immediate 2; the checks expect
; roundps/vroundps $2.
129 define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
130 ; SSE-LABEL: test_mm_ceil_ps:
132 ; SSE-NEXT: roundps $2, %xmm0, %xmm0
133 ; SSE-NEXT: ret{{[l|q]}}
135 ; AVX-LABEL: test_mm_ceil_ps:
137 ; AVX-NEXT: vroundps $2, %xmm0, %xmm0
138 ; AVX-NEXT: ret{{[l|q]}}
139 %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
; Shared by @test_mm_ceil_ps (imm 2), @test_mm_floor_ps (imm 1) and
; @test_mm_round_ps (imm 4).
142 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
; Calls @llvm.x86.sse41.round.sd on (%a0, %a1) with immediate 2; the checks
; expect roundsd/vroundsd $2 with %a1 (xmm1) as the source operand.
144 define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
145 ; SSE-LABEL: test_mm_ceil_sd:
147 ; SSE-NEXT: roundsd $2, %xmm1, %xmm0
148 ; SSE-NEXT: ret{{[l|q]}}
150 ; AVX-LABEL: test_mm_ceil_sd:
152 ; AVX-NEXT: vroundsd $2, %xmm1, %xmm0, %xmm0
153 ; AVX-NEXT: ret{{[l|q]}}
154 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
155 ret <2 x double> %res
; Shared by @test_mm_ceil_sd (imm 2), @test_mm_floor_sd (imm 1) and
; @test_mm_round_sd (imm 4).
157 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
; Calls @llvm.x86.sse41.round.ss on (%a0, %a1) with immediate 2; the checks
; expect roundss/vroundss $2 with %a1 (xmm1) as the source operand.
159 define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
160 ; SSE-LABEL: test_mm_ceil_ss:
162 ; SSE-NEXT: roundss $2, %xmm1, %xmm0
163 ; SSE-NEXT: ret{{[l|q]}}
165 ; AVX-LABEL: test_mm_ceil_ss:
167 ; AVX-NEXT: vroundss $2, %xmm1, %xmm0, %xmm0
168 ; AVX-NEXT: ret{{[l|q]}}
169 %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
; Shared by @test_mm_ceil_ss (imm 2), @test_mm_floor_ss (imm 1) and
; @test_mm_round_ss (imm 4).
172 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
; Lane-wise i64 equality (icmp eq) sign-extended back to <2 x i64>. The SSE4.1 and
; plain AVX runs expect pcmpeqq/vpcmpeqq; the AVX-512 runs expect a compare into
; %k0 followed by vpmovm2q, hence the separate AVX1/AVX512 check prefixes.
174 define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
175 ; SSE-LABEL: test_mm_cmpeq_epi64:
177 ; SSE-NEXT: pcmpeqq %xmm1, %xmm0
178 ; SSE-NEXT: ret{{[l|q]}}
180 ; AVX1-LABEL: test_mm_cmpeq_epi64:
182 ; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
183 ; AVX1-NEXT: ret{{[l|q]}}
185 ; AVX512-LABEL: test_mm_cmpeq_epi64:
187 ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
188 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
189 ; AVX512-NEXT: ret{{[l|q]}}
190 %cmp = icmp eq <2 x i64> %a0, %a1
191 %res = sext <2 x i1> %cmp to <2 x i64>
; Sign-extends the low eight i8 lanes of %a0 to i16 (shufflevector + sext); the
; checks expect pmovsxbw/vpmovsxbw.
195 define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
196 ; SSE-LABEL: test_mm_cvtepi8_epi16:
198 ; SSE-NEXT: pmovsxbw %xmm0, %xmm0
199 ; SSE-NEXT: ret{{[l|q]}}
201 ; AVX-LABEL: test_mm_cvtepi8_epi16:
203 ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
204 ; AVX-NEXT: ret{{[l|q]}}
205 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
206 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
207 %sext = sext <8 x i8> %ext0 to <8 x i16>
208 %res = bitcast <8 x i16> %sext to <2 x i64>
; Sign-extends the low four i8 lanes of %a0 to i32 (shufflevector + sext); the
; checks expect pmovsxbd/vpmovsxbd.
212 define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
213 ; SSE-LABEL: test_mm_cvtepi8_epi32:
215 ; SSE-NEXT: pmovsxbd %xmm0, %xmm0
216 ; SSE-NEXT: ret{{[l|q]}}
218 ; AVX-LABEL: test_mm_cvtepi8_epi32:
220 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
221 ; AVX-NEXT: ret{{[l|q]}}
222 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
223 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
224 %sext = sext <4 x i8> %ext0 to <4 x i32>
225 %res = bitcast <4 x i32> %sext to <2 x i64>
; Sign-extends the low two i8 lanes of %a0 to i64 (shufflevector + sext); the
; checks expect pmovsxbq/vpmovsxbq.
229 define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
230 ; SSE-LABEL: test_mm_cvtepi8_epi64:
232 ; SSE-NEXT: pmovsxbq %xmm0, %xmm0
233 ; SSE-NEXT: ret{{[l|q]}}
235 ; AVX-LABEL: test_mm_cvtepi8_epi64:
237 ; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
238 ; AVX-NEXT: ret{{[l|q]}}
239 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
240 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
241 %sext = sext <2 x i8> %ext0 to <2 x i64>
; Sign-extends the low four i16 lanes of %a0 to i32 (shufflevector + sext); the
; checks expect pmovsxwd/vpmovsxwd.
245 define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
246 ; SSE-LABEL: test_mm_cvtepi16_epi32:
248 ; SSE-NEXT: pmovsxwd %xmm0, %xmm0
249 ; SSE-NEXT: ret{{[l|q]}}
251 ; AVX-LABEL: test_mm_cvtepi16_epi32:
253 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
254 ; AVX-NEXT: ret{{[l|q]}}
255 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
256 %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
257 %sext = sext <4 x i16> %ext0 to <4 x i32>
258 %res = bitcast <4 x i32> %sext to <2 x i64>
; Sign-extends the low two i16 lanes of %a0 to i64 (shufflevector + sext); the
; checks expect pmovsxwq/vpmovsxwq.
262 define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
263 ; SSE-LABEL: test_mm_cvtepi16_epi64:
265 ; SSE-NEXT: pmovsxwq %xmm0, %xmm0
266 ; SSE-NEXT: ret{{[l|q]}}
268 ; AVX-LABEL: test_mm_cvtepi16_epi64:
270 ; AVX-NEXT: vpmovsxwq %xmm0, %xmm0
271 ; AVX-NEXT: ret{{[l|q]}}
272 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
273 %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
274 %sext = sext <2 x i16> %ext0 to <2 x i64>
; Sign-extends the low two i32 lanes of %a0 to i64 (shufflevector + sext); the
; checks expect pmovsxdq/vpmovsxdq.
278 define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
279 ; SSE-LABEL: test_mm_cvtepi32_epi64:
281 ; SSE-NEXT: pmovsxdq %xmm0, %xmm0
282 ; SSE-NEXT: ret{{[l|q]}}
284 ; AVX-LABEL: test_mm_cvtepi32_epi64:
286 ; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
287 ; AVX-NEXT: ret{{[l|q]}}
288 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
289 %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
290 %sext = sext <2 x i32> %ext0 to <2 x i64>
; Zero-extends the low eight i8 lanes of %a0 to i16 (shufflevector + zext); the
; checks expect pmovzxbw/vpmovzxbw. Note that the value named %sext holds a zext
; result here.
294 define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
295 ; SSE-LABEL: test_mm_cvtepu8_epi16:
297 ; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
298 ; SSE-NEXT: ret{{[l|q]}}
300 ; AVX-LABEL: test_mm_cvtepu8_epi16:
302 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
303 ; AVX-NEXT: ret{{[l|q]}}
304 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
305 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
306 %sext = zext <8 x i8> %ext0 to <8 x i16>
307 %res = bitcast <8 x i16> %sext to <2 x i64>
; Zero-extends the low four i8 lanes of %a0 to i32 (shufflevector + zext); the
; checks expect pmovzxbd/vpmovzxbd. Note that the value named %sext holds a zext
; result here.
311 define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
312 ; SSE-LABEL: test_mm_cvtepu8_epi32:
314 ; SSE-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
315 ; SSE-NEXT: ret{{[l|q]}}
317 ; AVX-LABEL: test_mm_cvtepu8_epi32:
319 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
320 ; AVX-NEXT: ret{{[l|q]}}
321 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
322 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
323 %sext = zext <4 x i8> %ext0 to <4 x i32>
324 %res = bitcast <4 x i32> %sext to <2 x i64>
; Zero-extends the low two i8 lanes of %a0 to i64 (shufflevector + zext); the
; checks expect pmovzxbq/vpmovzxbq. Note that the value named %sext holds a zext
; result here.
328 define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
329 ; SSE-LABEL: test_mm_cvtepu8_epi64:
331 ; SSE-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
332 ; SSE-NEXT: ret{{[l|q]}}
334 ; AVX-LABEL: test_mm_cvtepu8_epi64:
336 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
337 ; AVX-NEXT: ret{{[l|q]}}
338 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
339 %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
340 %sext = zext <2 x i8> %ext0 to <2 x i64>
; Zero-extends the low four i16 lanes of %a0 to i32 (shufflevector + zext); the
; checks expect pmovzxwd/vpmovzxwd. Note that the value named %sext holds a zext
; result here.
344 define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
345 ; SSE-LABEL: test_mm_cvtepu16_epi32:
347 ; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
348 ; SSE-NEXT: ret{{[l|q]}}
350 ; AVX-LABEL: test_mm_cvtepu16_epi32:
352 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
353 ; AVX-NEXT: ret{{[l|q]}}
354 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
355 %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
356 %sext = zext <4 x i16> %ext0 to <4 x i32>
357 %res = bitcast <4 x i32> %sext to <2 x i64>
; Zero-extends the low two i16 lanes of %a0 to i64 (shufflevector + zext); the
; checks expect pmovzxwq/vpmovzxwq. Note that the value named %sext holds a zext
; result here.
361 define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
362 ; SSE-LABEL: test_mm_cvtepu16_epi64:
364 ; SSE-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
365 ; SSE-NEXT: ret{{[l|q]}}
367 ; AVX-LABEL: test_mm_cvtepu16_epi64:
369 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
370 ; AVX-NEXT: ret{{[l|q]}}
371 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
372 %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
373 %sext = zext <2 x i16> %ext0 to <2 x i64>
; Zero-extends the low two i32 lanes of %a0 to i64 (shufflevector + zext); the
; checks expect pmovzxdq/vpmovzxdq. Note that the value named %sext holds a zext
; result here.
377 define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
378 ; SSE-LABEL: test_mm_cvtepu32_epi64:
380 ; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
381 ; SSE-NEXT: ret{{[l|q]}}
383 ; AVX-LABEL: test_mm_cvtepu32_epi64:
385 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
386 ; AVX-NEXT: ret{{[l|q]}}
387 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
388 %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
389 %sext = zext <2 x i32> %ext0 to <2 x i64>
; Calls @llvm.x86.sse41.dppd with immediate 7; the checks expect dppd/vdppd $7.
393 define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
394 ; SSE-LABEL: test_mm_dp_pd:
396 ; SSE-NEXT: dppd $7, %xmm1, %xmm0
397 ; SSE-NEXT: ret{{[l|q]}}
399 ; AVX-LABEL: test_mm_dp_pd:
401 ; AVX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
402 ; AVX-NEXT: ret{{[l|q]}}
403 %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
404 ret <2 x double> %res
; Intrinsic called by @test_mm_dp_pd.
406 declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
; Calls @llvm.x86.sse41.dpps with immediate 7; the checks expect dpps/vdpps $7.
408 define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
409 ; SSE-LABEL: test_mm_dp_ps:
411 ; SSE-NEXT: dpps $7, %xmm1, %xmm0
412 ; SSE-NEXT: ret{{[l|q]}}
414 ; AVX-LABEL: test_mm_dp_ps:
416 ; AVX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
417 ; AVX-NEXT: ret{{[l|q]}}
418 %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
; Intrinsic called by @test_mm_dp_ps.
421 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
; Extracts byte lane 1 of %a0 and zero-extends it to i32; the checks expect
; pextrb/vpextrb $1 into eax followed by a movzbl of %al.
423 define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
424 ; SSE-LABEL: test_mm_extract_epi8:
426 ; SSE-NEXT: pextrb $1, %xmm0, %eax
427 ; SSE-NEXT: movzbl %al, %eax
428 ; SSE-NEXT: ret{{[l|q]}}
430 ; AVX-LABEL: test_mm_extract_epi8:
432 ; AVX-NEXT: vpextrb $1, %xmm0, %eax
433 ; AVX-NEXT: movzbl %al, %eax
434 ; AVX-NEXT: ret{{[l|q]}}
435 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
436 %ext = extractelement <16 x i8> %arg0, i32 1
437 %res = zext i8 %ext to i32
; Extracts i32 lane 1 of %a0; the checks expect extractps/vextractps $1 into eax.
441 define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
442 ; SSE-LABEL: test_mm_extract_epi32:
444 ; SSE-NEXT: extractps $1, %xmm0, %eax
445 ; SSE-NEXT: ret{{[l|q]}}
447 ; AVX-LABEL: test_mm_extract_epi32:
449 ; AVX-NEXT: vextractps $1, %xmm0, %eax
450 ; AVX-NEXT: ret{{[l|q]}}
451 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
452 %ext = extractelement <4 x i32> %arg0, i32 1
; Extracts i64 lane 1 directly from the <2 x i64> argument; no intermediate
; bitcast is needed (the previous <4 x i32> bitcast was never used). The i386
; runs expect two extractps/vextractps ($2 and $3) into eax and edx; the x86_64
; runs expect a single pextrq/vpextrq $1 into rax.
456 define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
457 ; X86-SSE-LABEL: test_mm_extract_epi64:
459 ; X86-SSE-NEXT: extractps $2, %xmm0, %eax
460 ; X86-SSE-NEXT: extractps $3, %xmm0, %edx
463 ; X86-AVX-LABEL: test_mm_extract_epi64:
465 ; X86-AVX-NEXT: vextractps $2, %xmm0, %eax
466 ; X86-AVX-NEXT: vextractps $3, %xmm0, %edx
469 ; X64-SSE-LABEL: test_mm_extract_epi64:
471 ; X64-SSE-NEXT: pextrq $1, %xmm0, %rax
474 ; X64-AVX-LABEL: test_mm_extract_epi64:
476 ; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
479 %ext = extractelement <2 x i64> %a0, i32 1
; Extracts float lane 1 of %a0 and bitcasts it to i32; the checks expect a
; movshdup/vmovshdup followed by movd/vmovd into eax.
483 define i32 @test_mm_extract_ps(<4 x float> %a0) {
484 ; SSE-LABEL: test_mm_extract_ps:
486 ; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
487 ; SSE-NEXT: movd %xmm0, %eax
488 ; SSE-NEXT: ret{{[l|q]}}
490 ; AVX-LABEL: test_mm_extract_ps:
492 ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
493 ; AVX-NEXT: vmovd %xmm0, %eax
494 ; AVX-NEXT: ret{{[l|q]}}
495 %ext = extractelement <4 x float> %a0, i32 1
496 %bc = bitcast float %ext to i32
; Calls @llvm.x86.sse41.round.pd with immediate 1; the checks expect
; roundpd/vroundpd $1.
500 define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
501 ; SSE-LABEL: test_mm_floor_pd:
503 ; SSE-NEXT: roundpd $1, %xmm0, %xmm0
504 ; SSE-NEXT: ret{{[l|q]}}
506 ; AVX-LABEL: test_mm_floor_pd:
508 ; AVX-NEXT: vroundpd $1, %xmm0, %xmm0
509 ; AVX-NEXT: ret{{[l|q]}}
510 %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
511 ret <2 x double> %res
; Calls @llvm.x86.sse41.round.ps with immediate 1; the checks expect
; roundps/vroundps $1.
514 define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
515 ; SSE-LABEL: test_mm_floor_ps:
517 ; SSE-NEXT: roundps $1, %xmm0, %xmm0
518 ; SSE-NEXT: ret{{[l|q]}}
520 ; AVX-LABEL: test_mm_floor_ps:
522 ; AVX-NEXT: vroundps $1, %xmm0, %xmm0
523 ; AVX-NEXT: ret{{[l|q]}}
524 %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
; Calls @llvm.x86.sse41.round.sd on (%a0, %a1) with immediate 1; the checks
; expect roundsd/vroundsd $1 with %a1 (xmm1) as the source operand.
528 define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
529 ; SSE-LABEL: test_mm_floor_sd:
531 ; SSE-NEXT: roundsd $1, %xmm1, %xmm0
532 ; SSE-NEXT: ret{{[l|q]}}
534 ; AVX-LABEL: test_mm_floor_sd:
536 ; AVX-NEXT: vroundsd $1, %xmm1, %xmm0, %xmm0
537 ; AVX-NEXT: ret{{[l|q]}}
538 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
539 ret <2 x double> %res
; Calls @llvm.x86.sse41.round.ss on (%a0, %a1) with immediate 1; the checks
; expect roundss/vroundss $1 with %a1 (xmm1) as the source operand.
542 define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
543 ; SSE-LABEL: test_mm_floor_ss:
545 ; SSE-NEXT: roundss $1, %xmm1, %xmm0
546 ; SSE-NEXT: ret{{[l|q]}}
548 ; AVX-LABEL: test_mm_floor_ss:
550 ; AVX-NEXT: vroundss $1, %xmm1, %xmm0, %xmm0
551 ; AVX-NEXT: ret{{[l|q]}}
552 %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
; Inserts the i8 argument %a1 into byte lane 1 of %a0. Checks are per target
; because the scalar is loaded from the stack on i386 and taken from %dil on
; x86_64; both zero-extend it with movzbl before pinsrb/vpinsrb $1.
556 define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
557 ; X86-SSE-LABEL: test_mm_insert_epi8:
559 ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax
560 ; X86-SSE-NEXT: pinsrb $1, %eax, %xmm0
563 ; X86-AVX-LABEL: test_mm_insert_epi8:
565 ; X86-AVX-NEXT: movzbl {{[0-9]+}}(%esp), %eax
566 ; X86-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
569 ; X64-SSE-LABEL: test_mm_insert_epi8:
571 ; X64-SSE-NEXT: movzbl %dil, %eax
572 ; X64-SSE-NEXT: pinsrb $1, %eax, %xmm0
575 ; X64-AVX-LABEL: test_mm_insert_epi8:
577 ; X64-AVX-NEXT: movzbl %dil, %eax
578 ; X64-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
580 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
581 %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1
582 %bc = bitcast <16 x i8> %res to <2 x i64>
; Inserts the i32 argument %a1 into i32 lane 1 of %a0. The checks expect
; pinsrd/vpinsrd $1, reading the scalar from the stack on i386 and from %edi on
; x86_64.
586 define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
587 ; X86-SSE-LABEL: test_mm_insert_epi32:
589 ; X86-SSE-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
592 ; X86-AVX-LABEL: test_mm_insert_epi32:
594 ; X86-AVX-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
597 ; X64-SSE-LABEL: test_mm_insert_epi32:
599 ; X64-SSE-NEXT: pinsrd $1, %edi, %xmm0
602 ; X64-AVX-LABEL: test_mm_insert_epi32:
604 ; X64-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
606 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
607 %res = insertelement <4 x i32> %arg0, i32 %a1,i32 1
608 %bc = bitcast <4 x i32> %res to <2 x i64>
; Inserts the i64 argument %a1 into i64 lane 1 of %a0. The i386 runs expect two
; pinsrd/vpinsrd from the stack ($2 and $3); the x86_64 runs expect a single
; pinsrq/vpinsrq $1 from %rdi.
612 define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
613 ; X86-SSE-LABEL: test_mm_insert_epi64:
615 ; X86-SSE-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
616 ; X86-SSE-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
619 ; X86-AVX-LABEL: test_mm_insert_epi64:
621 ; X86-AVX-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
622 ; X86-AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
625 ; X64-SSE-LABEL: test_mm_insert_epi64:
627 ; X64-SSE-NEXT: pinsrq $1, %rdi, %xmm0
630 ; X64-AVX-LABEL: test_mm_insert_epi64:
632 ; X64-AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
634 %res = insertelement <2 x i64> %a0, i64 %a1,i32 1
; Calls @llvm.x86.sse41.insertps with immediate 4; the checks expect the decoded
; result xmm1[0],xmm0[1],zero,xmm0[3] from insertps/vinsertps.
638 define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
639 ; SSE-LABEL: test_mm_insert_ps:
641 ; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
642 ; SSE-NEXT: ret{{[l|q]}}
644 ; AVX-LABEL: test_mm_insert_ps:
646 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
647 ; AVX-NEXT: ret{{[l|q]}}
648 %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
; Intrinsic called by @test_mm_insert_ps.
651 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
; Signed maximum via @llvm.smax.v16i8 on the operands bitcast to <16 x i8>; the
; checks expect pmaxsb/vpmaxsb.
653 define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
654 ; SSE-LABEL: test_mm_max_epi8:
656 ; SSE-NEXT: pmaxsb %xmm1, %xmm0
657 ; SSE-NEXT: ret{{[l|q]}}
659 ; AVX-LABEL: test_mm_max_epi8:
661 ; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
662 ; AVX-NEXT: ret{{[l|q]}}
663 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
664 %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
665 %sel = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %arg0, <16 x i8> %arg1)
666 %bc = bitcast <16 x i8> %sel to <2 x i64>
; Intrinsic called by @test_mm_max_epi8.
669 declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
; Signed maximum via @llvm.smax.v4i32 on the operands bitcast to <4 x i32>; the
; checks expect pmaxsd/vpmaxsd.
671 define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
672 ; SSE-LABEL: test_mm_max_epi32:
674 ; SSE-NEXT: pmaxsd %xmm1, %xmm0
675 ; SSE-NEXT: ret{{[l|q]}}
677 ; AVX-LABEL: test_mm_max_epi32:
679 ; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
680 ; AVX-NEXT: ret{{[l|q]}}
681 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
682 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
683 %sel = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
684 %bc = bitcast <4 x i32> %sel to <2 x i64>
; Intrinsic called by @test_mm_max_epi32.
687 declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
; Unsigned maximum via @llvm.umax.v8i16 on the operands bitcast to <8 x i16>; the
; checks expect pmaxuw/vpmaxuw.
689 define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
690 ; SSE-LABEL: test_mm_max_epu16:
692 ; SSE-NEXT: pmaxuw %xmm1, %xmm0
693 ; SSE-NEXT: ret{{[l|q]}}
695 ; AVX-LABEL: test_mm_max_epu16:
697 ; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
698 ; AVX-NEXT: ret{{[l|q]}}
699 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
700 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
701 %sel = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %arg0, <8 x i16> %arg1)
702 %bc = bitcast <8 x i16> %sel to <2 x i64>
; Intrinsic called by @test_mm_max_epu16.
705 declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)
; Unsigned maximum via @llvm.umax.v4i32 on the operands bitcast to <4 x i32>; the
; checks expect pmaxud/vpmaxud.
707 define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
708 ; SSE-LABEL: test_mm_max_epu32:
710 ; SSE-NEXT: pmaxud %xmm1, %xmm0
711 ; SSE-NEXT: ret{{[l|q]}}
713 ; AVX-LABEL: test_mm_max_epu32:
715 ; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
716 ; AVX-NEXT: ret{{[l|q]}}
717 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
718 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
719 %sel = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
720 %bc = bitcast <4 x i32> %sel to <2 x i64>
; Intrinsic called by @test_mm_max_epu32.
723 declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
; Signed minimum via @llvm.smin.v16i8 on the operands bitcast to <16 x i8>; the
; checks expect pminsb/vpminsb.
725 define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
726 ; SSE-LABEL: test_mm_min_epi8:
728 ; SSE-NEXT: pminsb %xmm1, %xmm0
729 ; SSE-NEXT: ret{{[l|q]}}
731 ; AVX-LABEL: test_mm_min_epi8:
733 ; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
734 ; AVX-NEXT: ret{{[l|q]}}
735 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
736 %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
737 %sel = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %arg0, <16 x i8> %arg1)
738 %bc = bitcast <16 x i8> %sel to <2 x i64>
; Intrinsic called by @test_mm_min_epi8.
741 declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)
; Signed minimum via @llvm.smin.v4i32 on the operands bitcast to <4 x i32>; the
; checks expect pminsd/vpminsd.
743 define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
744 ; SSE-LABEL: test_mm_min_epi32:
746 ; SSE-NEXT: pminsd %xmm1, %xmm0
747 ; SSE-NEXT: ret{{[l|q]}}
749 ; AVX-LABEL: test_mm_min_epi32:
751 ; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
752 ; AVX-NEXT: ret{{[l|q]}}
753 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
754 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
755 %sel = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
756 %bc = bitcast <4 x i32> %sel to <2 x i64>
; Intrinsic called by @test_mm_min_epi32.
759 declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
; Unsigned minimum via @llvm.umin.v8i16 on the operands bitcast to <8 x i16>; the
; checks expect pminuw/vpminuw.
761 define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
762 ; SSE-LABEL: test_mm_min_epu16:
764 ; SSE-NEXT: pminuw %xmm1, %xmm0
765 ; SSE-NEXT: ret{{[l|q]}}
767 ; AVX-LABEL: test_mm_min_epu16:
769 ; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
770 ; AVX-NEXT: ret{{[l|q]}}
771 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
772 %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
773 %sel = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %arg0, <8 x i16> %arg1)
774 %bc = bitcast <8 x i16> %sel to <2 x i64>
; Intrinsic called by @test_mm_min_epu16.
777 declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
; Unsigned minimum via @llvm.umin.v4i32 on the operands bitcast to <4 x i32>; the
; checks expect pminud/vpminud.
779 define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
780 ; SSE-LABEL: test_mm_min_epu32:
782 ; SSE-NEXT: pminud %xmm1, %xmm0
783 ; SSE-NEXT: ret{{[l|q]}}
785 ; AVX-LABEL: test_mm_min_epu32:
787 ; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
788 ; AVX-NEXT: ret{{[l|q]}}
789 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
790 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
791 %sel = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
792 %bc = bitcast <4 x i32> %sel to <2 x i64>
; Intrinsic called by @test_mm_min_epu32.
795 declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
; Calls @llvm.x86.sse41.phminposuw on %a0 bitcast to <8 x i16>; the checks
; expect phminposuw/vphminposuw.
797 define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
798 ; SSE-LABEL: test_mm_minpos_epu16:
800 ; SSE-NEXT: phminposuw %xmm0, %xmm0
801 ; SSE-NEXT: ret{{[l|q]}}
803 ; AVX-LABEL: test_mm_minpos_epu16:
805 ; AVX-NEXT: vphminposuw %xmm0, %xmm0
806 ; AVX-NEXT: ret{{[l|q]}}
807 %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
808 %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
809 %bc = bitcast <8 x i16> %res to <2 x i64>
; Intrinsic called by @test_mm_minpos_epu16.
812 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
; Calls @llvm.x86.sse41.mpsadbw on the operands bitcast to <16 x i8> with
; immediate 1; the checks expect mpsadbw/vmpsadbw $1.
814 define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
815 ; SSE-LABEL: test_mm_mpsadbw_epu8:
817 ; SSE-NEXT: mpsadbw $1, %xmm1, %xmm0
818 ; SSE-NEXT: ret{{[l|q]}}
820 ; AVX-LABEL: test_mm_mpsadbw_epu8:
822 ; AVX-NEXT: vmpsadbw $1, %xmm1, %xmm0, %xmm0
823 ; AVX-NEXT: ret{{[l|q]}}
824 %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
825 %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
826 %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
827 %bc = bitcast <8 x i16> %res to <2 x i64>
; Intrinsic called by @test_mm_mpsadbw_epu8.
830 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
; Multiplies the sign-extended low 32 bits of each i64 lane: both operands go
; through shl 32 / ashr exact 32 before a mul nsw. The SSE4.1 and plain AVX runs
; expect a single pmuldq/vpmuldq; the AVX-512 runs currently expect the shift
; pairs followed by vpmullq, hence the separate AVX1/AVX512 check prefixes.
832 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
833 ; SSE-LABEL: test_mm_mul_epi32:
835 ; SSE-NEXT: pmuldq %xmm1, %xmm0
836 ; SSE-NEXT: ret{{[l|q]}}
838 ; AVX1-LABEL: test_mm_mul_epi32:
840 ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
841 ; AVX1-NEXT: ret{{[l|q]}}
843 ; AVX512-LABEL: test_mm_mul_epi32:
845 ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
846 ; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
847 ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
848 ; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
849 ; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0
850 ; AVX512-NEXT: ret{{[l|q]}}
851 %A = shl <2 x i64> %a0, <i64 32, i64 32>
852 %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
853 %B = shl <2 x i64> %a1, <i64 32, i64 32>
854 %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
855 %res = mul nsw <2 x i64> %A1, %B1
; Plain <4 x i32> multiply of the bitcast operands; the checks expect
; pmulld/vpmulld.
859 define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
860 ; SSE-LABEL: test_mm_mullo_epi32:
862 ; SSE-NEXT: pmulld %xmm1, %xmm0
863 ; SSE-NEXT: ret{{[l|q]}}
865 ; AVX-LABEL: test_mm_mullo_epi32:
867 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
868 ; AVX-NEXT: ret{{[l|q]}}
869 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
870 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
871 %res = mul <4 x i32> %arg0, %arg1
872 %bc = bitcast <4 x i32> %res to <2 x i64>
; Calls @llvm.x86.sse41.packusdw on the operands bitcast to <4 x i32>; the
; checks expect packusdw/vpackusdw.
876 define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
877 ; SSE-LABEL: test_mm_packus_epi32:
879 ; SSE-NEXT: packusdw %xmm1, %xmm0
880 ; SSE-NEXT: ret{{[l|q]}}
882 ; AVX-LABEL: test_mm_packus_epi32:
884 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
885 ; AVX-NEXT: ret{{[l|q]}}
886 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
887 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
888 %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
889 %bc = bitcast <8 x i16> %res to <2 x i64>
; Intrinsic called by @test_mm_packus_epi32.
892 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
; Calls @llvm.x86.sse41.round.pd with immediate 4; the checks expect
; roundpd/vroundpd $4.
894 define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
895 ; SSE-LABEL: test_mm_round_pd:
897 ; SSE-NEXT: roundpd $4, %xmm0, %xmm0
898 ; SSE-NEXT: ret{{[l|q]}}
900 ; AVX-LABEL: test_mm_round_pd:
902 ; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
903 ; AVX-NEXT: ret{{[l|q]}}
904 %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
905 ret <2 x double> %res
; Calls @llvm.x86.sse41.round.ps with immediate 4; the checks expect
; roundps/vroundps $4.
908 define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
909 ; SSE-LABEL: test_mm_round_ps:
911 ; SSE-NEXT: roundps $4, %xmm0, %xmm0
912 ; SSE-NEXT: ret{{[l|q]}}
914 ; AVX-LABEL: test_mm_round_ps:
916 ; AVX-NEXT: vroundps $4, %xmm0, %xmm0
917 ; AVX-NEXT: ret{{[l|q]}}
918 %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
; Calls @llvm.x86.sse41.round.sd on (%a0, %a1) with immediate 4; the checks
; expect roundsd/vroundsd $4 with %a1 (xmm1) as the source operand.
922 define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
923 ; SSE-LABEL: test_mm_round_sd:
925 ; SSE-NEXT: roundsd $4, %xmm1, %xmm0
926 ; SSE-NEXT: ret{{[l|q]}}
928 ; AVX-LABEL: test_mm_round_sd:
930 ; AVX-NEXT: vroundsd $4, %xmm1, %xmm0, %xmm0
931 ; AVX-NEXT: ret{{[l|q]}}
932 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
933 ret <2 x double> %res
; Calls @llvm.x86.sse41.round.ss on (%a0, %a1) with immediate 4; the checks
; expect roundss/vroundss $4 with %a1 (xmm1) as the source operand.
936 define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
937 ; SSE-LABEL: test_mm_round_ss:
939 ; SSE-NEXT: roundss $4, %xmm1, %xmm0
940 ; SSE-NEXT: ret{{[l|q]}}
942 ; AVX-LABEL: test_mm_round_ss:
944 ; AVX-NEXT: vroundss $4, %xmm1, %xmm0, %xmm0
945 ; AVX-NEXT: ret{{[l|q]}}
946 %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
; Calls @llvm.x86.sse41.movntdqa on the pointer argument cast to i8*. The checks
; expect movntdqa/vmovntdqa; on i386 the pointer is first loaded from the stack
; into eax, on x86_64 it is used directly from %rdi.
950 define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
951 ; X86-SSE-LABEL: test_mm_stream_load_si128:
953 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
954 ; X86-SSE-NEXT: movntdqa (%eax), %xmm0
957 ; X86-AVX-LABEL: test_mm_stream_load_si128:
959 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
960 ; X86-AVX-NEXT: vmovntdqa (%eax), %xmm0
963 ; X64-SSE-LABEL: test_mm_stream_load_si128:
965 ; X64-SSE-NEXT: movntdqa (%rdi), %xmm0
968 ; X64-AVX-LABEL: test_mm_stream_load_si128:
970 ; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
972 %arg0 = bitcast <2 x i64>* %a0 to i8*
973 %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
; Load intrinsic called by @test_mm_stream_load_si128. It reads memory through
; its pointer operand, so it is declared readonly rather than readnone.
976 declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readonly
; Calls @llvm.x86.sse41.ptestc with an all-ones second operand. The checks expect
; the all-ones vector to be materialised with pcmpeqd/vpcmpeqd, eax zeroed, a
; ptest/vptest, and the result produced with setb %al before the return (the
; same setb that @test_mm_testc_si128 expects for this intrinsic).
978 define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
979 ; SSE-LABEL: test_mm_test_all_ones:
981 ; SSE-NEXT: pcmpeqd %xmm1, %xmm1
982 ; SSE-NEXT: xorl %eax, %eax
983 ; SSE-NEXT: ptest %xmm1, %xmm0
984 ; SSE-NEXT: setb %al
985 ; SSE-NEXT: ret{{[l|q]}}
987 ; AVX-LABEL: test_mm_test_all_ones:
989 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
990 ; AVX-NEXT: xorl %eax, %eax
991 ; AVX-NEXT: vptest %xmm1, %xmm0
992 ; AVX-NEXT: setb %al
993 ; AVX-NEXT: ret{{[l|q]}}
994 %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
; Shared by @test_mm_test_all_ones and @test_mm_testc_si128.
997 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
; Calls @llvm.x86.sse41.ptestz on (%a0, %a1); the checks expect eax zeroed, a
; ptest/vptest, and sete %al.
999 define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
1000 ; SSE-LABEL: test_mm_test_all_zeros:
1002 ; SSE-NEXT: xorl %eax, %eax
1003 ; SSE-NEXT: ptest %xmm1, %xmm0
1004 ; SSE-NEXT: sete %al
1005 ; SSE-NEXT: ret{{[l|q]}}
1007 ; AVX-LABEL: test_mm_test_all_zeros:
1009 ; AVX-NEXT: xorl %eax, %eax
1010 ; AVX-NEXT: vptest %xmm1, %xmm0
1011 ; AVX-NEXT: sete %al
1012 ; AVX-NEXT: ret{{[l|q]}}
1013 %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
; Shared by @test_mm_test_all_zeros and @test_mm_testz_si128.
1016 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
; Calls @llvm.x86.sse41.ptestnzc on (%a0, %a1); the checks expect eax zeroed, a
; ptest/vptest, and seta %al.
1018 define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
1019 ; SSE-LABEL: test_mm_test_mix_ones_zeros:
1021 ; SSE-NEXT: xorl %eax, %eax
1022 ; SSE-NEXT: ptest %xmm1, %xmm0
1023 ; SSE-NEXT: seta %al
1024 ; SSE-NEXT: ret{{[l|q]}}
1026 ; AVX-LABEL: test_mm_test_mix_ones_zeros:
1028 ; AVX-NEXT: xorl %eax, %eax
1029 ; AVX-NEXT: vptest %xmm1, %xmm0
1030 ; AVX-NEXT: seta %al
1031 ; AVX-NEXT: ret{{[l|q]}}
1032 %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
; Shared by @test_mm_test_mix_ones_zeros and @test_mm_testnzc_si128.
1035 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
; Calls @llvm.x86.sse41.ptestc on (%a0, %a1); the checks expect eax zeroed, a
; ptest/vptest, and setb %al.
1037 define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
1038 ; SSE-LABEL: test_mm_testc_si128:
1040 ; SSE-NEXT: xorl %eax, %eax
1041 ; SSE-NEXT: ptest %xmm1, %xmm0
1042 ; SSE-NEXT: setb %al
1043 ; SSE-NEXT: ret{{[l|q]}}
1045 ; AVX-LABEL: test_mm_testc_si128:
1047 ; AVX-NEXT: xorl %eax, %eax
1048 ; AVX-NEXT: vptest %xmm1, %xmm0
1049 ; AVX-NEXT: setb %al
1050 ; AVX-NEXT: ret{{[l|q]}}
1051 %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
; Calls @llvm.x86.sse41.ptestnzc on (%a0, %a1); the checks expect eax zeroed, a
; ptest/vptest, and seta %al.
1055 define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
1056 ; SSE-LABEL: test_mm_testnzc_si128:
1058 ; SSE-NEXT: xorl %eax, %eax
1059 ; SSE-NEXT: ptest %xmm1, %xmm0
1060 ; SSE-NEXT: seta %al
1061 ; SSE-NEXT: ret{{[l|q]}}
1063 ; AVX-LABEL: test_mm_testnzc_si128:
1065 ; AVX-NEXT: xorl %eax, %eax
1066 ; AVX-NEXT: vptest %xmm1, %xmm0
1067 ; AVX-NEXT: seta %al
1068 ; AVX-NEXT: ret{{[l|q]}}
1069 %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
1073 define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
1074 ; SSE-LABEL: test_mm_testz_si128:
1076 ; SSE-NEXT: xorl %eax, %eax
1077 ; SSE-NEXT: ptest %xmm1, %xmm0
1078 ; SSE-NEXT: sete %al
1079 ; SSE-NEXT: ret{{[l|q]}}
1081 ; AVX-LABEL: test_mm_testz_si128:
1083 ; AVX-NEXT: xorl %eax, %eax
1084 ; AVX-NEXT: vptest %xmm1, %xmm0
1085 ; AVX-NEXT: sete %al
1086 ; AVX-NEXT: ret{{[l|q]}}
1087 %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)