; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
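
; These VP cast intrinsics carry a mask %m and an explicit vector length %evl,
; but neither appears in the lowered code below: x86 legalizes these VP casts
; to the same instructions as the corresponding unpredicated casts.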
declare <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i32(<4 x i32>, <4 x i1>, i32)
define <4 x ptr> @inttoptr_v4p0_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: inttoptr_v4p0_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: inttoptr_v4p0_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: inttoptr_v4p0_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: inttoptr_v4p0_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: inttoptr_v4p0_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    retq
  %v = call <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x ptr> %v
}
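
; inttoptr from <4 x i64>: 32-bit targets truncate, keeping the low half of
; each lane; 64-bit targets pass the value through unchanged.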
declare <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i64(<4 x i64>, <4 x i1>, i32)

define <4 x ptr> @inttoptr_v4p0_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: inttoptr_v4p0_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE-LABEL: inttoptr_v4p0_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: inttoptr_v4p0_v4i64:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x ptr> @llvm.vp.inttoptr.v4p0.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
  ret <4 x ptr> %v
}
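
; ptrtoint to <4 x i32>: a no-op on 32-bit targets; 64-bit targets narrow the
; lanes with shufps (SSE/AVX1/AVX2) or vpmovqd (AVX512VL).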
declare <4 x i32> @llvm.vp.ptrtoint.v4i32.v4p0(<4 x ptr>, <4 x i1>, i32)

define <4 x i32> @ptrtoint_v4i32_v4p0(<4 x ptr> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: ptrtoint_v4i32_v4p0:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: ptrtoint_v4i32_v4p0:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: ptrtoint_v4i32_v4p0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ptrtoint_v4i32_v4p0:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: ptrtoint_v4i32_v4p0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.ptrtoint.v4i32.v4p0(<4 x ptr> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
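
; ptrtoint to <4 x i64>: 32-bit targets zero-extend each pointer lane;
; 64-bit targets pass the value through unchanged.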
declare <4 x i64> @llvm.vp.ptrtoint.v4i64.v4p0(<4 x ptr>, <4 x i1>, i32)

define <4 x i64> @ptrtoint_v4i64_v4p0(<4 x ptr> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: ptrtoint_v4i64_v4p0:
; X86:       # %bb.0:
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: ptrtoint_v4i64_v4p0:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: ptrtoint_v4i64_v4p0:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.ptrtoint.v4i64.v4p0(<4 x ptr> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}
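
; <4 x i1> lives in the low bit of full-width lanes, so sign extension shifts
; the bit into the sign position (pslld $31) and arithmetic-shifts it back
; (psrad $31) to splat it across each lane.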
define <4 x i32> @vsext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsext_v4i32_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $31, %xmm0, %xmm0
; X86-NEXT:    vpsrad $31, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsext_v4i32_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vsext_v4i32_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
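
; For <4 x i64>, the same shift pair builds v4i32 masks first; each mask word
; is then widened to 64 bits (vpmovsxdq where available; on SSE, pshufd
; duplicates the word into both halves of the lane).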
define <4 x i64> @vsext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsext_v4i64_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $31, %xmm0, %xmm0
; X86-NEXT:    vpsrad $31, %xmm0, %xmm0
; X86-NEXT:    vpmovsxdq %xmm0, %xmm1
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsext_v4i64_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vsext_v4i64_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vsext_v4i64_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vsext_v4i64_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}
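
; Zero extension of <4 x i1> only has to clear everything above the low bit:
; an AND with a splat of 1.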
define <4 x i32> @vzext_v4i32_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vzext_v4i32_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vzext_v4i32_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vzext_v4i32_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vzext_v4i32_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vzext_v4i32_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
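
; For <4 x i64>, mask to the low bit and then zero-extend the 32-bit lanes
; (punpck against zero on SSE/AVX1, vpmovzxdq on AVX2/AVX512).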
define <4 x i64> @vzext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vzext_v4i64_v4i1:
; X86:       # %bb.0:
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vzext_v4i64_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: vzext_v4i64_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vzext_v4i64_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vzext_v4i64_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT:    retq
  %v = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}
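
; Truncation to <4 x i1> leaves the values in full-width lanes, so trunc from
; <4 x i32> generates no code.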
define <4 x i1> @vtrunc_v4i1_v4i32(<4 x i32> %a, <4 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vtrunc_v4i1_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    retl
;
; SSE-LABEL: vtrunc_v4i1_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: vtrunc_v4i1_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> %a, <4 x i1> %m, i32 %vl)
  ret <4 x i1> %v
}
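
; trunc from <4 x i64> narrows the lanes with extract+shufps, or with vpmovqd
; on AVX512VL.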
define <4 x i1> @vtrunc_v4i1_v4i64(<4 x i64> %a, <4 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vtrunc_v4i1_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE-LABEL: vtrunc_v4i1_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: vtrunc_v4i1_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vtrunc_v4i1_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vtrunc_v4i1_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %v = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> %a, <4 x i1> %m, i32 %vl)
  ret <4 x i1> %v
}
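
; Without AVX512's vcvttps2udq, unsigned fp-to-int is emulated with two signed
; cvttps2dq conversions: one of the input and one of the input biased down by
; 2^31, with the sign of the first result selecting between them.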
define <4 x i32> @vfptoui_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfptoui_v4i32_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvttps2dq %xmm0, %xmm1
; X86-NEXT:    vpsrad $31, %xmm1, %xmm2
; X86-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0
; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfptoui_v4i32_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttps2dq %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrad $31, %xmm2
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfptoui_v4i32_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfptoui_v4i32_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfptoui_v4i32_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttps2udq %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
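
; Signed fp-to-int needs no workaround: cvttps2dq exists on every subtarget.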
define <4 x i32> @vfptosi_v4i32_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfptosi_v4i32_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfptosi_v4i32_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vfptosi_v4i32_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}
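
; Without AVX512's vcvtudq2ps, unsigned int-to-fp splits each lane into 16-bit
; halves, materializes each half as an exact float using magic-constant
; blends, and recombines the two partial results with subps/addps.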
define <4 x float> @vuitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vuitofp_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X86-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vuitofp_v4f32_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vuitofp_v4f32_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vuitofp_v4f32_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vuitofp_v4f32_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtudq2ps %xmm0, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}
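
; Signed int-to-fp is a single cvtdq2ps on every subtarget.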
define <4 x float> @vsitofp_v4f32_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vsitofp_v4f32_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vsitofp_v4f32_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vsitofp_v4f32_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}
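
; None of these feature levels can convert f64 directly to f16, so each
; element is rounded through the __truncdfhf2 libcall, with stack spills and
; reloads around the calls.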
define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vfptrunc_v2f16_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $40, %esp
; X86-NEXT:    .cfi_def_cfa_offset 44
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfhf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfhf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    addl $40, %esp
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
;
; SSE-LABEL: vfptrunc_v2f16_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $40, %rsp
; SSE-NEXT:    .cfi_def_cfa_offset 48
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    callq __truncdfhf2@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    callq __truncdfhf2@PLT
; SSE-NEXT:    punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE-NEXT:    addq $40, %rsp
; SSE-NEXT:    .cfi_def_cfa_offset 8
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfptrunc_v2f16_v2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $40, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 48
; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __truncdfhf2@PLT
; AVX1-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX1-NEXT:    addq $40, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 8
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfptrunc_v2f16_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $40, %rsp
; AVX2-NEXT:    .cfi_def_cfa_offset 48
; AVX2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    callq __truncdfhf2@PLT
; AVX2-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    addq $40, %rsp
; AVX2-NEXT:    .cfi_def_cfa_offset 8
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfptrunc_v2f16_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $40, %rsp
; AVX512-NEXT:    .cfi_def_cfa_offset 48
; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    # xmm0 = mem[1,0]
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT:    callq __truncdfhf2@PLT
; AVX512-NEXT:    vpbroadcastw %xmm0, %xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-NEXT:    vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT:    addq $40, %rsp
; AVX512-NEXT:    .cfi_def_cfa_offset 8
; AVX512-NEXT:    retq
  %v = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> %a, <2 x i1> %m, i32 %vl)
  ret <2 x half> %v
}
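
; fpext from <2 x float> is a single cvtps2pd on every subtarget.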
define <2 x double> @vfpext_v2f32_v2f64(<2 x float> %a, <2 x i1> %m, i32 zeroext %vl) {
; X86-LABEL: vfpext_v2f32_v2f64:
; X86:       # %bb.0:
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfpext_v2f32_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: vfpext_v2f32_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtps2pd %xmm0, %xmm0
; AVX-NEXT:    retq
  %v = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> %a, <2 x i1> %m, i32 %vl)
  ret <2 x double> %v
}

declare <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1>, <4 x i1>, i32)
declare <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32>, <4 x i1>, i32)
declare <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
declare <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float>, <4 x i1>, i32)
declare <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32>, <4 x i1>, i32)
declare <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double>, <2 x i1>, i32)
declare <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float>, <2 x i1>, i32)