; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
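
; Note on the i32 rounding operands used throughout this file: they follow
; the usual X86 intrinsic encoding, where 4 selects the current MXCSR
; rounding mode, 8/9/10/11 select the four static modes with exceptions
; suppressed (printed as {rn-sae}, {rd-sae}, {ru-sae} and {rz-sae}), and
; 12 is current-direction plus suppress-all-exceptions (printed {sae}).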

define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_compress_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcompresspd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_compress_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
  ret <8 x double> %2
}

define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) {
; X64-LABEL: test_maskz_compress_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcompresspd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_compress_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
  ret <8 x double> %2
}

define <8 x double> @test_compress_pd_512(<8 x double> %data) {
; CHECK-LABEL: test_compress_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x double> %1
}

define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_compress_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcompressps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_compress_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vcompressps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
  ret <16 x float> %2
}

define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) {
; X64-LABEL: test_maskz_compress_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcompressps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_compress_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vcompressps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
  ret <16 x float> %2
}

define <16 x float> @test_compress_ps_512(<16 x float> %data) {
; CHECK-LABEL: test_compress_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <16 x float> %1
}

define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_mask_compress_q_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_compress_q_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
  ret <8 x i64> %2
}

define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) {
; X64-LABEL: test_maskz_compress_q_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_compress_q_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
  ret <8 x i64> %2
}

define <8 x i64> @test_compress_q_512(<8 x i64> %data) {
; CHECK-LABEL: test_compress_q_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i64> %1
}

define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_mask_compress_d_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_compress_d_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
  ret <16 x i32> %2
}

define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) {
; X64-LABEL: test_maskz_compress_d_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_compress_d_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
  ret <16 x i32> %2
}

define <16 x i32> @test_compress_d_512(<16 x i32> %data) {
; CHECK-LABEL: test_compress_d_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <16 x i32> %1
}
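
;; The expand tests mirror the compress tests above with the inverse
;; operation: vexpandpd/vpexpandd read elements contiguously from the
;; source and scatter them into the lanes selected by the mask.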

define <8 x double> @test_expand_pd_512(<8 x double> %data) {
; CHECK-LABEL: test_expand_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x double> %1
}

define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_expand_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_expand_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vexpandpd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
  ret <8 x double> %2
}

define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) {
; X64-LABEL: test_maskz_expand_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_expand_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
  ret <8 x double> %2
}

define <16 x float> @test_expand_ps_512(<16 x float> %data) {
; CHECK-LABEL: test_expand_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <16 x float> %1
}

define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_expand_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_expand_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vexpandps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
  ret <16 x float> %2
}

define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) {
; X64-LABEL: test_maskz_expand_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_expand_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
  ret <16 x float> %2
}

define <8 x i64> @test_expand_q_512(<8 x i64> %data) {
; CHECK-LABEL: test_expand_q_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i64> %1
}

define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_mask_expand_q_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_expand_q_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpexpandq %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
  ret <8 x i64> %2
}

define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) {
; X64-LABEL: test_maskz_expand_q_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_expand_q_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i8 %mask to <8 x i1>
  %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
  ret <8 x i64> %2
}

define <16 x i32> @test_expand_d_512(<16 x i32> %data) {
; CHECK-LABEL: test_expand_d_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <16 x i32> %1
}

define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_mask_expand_d_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_expand_d_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpexpandd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
  ret <16 x i32> %2
}

define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
; X64-LABEL: test_maskz_expand_d_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_expand_d_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = bitcast i16 %mask to <16 x i1>
  %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
  ret <16 x i32> %2
}

define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrcp14ps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrcp14pd %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
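
; In the rndscale tests below, the scale immediate 11 (truncate, inexact
; exception suppressed, zero extra fraction bits) matches the semantics of
; the AVX round instructions with the same immediate, which is why the
; unmasked forms lower to vroundsd/vroundss $11 rather than vrndscale.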
declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)

define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test_rndscale_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundsd $11, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; X64-LABEL: test_rndscale_sd_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_sd_mask:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) {
; X64-LABEL: test_rndscale_sd_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vrndscalesd $11, (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_sd_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vrndscalesd $11, (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
  %b = load <2 x double>, ptr %bptr
  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) {
; X64-LABEL: test_rndscale_sd_maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_sd_maskz:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
  ret <2 x double> %res
}

declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)

define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_rndscale_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundss $11, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) {
; X64-LABEL: test_rndscale_ss_load:
; X64:       # %bb.0:
; X64-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_ss_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vroundss $11, (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
  %b = load <4 x float>, ptr %bptr
  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; X64-LABEL: test_rndscale_ss_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_ss_mask:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) {
; X64-LABEL: test_rndscale_ss_maskz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_rndscale_ss_maskz:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
  ret <4 x float> %res
}

declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

define <8 x double> @test7(<8 x double> %a) {
; CHECK-LABEL: test7:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
  ret <8 x double> %res
}

declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

define <16 x float> @test8(<16 x float> %a) {
; CHECK-LABEL: test8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrsqrt14ps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
  ret <8 x double> %1
}

define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_sqrt_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_sqrt_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
  ret <8 x double> %3
}

define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
; X64-LABEL: test_maskz_sqrt_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_sqrt_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)

define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_round_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
  ret <8 x double> %1
}

define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_sqrt_round_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_sqrt_round_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
  ret <8 x double> %3
}

define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
; X64-LABEL: test_maskz_sqrt_round_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_sqrt_round_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone

define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
  ret <16 x float> %1
}

define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_sqrt_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_sqrt_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
; X64-LABEL: test_maskz_sqrt_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_sqrt_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_round_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_sqrt_round_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_sqrt_round_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
; X64-LABEL: test_maskz_sqrt_round_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_sqrt_round_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone

define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vgetexppd %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_round_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vgetexppd {sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12)
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone

define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vgetexpps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_round_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vgetexpps {sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
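
; getexp only honors the SAE bit of its rounding operand, so both encodings
; exercised above (8 and 12) print as a bare {sae}; the rounding-direction
; bits have no effect on this instruction.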

declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_sqrt_ss:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovaps %xmm2, %xmm3
; X64-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; X64-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X64-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_sqrt_ss:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovaps %xmm2, %xmm3
; X86-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; X86-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X86-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X86-NEXT:    retl
  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}

declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_sqrt_sd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovapd %xmm2, %xmm3
; X64-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vaddpd %xmm2, %xmm3, %xmm2
; X64-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X64-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_sqrt_sd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovapd %xmm2, %xmm3
; X86-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vaddpd %xmm2, %xmm3, %xmm2
; X86-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X86-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; X86-NEXT:    retl
  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}

define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2si %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone

define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) {
; X64-LABEL: test_x86_avx512_cvttss2si_load:
; X64:       # %bb.0:
; X64-NEXT:    vcvttss2si (%rdi), %eax
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512_cvttss2si_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vcvttss2si (%eax), %eax
; X86-NEXT:    retl
  %a1 = load <4 x float>, ptr %a0
  %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ;
  ret i32 %res
}

define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtsd2usi %xmm0, %eax
; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT:    addl %eax, %ecx
; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}

  %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11)
  %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtsd2si %xmm0, %eax
; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT:    addl %eax, %ecx
; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}

  %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11)
  %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtss2usi %xmm0, %eax
; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT:    addl %eax, %ecx
; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}

  %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11)
  %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone

define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtss2si %xmm0, %eax
; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT:    addl %eax, %ecx
; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}

  %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
  %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11)
  %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9)
  %res3 = add i32 %res, %res1
  %res4 = add i32 %res3, %res2
  ret i32 %res4
}
declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone

define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) {
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
; X64-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_vcvtps2ph_256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
; X86-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT:    vcvtps2ph $2, %zmm0, (%eax)
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
  %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
  %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask)
  store <16 x i16> %res1, ptr %dst
  %res = add <16 x i16> %res2, %res3
  ret <16 x i16> %res
}

declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
  %1 = bitcast <16 x i1> %res to i16
  ret i16 %1
}
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32)

define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: test_cmppd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %1 = bitcast <8 x i1> %res to i8
  ret i8 %1
}
declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32)

; Function Attrs: nounwind readnone

define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vmaxpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %1
}
declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vminpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
  ret <8 x double> %1
}
declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
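
; The following test stores only lane 0 under a mask: the and/bitcast/
; shufflevector sequence extracts bit 0 of %mask into a <4 x i1> predicate
; whose single live lane turns the masked store into a vmovss with a {%k1}
; write-mask.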
define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) {
; X64-LABEL: test_mask_store_ss:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vmovss %xmm0, (%rdi) {%k1}
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_store_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vmovss %xmm0, (%eax) {%k1}
; X86-NEXT:    retl
  %1 = and i8 %mask, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract)
  ret void
}
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1

declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)

define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  ret <16 x float> %1
}

define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  ret <16 x float> %1
}

define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  ret <16 x float> %1
}

define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  ret <16 x float> %1
}

define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_rn:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_rd:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_ru:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_ru:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_rz:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

;; With Passthru value
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_passthru_rn:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_passthru_rd:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_ru:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_passthru_ru:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulps_mask_passthru_rz:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
  ret <16 x float> %3
}

define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulpd_mask_rn:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}

define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulpd_mask_rd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}

define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_ru:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulpd_mask_ru:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}

define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_vmulpd_mask_rz:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
  %2 = bitcast i8 %mask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
  ret <8 x double> %3
}

define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_add_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_add_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}
define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_current:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}

declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)

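; The trailing i32 operand on these intrinsics is the rounding control; the
; values follow the usual _MM_FROUND encoding:
;   4  = _MM_FROUND_CUR_DIRECTION            -> no static rounding (MXCSR)
;   8  = TO_NEAREST_INT | _MM_FROUND_NO_EXC  -> {rn-sae}
;   9  = TO_NEG_INF     | _MM_FROUND_NO_EXC  -> {rd-sae}
;   10 = TO_POS_INF     | _MM_FROUND_NO_EXC  -> {ru-sae}
;   11 = TO_ZERO        | _MM_FROUND_NO_EXC  -> {rz-sae}
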
define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_sub_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_sub_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_current:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_div_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_div_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

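; Masking in these tests is expressed in generic IR rather than via a masked
; intrinsic: the scalar mask is bitcast to <16 x i1> and a select chooses
; between the arithmetic result and either zeroinitializer (maskz, lowered to
; a {%k1} {z} zeroing write) or a passthru vector (mask, lowered to a merging
; {%k1} write into the passthru register plus a move back into %zmm0).
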
define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_div_round_ps_rn_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_div_round_ps_rn_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_div_round_ps_rd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_div_round_ps_rd_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_div_round_ps_ru_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_div_round_ps_ru_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_div_round_ps_rz_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_div_round_ps_rz_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_div_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_div_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_current:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}

declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_min_round_ps_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_min_round_ps_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_min_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_min_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_min_round_ps_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_min_round_ps_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_min_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_min_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_current:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

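; min/max do not round their result, so there are no {rn,rd,ru,rz} variants
; here: i32 8 only requests {sae} (suppress-all-exceptions), and i32 4 keeps
; the default MXCSR behaviour.
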
define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_max_round_ps_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_max_round_ps_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_max_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_maskz_max_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_max_round_ps_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_max_round_ps_sae:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_max_round_ps_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mm512_mask_max_round_ps_current:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  %2 = bitcast i16 %mask to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
  ret <16 x float> %3
}

define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_current:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
  ret <16 x float> %1
}
declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

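; The scalar *.ss.round/*.sd.round intrinsics below carry the passthru vector
; and the i8 mask as explicit operands instead of using an IR-level select;
; only element 0 is computed, and an all-ones mask (i8 -1) makes the
; operation effectively unmasked.
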
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_rn:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_rd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_rd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_ru:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_ru:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_rz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_rz:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_current:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; X64-LABEL: test_maskz_add_ss_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_add_ss_rn:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_add_ss_rn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_ss_current_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vaddss (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_ss_current_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddss (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
  %a1.val = load float, ptr %a1
  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) {
; X64-LABEL: test_maskz_add_ss_current_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_add_ss_current_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddss (%eax), %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %a1.val = load float, ptr %a1
  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

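; In the *_memfold tests the scalar load is first widened to <4 x float> with
; zeroed upper elements. Since only element 0 of the second source matters,
; the backend folds the load straight into the instruction's memory operand,
; e.g. vaddss (%rdi), %xmm0, %xmm1 {%k1}, instead of emitting a separate
; scalar move.
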
declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_rn:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_rd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_rd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_ru:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_ru:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_rz:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_rz:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_current:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_current:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; X64-LABEL: test_maskz_add_sd_rn:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_add_sd_rn:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_add_sd_rn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_add_sd_current_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vaddsd (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_add_sd_current_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddsd (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
  %a1.val = load double, ptr %a1
  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) {
; X64-LABEL: test_maskz_add_sd_current_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_add_sd_current_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddsd (%eax), %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %a1.val = load double, ptr %a1
  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_ss_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_ss_sae:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_ss_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_ss_sae:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_ss:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_ss:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_ss:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_ss:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_ss_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vmaxss (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_ss_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaxss (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
  %a1.val = load float, ptr %a1
  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_ss_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_ss_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaxss (%eax), %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %a1.val = load float, ptr %a1
  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_sd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_sd_sae:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_sd_sae:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_sd_sae:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_sd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_sd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_sd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_sd:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_max_sd_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_mask_max_sd_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaxsd (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
  %a1.val = load double, ptr %a1
  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) {
; X64-LABEL: test_maskz_max_sd_memfold:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: test_maskz_max_sd_memfold:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaxsd (%eax), %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %a1.val = load double, ptr %a1
  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
; X64-LABEL: test_x86_avx512_cvtsi2ss32:
; X64:       # %bb.0:
; X64-NEXT:    vcvtsi2ss %edi, {rz-sae}, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512_cvtsi2ss32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vcvtsi2ss %eax, {rz-sae}, %xmm0, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone

define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) {
; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
; X64:       # %bb.0:
; X64-NEXT:    vcvtusi2ss %edi, {rd-sae}, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) {
; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
; X64:       # %bb.0:
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
; X86-NEXT:    retl
  %b = load i32, ptr %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) {
; X64-LABEL: test_x86_avx512__mm_cvtu32_ss:
; X64:       # %bb.0:
; X64-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512__mm_cvtu32_ss:
; X86:       # %bb.0:
; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) {
; X64-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
; X64:       # %bb.0:
; X64-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vcvtusi2ssl (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
  %b = load i32, ptr %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone

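; Notes on the cvt forms above: with static rounding, AT&T syntax prints the
; {rd-sae}/{rz-sae} operand between the GPR source and the destination, and
; embedded rounding is only encodable with a register source, which is why
; the rounding _mem test loads the value into a GPR first. Without rounding,
; only the memory form is ambiguous about operand size, so it carries the
; 'l' suffix (vcvtusi2ssl).
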
declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) {
; X64-LABEL: test_int_x86_avx512_vpermi2var_d_512:
; X64:       # %bb.0:
; X64-NEXT:    vpermt2d (%rdi), %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_int_x86_avx512_vpermi2var_d_512:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpermt2d (%eax), %zmm1, %zmm0
; X86-NEXT:    retl
  %x2 = load <16 x i32>, ptr %x2p
  %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
  ret <16 x i32> %1
}

define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpermi2d (%rdi), %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2d (%eax), %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
  %x2 = load <16 x i32>, ptr %x2p
  %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
  %2 = bitcast i16 %x3 to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
  ret <16 x i32> %3
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
  ret <8 x double> %1
}

define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
  %2 = bitcast <8 x i64> %x1 to <8 x double>
  %3 = bitcast i8 %x3 to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
  ret <8 x double> %4
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
  ret <16 x float> %1
}

define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
  %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
  %2 = bitcast <16 x i32> %x1 to <16 x float>
  %3 = bitcast i16 %x3 to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
  ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

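; A single IR intrinsic backs both instruction forms; the backend chooses by
; which call operand the select must keep live in the destination register.
; The mask_vpermi2var tests merge into the index vector, yielding vpermi2*
; (which overwrites the indices), while the mask_vpermt2var tests below merge
; into the first table operand, yielding vpermt2* (which overwrites that
; table). Unmasked calls are free to use either form.
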
3029 define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
3030 ; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_512:
3032 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
3033 ; CHECK-NEXT: ret{{[l|q]}}
3034 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3038 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
3039 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3041 ; X64-NEXT: kmovw %edi, %k1
3042 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3043 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
3046 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3048 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3049 ; X86-NEXT: kmovw %eax, %k1
3050 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3051 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
3053 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3054 %2 = bitcast i8 %x3 to <8 x i1>
3055 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
3059 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) {
3060 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3062 ; X64-NEXT: kmovw %esi, %k1
3063 ; X64-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z}
3066 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3068 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3069 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3070 ; X86-NEXT: vpermi2d (%eax), %zmm1, %zmm0 {%k1} {z}
3072 %x2 = load <16 x i32>, ptr %x2p
3073 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3074 %2 = bitcast i16 %x3 to <16 x i1>
3075 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
3079 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) {
3080 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3082 ; X64-NEXT: kmovw %esi, %k1
3083 ; X64-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
3086 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3088 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3089 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
3090 ; X86-NEXT: kmovw %ecx, %k1
3091 ; X86-NEXT: vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z}
3093 %x2s = load double, ptr %x2ptr
3094 %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
3095 %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
3096 %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
3097 %2 = bitcast i8 %x3 to <8 x i1>
3098 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
ret <16 x float> %3
}

define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z}
%1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
ret <8 x i64> %3
}

define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_512:
; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
ret <16 x i32> %1
}

define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
ret <16 x i32> %3
}
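
; The intrinsics below take a trailing i32 rounding-mode operand using the
; usual X86 _MM_FROUND_* encoding: 4 = current MXCSR direction (no asm
; suffix), 8 = {rn-sae}, 9 = {rd-sae}, 10 = {ru-sae}, 11 = {rz-sae}, as
; the CHECK lines confirm.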
declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0
%res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11)
%res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0
%res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10)
%res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8)
%res2 = fadd <16 x float> %res, %res1
ret <16 x float> %res2
}
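
; The pmov tests that follow cover the three truncating-move flavors:
; vpmov* (plain truncation), vpmovs* (signed saturation) and vpmovus*
; (unsigned saturation). Each register form is exercised with an all-ones,
; a merging and a zeroing mask, and each .mem intrinsic is called twice so
; that both the unmasked and the {%k1}-masked store get checked.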
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqb %zmm0, %xmm2
; X64-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqb %zmm0, %xmm2
; X86-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovqb %zmm0, (%rdi)
; X64-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovqb %zmm0, (%eax)
; X86-NEXT: vpmovqb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqb %zmm0, %xmm2
; X64-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqb %zmm0, %xmm2
; X86-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovsqb %zmm0, (%rdi)
; X64-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovsqb %zmm0, (%eax)
; X86-NEXT: vpmovsqb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqb %zmm0, %xmm2
; X64-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqb %zmm0, %xmm2
; X86-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovusqb %zmm0, (%rdi)
; X64-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovusqb %zmm0, (%eax)
; X86-NEXT: vpmovusqb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm0, %xmm2
; X64-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm0, %xmm2
; X86-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res3 = add <8 x i16> %res0, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovqw %zmm0, (%rdi)
; X64-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovqw %zmm0, (%eax)
; X86-NEXT: vpmovqw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqw %zmm0, %xmm2
; X64-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqw %zmm0, %xmm2
; X86-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res3 = add <8 x i16> %res0, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovsqw %zmm0, (%rdi)
; X64-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovsqw %zmm0, (%eax)
; X86-NEXT: vpmovsqw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqw %zmm0, %xmm2
; X64-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqw %zmm0, %xmm2
; X86-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res3 = add <8 x i16> %res0, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovusqw %zmm0, (%rdi)
; X64-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovusqw %zmm0, (%eax)
; X86-NEXT: vpmovusqw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}
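
; Unlike the q->b and q->w cases above, the plain q->d truncations below
; are expressed as ordinary IR 'trunc' (plus 'select' for the masked
; forms) rather than an intrinsic, and still lower to a single vpmovqd.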
define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_pmov_qd_512:
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = trunc <8 x i64> %x0 to <8 x i32>
ret <8 x i32> %1
}

define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
%1 = trunc <8 x i64> %x0 to <8 x i32>
%2 = bitcast i8 %x2 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
ret <8 x i32> %3
}

define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
%1 = trunc <8 x i64> %x0 to <8 x i32>
%2 = bitcast i8 %x2 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
ret <8 x i32> %3
}

declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovqd %zmm0, (%rdi)
; X64-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovqd %zmm0, (%eax)
; X86-NEXT: vpmovqd %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_pmovs_qd_512:
; CHECK-NEXT: vpmovsqd %zmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
ret <8 x i32> %res
}

define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
ret <8 x i32> %res
}

define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z}
; X86-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z}
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
ret <8 x i32> %res
}

declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovsqd %zmm0, (%rdi)
; X64-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovsqd %zmm0, (%eax)
; X86-NEXT: vpmovsqd %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}

declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_pmovus_qd_512:
; CHECK-NEXT: vpmovusqd %zmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
ret <8 x i32> %res
}

define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
ret <8 x i32> %res
}

define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z}
; X86-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z}
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
ret <8 x i32> %res
}

declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovusqd %zmm0, (%rdi)
; X64-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovusqd %zmm0, (%eax)
; X86-NEXT: vpmovusqd %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
ret void
}
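
; The d->b and d->w tests below use an i16 mask (16 elements), so the X86
; runs load the mask with a single kmovw from the stack instead of the
; movzbl+kmovw pair needed for the i8 masks above.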
declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_db_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm0, %xmm2
; X64-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_db_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdb %zmm0, %xmm2
; X86-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovdb %zmm0, (%rdi)
; X64-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovdb %zmm0, (%eax)
; X86-NEXT: vpmovdb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsdb %zmm0, %xmm2
; X64-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovsdb %zmm0, %xmm2
; X86-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovsdb %zmm0, (%rdi)
; X64-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovsdb %zmm0, (%eax)
; X86-NEXT: vpmovsdb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusdb %zmm0, %xmm2
; X64-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X64-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovusdb %zmm0, %xmm2
; X86-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; X86-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vzeroupper
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovusdb %zmm0, (%rdi)
; X64-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovusdb %zmm0, (%eax)
; X86-NEXT: vpmovusdb %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}

declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %zmm0, %ymm2
; X64-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovdw %zmm0, %ymm2
; X86-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res3 = add <16 x i16> %res0, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
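
; The register-form d->w tests return <16 x i16> in ymm0, so no trailing
; vzeroupper is expected there; the memory forms below still end with
; vzeroupper.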
declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovdw %zmm0, (%rdi)
; X64-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovdw %zmm0, (%eax)
; X86-NEXT: vpmovdw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}

declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovsdw %zmm0, %ymm2
; X64-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovsdw %zmm0, %ymm2
; X86-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res3 = add <16 x i16> %res0, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovsdw %zmm0, (%rdi)
; X64-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovsdw %zmm0, (%eax)
; X86-NEXT: vpmovsdw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}

declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)

define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovusdw %zmm0, %ymm2
; X64-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpmovusdw %zmm0, %ymm2
; X86-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
%res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res3 = add <16 x i16> %res0, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpmovusdw %zmm0, (%rdi)
; X64-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpmovusdw %zmm0, (%eax)
; X86-NEXT: vpmovusdw %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
ret void
}
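
; Conversion tests. The dynamic-rounding int->fp cases are written as
; plain sitofp/uitofp plus select, while static rounding goes through the
; *.round.* and mask.cvt* intrinsics with the i32 rounding operand
; described earlier; the truncating (cvtt*) forms only take {sae}, which
; suppresses exceptions without changing the rounding direction.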
declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)

define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
%cvt = sitofp <16 x i32> %x0 to <16 x float>
%1 = bitcast i16 %x2 to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
%3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
%res2 = fadd <16 x float> %2, %3
ret <16 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
; X64-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
; X86-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)

define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
; X64-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
; X86-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0
%res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
%res2 = fadd <8 x float> %res, %res1
ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; X64-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; X86-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
%res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)

define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
; X64-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
; X86-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
%res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
ret <8 x double> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
%res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
; X64-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
; X86-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}

declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)

define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
%cvt = uitofp <16 x i32> %x0 to <16 x float>
%1 = bitcast i16 %x2 to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
%3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
%res2 = fadd <16 x float> %2, %3
ret <16 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
; X64-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
; X86-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
%res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
; X64-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
; X86-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
%res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}
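
; Scalar getexp tests follow: only element 0 is computed and masked, and
; passing i32 8 as the last operand selects the {sae}
; (suppress-all-exceptions) encoding visible in the assertions.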
declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_getexp_ss:
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
ret <4 x float> %res
}

define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_mask_getexp_ss:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm2, %xmm3
; X64-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddps %xmm2, %xmm3, %xmm0
; X86-LABEL: test_mask_getexp_ss:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm2, %xmm3
; X86-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddps %xmm2, %xmm3, %xmm0
%res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
%res.1 = fadd <4 x float> %res0, %res1
ret <4 x float> %res.1
}

define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; X64-LABEL: test_maskz_getexp_ss:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-LABEL: test_maskz_getexp_ss:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
ret <4 x float> %res
}

declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_getexp_sd:
; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
ret <2 x double> %res
}

define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_mask_getexp_sd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddpd %xmm2, %xmm3, %xmm0
; X86-LABEL: test_mask_getexp_sd:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddpd %xmm2, %xmm3, %xmm0
%res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
%res.1 = fadd <2 x double> %res0, %res1
ret <2 x double> %res.1
}

define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; X64-LABEL: test_maskz_getexp_sd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-LABEL: test_maskz_getexp_sd:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
ret <2 x double> %res
}
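
; Scalar compare tests. The i32 immediate selects the predicate - the
; assertions show 2 = le, 3 = unord, 4 = neq, 5 = nlt - and the i8 result
; comes back through a k-register move to eax, the "# kill" comment
; marking extraction of the al sub-register.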
declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)

define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_cmp_sd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X86-LABEL: test_int_x86_avx512_mask_cmp_sd:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
%res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
ret i8 %res4
}

define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcmplesd %xmm1, %xmm0, %k0
; X64-NEXT: kmovw %k0, %ecx
; X64-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; X64-NEXT: kmovw %k0, %edx
; X64-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: orl %ecx, %edx
; X64-NEXT: orl %esi, %eax
; X64-NEXT: orl %edx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: orl %ecx, %edx
; X86-NEXT: orl %esi, %eax
; X86-NEXT: orl %edx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
%res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
%res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
%res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
%res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)

%res11 = or i8 %res1, %res2
%res12 = or i8 %res3, %res4
%res13 = or i8 %res11, %res12
ret i8 %res13
}

declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)

define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_cmp_ss:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X86-LABEL: test_int_x86_avx512_mask_cmp_ss:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
ret i8 %res2
}

define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcmpless %xmm1, %xmm0, %k0
; X64-NEXT: kmovw %k0, %ecx
; X64-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
; X64-NEXT: kmovw %k0, %edx
; X64-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %esi
; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: andl %ecx, %edx
; X64-NEXT: andl %esi, %eax
; X64-NEXT: andl %edx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcmpless %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
; X86-NEXT: kmovw %k0, %edx
; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %esi
; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: andl %ecx, %edx
; X86-NEXT: andl %esi, %eax
; X86-NEXT: andl %edx, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
%res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
%res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)

%res11 = and i8 %res1, %res2
%res12 = and i8 %res3, %res4
%res13 = and i8 %res11, %res12
ret i8 %res13
}
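
; getmant tests: per the VGETMANT imm8 layout, the $11-$14 immediates seen
; in the asm pack the normalization interval in bits 1:0 and the sign
; control in bits 3:2.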

declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

define <8 x double> @test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
; X64-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
; X86-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; X86-NEXT: retl
%res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

define <16 x float> @test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
; X64-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
; X86-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
%res2 = fadd <16 x float> %res, %res1
ret <16 x float> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)

define <2 x double> @test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT: vgetmantsd $12, %xmm1, %xmm0, %xmm4 {%k1} {z}
; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X64-NEXT: vgetmantsd $13, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vgetmantsd $14, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT: vgetmantsd $12, %xmm1, %xmm0, %xmm4 {%k1} {z}
; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X86-NEXT: vgetmantsd $13, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vgetmantsd $14, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4)
%res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8)
%res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4)
%res11 = fadd <2 x double> %res, %res1
%res12 = fadd <2 x double> %res2, %res3
%res13 = fadd <2 x double> %res11, %res12
ret <2 x double> %res13
}

declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)

define <4 x float> @test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vgetmantss $12, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X64-NEXT: vaddps %xmm3, %xmm2, %xmm2
; X64-NEXT: vgetmantss $13, {sae}, %xmm1, %xmm0, %xmm3
; X64-NEXT: vgetmantss $14, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vgetmantss $12, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X86-NEXT: vaddps %xmm3, %xmm2, %xmm2
; X86-NEXT: vgetmantss $13, {sae}, %xmm1, %xmm0, %xmm3
; X86-NEXT: vgetmantss $14, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X86-NEXT: retl
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4)
%res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8)
%res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4)
%res11 = fadd <4 x float> %res, %res1
%res12 = fadd <4 x float> %res2, %res3
%res13 = fadd <4 x float> %res11, %res12
ret <4 x float> %res13
}

define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) {
; X64-LABEL: test_int_x86_avx512_mask_getmant_ss_load:
; X64: # %bb.0:
; X64-NEXT: vgetmantss $11, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_getmant_ss_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vgetmantss $11, (%eax), %xmm0, %xmm0
; X86-NEXT: retl
%x1 = load <4 x float>, ptr %x1p
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %res
}
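
; The getmant immediate packs two fields (per the VGETMANT encoding): bits
; [1:0] pick the normalization interval and bits [3:2] the sign control, so
; $11 (0b1011) requests interval code 3, roughly [0.75, 1.5), with qNaN
; returned for negative inputs. The $12..$14 immediates above only vary
; these fields to cover distinct encodings.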

declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)

define <8 x double> @test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
ret <8 x double> %res
}

define <8 x double> @test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
ret <8 x double> %res2
}

define <8 x double> @test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)

define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
ret <16 x float> %res
}

define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
ret <16 x float> %res2
}

define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
ret <16 x float> %res2
}

; Test case to make sure we can print shuffle decode comments for constant pool loads.
define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
ret <16 x float> %res
}

define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
ret <16 x float> %res2
}

define <16 x float> @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
ret <16 x float> %res2
}
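
; The decoded shuffle comments follow from vpermilps indexing within each
; 128-bit lane: element i of lane k reads lane-local index idx[4k+i] & 3,
; i.e. global element 4k + (idx[4k+i] & 3). Lane 2 of the constant
; <..., 2, 3, 0, 1, ...> therefore decodes to zmm0[10,11,8,9].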

declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)

define <2 x double> @test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; X86-NEXT: retl
%res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)

define <4 x float> @test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X86-NEXT: retl
%res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
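
; The trailing i32 of the rounding intrinsics uses the _MM_FROUND encoding:
; 4 = CUR_DIRECTION (no modifier printed), 8 = round-to-nearest with
; suppressed exceptions (printed {rn-sae}, or plain {sae} where no rounding
; applies), 9 = {rd-sae}, 10 = {ru-sae}, 11 = {rz-sae}, matching the
; {rz-sae}/{rn-sae} pair checked above.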

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)

define <16 x i32> @test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
ret <16 x i32> %1
}

define <16 x i32> @test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
%1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
%2 = bitcast i16 %x4 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
ret <16 x i32> %3
}

define <16 x i32> @test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
%2 = bitcast i16 %x4 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
ret <16 x i32> %3
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)

define <8 x i64> @test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
ret <8 x i64> %1
}

define <8 x i64> @test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
%1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
%2 = bitcast i8 %x4 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
ret <8 x i64> %3
}

define <8 x i64> @test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
%2 = bitcast i8 %x4 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
ret <8 x i64> %3
}
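
; The vpternlog immediate is the truth table of its three sources: bit
; (a<<2 | b<<1 | c) of the imm8 is the result for inputs (a, b, c).
; $33 = 0x21 sets bits 0 and 5, i.e. dst = (~a & ~b & ~c) | (a & ~b & c);
; the value is arbitrary and only exercises the encoding.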

define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
ret i32 %res
}

define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
ret i32 %res
}

define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
ret i32 %res
}

define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
ret i32 %res
}

define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
ret i32 %res
}

define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
ret i32 %res
}

define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
ret i32 %res
}

define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
ret i32 %res
}

declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)

define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK: # %bb.0:
; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
ret i32 %res
}

declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
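
; llvm.x86.avx512.vcomi.sd/ss reuse the VCMP predicate encoding noted above;
; the ucomi tests pick the unordered, non-signaling predicates (8 = EQ_UQ,
; 9 = NGE_US), hence vcmpeq_uqsd and vcmpngesd in the checks, and i32 8 in
; the last operand again requests {sae}.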

declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)

define <8 x double> @test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) {
; CHECK-LABEL: test_int_x86_avx512_permvar_df_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
ret <8 x double> %1
}

define <8 x double> @test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
%1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
ret <8 x double> %3
}

define <8 x double> @test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
ret <8 x double> %3
}

declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)

define <8 x i64> @test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: test_int_x86_avx512_permvar_di_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
ret <8 x i64> %1
}

define <8 x i64> @test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
ret <8 x i64> %3
}

define <8 x i64> @test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
ret <8 x i64> %3
}

declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)

define <16 x float> @test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
ret <16 x float> %1
}

define <16 x float> @test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
%1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
ret <16 x float> %3
}

define <16 x float> @test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
ret <16 x float> %3
}

declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)

define <16 x i32> @test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_permvar_si_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
ret <16 x i32> %1
}

define <16 x i32> @test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
ret <16 x i32> %3
}

define <16 x i32> @test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) {
; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
%1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
ret <16 x i32> %3
}
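
; In these permvar checks the data operand prints first and the index vector
; second in AT&T syntax (vpermd %data, %indices, %dst). The unmasked integer
; forms are checked as vpermpd/vpermps because, with no mask tying the result
; to the integer domain, the compiler may pick the FP-domain instruction.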

declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)

define <8 x double> @test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm3
; X64-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X64-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
; X64-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; X64-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm3
; X86-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X86-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
; X86-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; X86-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; X86-NEXT: retl
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
%res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
%res3 = fadd <8 x double> %res, %res1
%res4 = fadd <8 x double> %res3, %res2
ret <8 x double> %res4
}

define <8 x double> @test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
; X64: # %bb.0:
; X64-NEXT: vfixupimmpd $3, (%rdi), %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vfixupimmpd $3, (%eax), %zmm1, %zmm0
; X86-NEXT: retl
%x2 = load <8 x i64>, ptr %x2ptr
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)

define <8 x double> @test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm3
; X64-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovapd %zmm0, %zmm5
; X64-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
; X64-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; X64-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm3
; X86-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovapd %zmm0, %zmm5
; X86-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
; X86-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; X86-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; X86-NEXT: retl
%res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
%res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
%res3 = fadd <8 x double> %res, %res1
%res4 = fadd <8 x double> %res3, %res2
ret <8 x double> %res4
}

declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)

define <4 x float> @test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm0, %xmm3
; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovaps %xmm0, %xmm5
; X64-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
; X64-NEXT: vaddps %xmm5, %xmm3, %xmm3
; X64-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm0, %xmm3
; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovaps %xmm0, %xmm5
; X86-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
; X86-NEXT: vaddps %xmm5, %xmm3, %xmm3
; X86-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
%res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
%res3 = fadd <4 x float> %res, %res1
%res4 = fadd <4 x float> %res3, %res2
ret <4 x float> %res4
}

declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)

define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm0, %xmm3
; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovaps %xmm0, %xmm5
; X64-NEXT: vfixupimmss $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; X64-NEXT: vaddps %xmm5, %xmm3, %xmm3
; X64-NEXT: vfixupimmss $6, %xmm2, %xmm1, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm0, %xmm3
; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovaps %xmm0, %xmm5
; X86-NEXT: vfixupimmss $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; X86-NEXT: vaddps %xmm5, %xmm3, %xmm3
; X86-NEXT: vfixupimmss $6, %xmm2, %xmm1, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
%res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4)
%res3 = fadd <4 x float> %res, %res1
%res4 = fadd <4 x float> %res3, %res2
ret <4 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)

define <16 x float> @test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm3
; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovaps %zmm0, %zmm5
; X64-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
; X64-NEXT: vaddps %zmm5, %zmm3, %zmm3
; X64-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovaps %zmm0, %zmm3
; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovaps %zmm0, %zmm5
; X86-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
; X86-NEXT: vaddps %zmm5, %zmm3, %zmm3
; X86-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
%res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
%res3 = fadd <16 x float> %res, %res1
%res4 = fadd <16 x float> %res3, %res2
ret <16 x float> %res4
}

define <16 x float> @test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
; X64: # %bb.0:
; X64-NEXT: vfixupimmps $5, (%rdi), %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vfixupimmps $5, (%eax), %zmm1, %zmm0
; X86-NEXT: retl
%x2 = load <16 x i32>, ptr %x2ptr
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
ret <16 x float> %res
}

declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)

define <16 x float> @test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm3
; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovaps %zmm0, %zmm5
; X64-NEXT: vfixupimmps $6, {sae}, %zmm4, %zmm1, %zmm5 {%k1} {z}
; X64-NEXT: vaddps %zmm5, %zmm3, %zmm3
; X64-NEXT: vfixupimmps $7, %zmm2, %zmm1, %zmm0
; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovaps %zmm0, %zmm3
; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovaps %zmm0, %zmm5
; X86-NEXT: vfixupimmps $6, {sae}, %zmm4, %zmm1, %zmm5 {%k1} {z}
; X86-NEXT: vaddps %zmm5, %zmm3, %zmm3
; X86-NEXT: vfixupimmps $7, %zmm2, %zmm1, %zmm0
; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
; X86-NEXT: retl
%res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8)
%res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4)
%res3 = fadd <16 x float> %res, %res1
%res4 = fadd <16 x float> %res3, %res2
ret <16 x float> %res4
}

declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)

define <2 x double> @test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm0, %xmm3
; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovapd %xmm0, %xmm5
; X64-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1}
; X64-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; X64-NEXT: vfixupimmsd $6, %xmm2, %xmm1, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm0, %xmm3
; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovapd %xmm0, %xmm5
; X86-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1}
; X86-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; X86-NEXT: vfixupimmsd $6, %xmm2, %xmm1, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
%res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4)
%res3 = fadd <2 x double> %res, %res1
%res4 = fadd <2 x double> %res3, %res2
ret <2 x double> %res4
}

declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)

define <2 x double> @test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm0, %xmm3
; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X64-NEXT: vmovapd %xmm0, %xmm5
; X64-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; X64-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; X64-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm0, %xmm3
; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; X86-NEXT: vmovapd %xmm0, %xmm5
; X86-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
; X86-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; X86-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
%res3 = fadd <2 x double> %res, %res1
%res4 = fadd <2 x double> %res3, %res2
ret <2 x double> %res4
}
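
; vfixupimm consults per-class fixup tokens supplied in the integer source,
; with the immediate selecting which special-value cases additionally raise
; exceptions (the exact field layout is in the SDM). The varying $2..$7
; immediates above exercise distinct encodings rather than specific fixups,
; and i32 8 versus 4 in the final operand again toggles {sae}.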

declare double @llvm.fma.f64(double, double, double) #1
declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0

define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm0, %xmm3
; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
; X64-NEXT: vmovapd %xmm0, %xmm4
; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X64-NEXT: vfmadd213sd {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm0, %xmm3
; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
; X86-NEXT: vmovapd %xmm0, %xmm4
; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X86-NEXT: vfmadd213sd {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: vaddpd %xmm3, %xmm0, %xmm0
; X86-NEXT: retl
%1 = extractelement <2 x double> %x0, i64 0
%2 = extractelement <2 x double> %x1, i64 0
%3 = extractelement <2 x double> %x2, i64 0
%4 = call double @llvm.fma.f64(double %1, double %2, double %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %1
%8 = insertelement <2 x double> %x0, double %7, i64 0
%9 = extractelement <2 x double> %x0, i64 0
%10 = extractelement <2 x double> %x1, i64 0
%11 = extractelement <2 x double> %x2, i64 0
%12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
%13 = insertelement <2 x double> %x0, double %12, i64 0
%14 = extractelement <2 x double> %x0, i64 0
%15 = extractelement <2 x double> %x1, i64 0
%16 = extractelement <2 x double> %x2, i64 0
%17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10)
%18 = bitcast i8 %x3 to <8 x i1>
%19 = extractelement <8 x i1> %18, i64 0
%20 = select i1 %19, double %17, double %14
%21 = insertelement <2 x double> %x0, double %20, i64 0
%res3 = fadd <2 x double> %8, %13
%res4 = fadd <2 x double> %21, %res3
ret <2 x double> %res4
}

define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm0, %xmm3
; X64-NEXT: vfmadd213ss {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
; X64-NEXT: vmovaps %xmm0, %xmm4
; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X64-NEXT: vfmadd213ss {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: vaddps %xmm3, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm0, %xmm3
; X86-NEXT: vfmadd213ss {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
; X86-NEXT: vmovaps %xmm0, %xmm4
; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X86-NEXT: vfmadd213ss {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: vaddps %xmm3, %xmm0, %xmm0
; X86-NEXT: retl
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %x1, i64 0
%3 = extractelement <4 x float> %x2, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float %1
%8 = insertelement <4 x float> %x0, float %7, i64 0
%9 = extractelement <4 x float> %x0, i64 0
%10 = extractelement <4 x float> %x1, i64 0
%11 = extractelement <4 x float> %x2, i64 0
%12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
%13 = insertelement <4 x float> %x0, float %12, i64 0
%14 = extractelement <4 x float> %x0, i64 0
%15 = extractelement <4 x float> %x1, i64 0
%16 = extractelement <4 x float> %x2, i64 0
%17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10)
%18 = bitcast i8 %x3 to <8 x i1>
%19 = extractelement <8 x i1> %18, i64 0
%20 = select i1 %19, float %17, float %14
%21 = insertelement <4 x float> %x0, float %20, i64 0
%res3 = fadd <4 x float> %8, %13
%res4 = fadd <4 x float> %21, %res3
ret <4 x float> %res4
}

define <2 x double> @test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm0, %xmm3
; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm0, %xmm3
; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%1 = extractelement <2 x double> %x0, i64 0
%2 = extractelement <2 x double> %x1, i64 0
%3 = extractelement <2 x double> %x2, i64 0
%4 = call double @llvm.fma.f64(double %1, double %2, double %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double 0.000000e+00
%8 = insertelement <2 x double> %x0, double %7, i64 0
%9 = extractelement <2 x double> %x0, i64 0
%10 = extractelement <2 x double> %x1, i64 0
%11 = extractelement <2 x double> %x2, i64 0
%12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
%13 = bitcast i8 %x3 to <8 x i1>
%14 = extractelement <8 x i1> %13, i64 0
%15 = select i1 %14, double %12, double 0.000000e+00
%16 = insertelement <2 x double> %x0, double %15, i64 0
%res2 = fadd <2 x double> %8, %16
ret <2 x double> %res2
}

declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0

define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm0, %xmm3
; X64-NEXT: vfmadd213ss {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm0, %xmm3
; X86-NEXT: vfmadd213ss {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT: retl
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %x1, i64 0
%3 = extractelement <4 x float> %x2, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float 0.000000e+00
%8 = insertelement <4 x float> %x0, float %7, i64 0
%9 = extractelement <4 x float> %x0, i64 0
%10 = extractelement <4 x float> %x1, i64 0
%11 = extractelement <4 x float> %x2, i64 0
%12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
%13 = bitcast i8 %x3 to <8 x i1>
%14 = extractelement <8 x i1> %13, i64 0
%15 = select i1 %14, float %12, float 0.000000e+00
%16 = insertelement <4 x float> %x0, float %15, i64 0
%res2 = fadd <4 x float> %8, %16
ret <4 x float> %res2
}
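
; All of the scalar FMA tests above follow one pattern: extract lane 0 of
; each source, apply llvm.fma (or the explicit-rounding variant), select
; against bit 0 of the i8 mask, and reinsert the result. A minimal sketch of
; that pattern follows; the function name is hypothetical and not part of
; the autogenerated checks.
define <2 x double> @example_masked_scalar_fma(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; operate on element 0 only, as the vfmadd...sd forms do
%a0 = extractelement <2 x double> %a, i64 0
%b0 = extractelement <2 x double> %b, i64 0
%c0 = extractelement <2 x double> %c, i64 0
%fma = call double @llvm.fma.f64(double %a0, double %b0, double %c0)
; bit 0 of the mask chooses between the FMA result and the passthru lane
%bits = bitcast i8 %mask to <8 x i1>
%m0 = extractelement <8 x i1> %bits, i64 0
%sel = select i1 %m0, double %fma, double %a0
; the upper element of %a passes through unchanged
%r = insertelement <2 x double> %a, double %sel, i64 0
ret <2 x double> %r
}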

; Make sure we don't commute this to fold the load as that source isn't commutable.
define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) {
; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rsi), %xmm2
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm2 {%k1} {z} = (xmm0 * xmm2) + xmm1
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
; X86: # %bb.0:
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 {%k1} {z} = (xmm0 * mem) + xmm1
; X86-NEXT: retl
%5 = load <4 x float>, ptr %1, align 16
%6 = extractelement <4 x float> %5, i64 0
%7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2
%8 = bitcast i8 %0 to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %7, float 0.000000e+00
%11 = insertelement <4 x float> %5, float %10, i64 0
ret <4 x float> %11
}

define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vfmadd231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
; X64-NEXT: vmovapd %xmm2, %xmm4
; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X64-NEXT: vfmadd231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vfmadd231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
; X86-NEXT: vmovapd %xmm2, %xmm4
; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X86-NEXT: vfmadd231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X86-NEXT: retl
%1 = extractelement <2 x double> %x0, i64 0
%2 = extractelement <2 x double> %x1, i64 0
%3 = extractelement <2 x double> %x2, i64 0
%4 = call double @llvm.fma.f64(double %1, double %2, double %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %3
%8 = insertelement <2 x double> %x2, double %7, i64 0
%9 = extractelement <2 x double> %x0, i64 0
%10 = extractelement <2 x double> %x1, i64 0
%11 = extractelement <2 x double> %x2, i64 0
%12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
%13 = insertelement <2 x double> %x2, double %12, i64 0
%14 = extractelement <2 x double> %x0, i64 0
%15 = extractelement <2 x double> %x1, i64 0
%16 = extractelement <2 x double> %x2, i64 0
%17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10)
%18 = bitcast i8 %x3 to <8 x i1>
%19 = extractelement <8 x i1> %18, i64 0
%20 = select i1 %19, double %17, double %16
%21 = insertelement <2 x double> %x2, double %20, i64 0
%res3 = fadd <2 x double> %8, %13
%res4 = fadd <2 x double> %21, %res3
ret <2 x double> %res4
}
5986 define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
5987 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
5989 ; X64-NEXT: kmovw %edi, %k1
5990 ; X64-NEXT: vmovaps %xmm2, %xmm3
5991 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
5992 ; X64-NEXT: vmovaps %xmm2, %xmm4
5993 ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
5994 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
5995 ; X64-NEXT: vfmadd231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5996 ; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0
5999 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
6001 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6002 ; X86-NEXT: kmovw %eax, %k1
6003 ; X86-NEXT: vmovaps %xmm2, %xmm3
6004 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
6005 ; X86-NEXT: vmovaps %xmm2, %xmm4
6006 ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6007 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
6008 ; X86-NEXT: vfmadd231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6009 ; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0
6011 %1 = extractelement <4 x float> %x0, i64 0
6012 %2 = extractelement <4 x float> %x1, i64 0
6013 %3 = extractelement <4 x float> %x2, i64 0
6014 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6015 %5 = bitcast i8 %x3 to <8 x i1>
6016 %6 = extractelement <8 x i1> %5, i64 0
6017 %7 = select i1 %6, float %4, float %3
6018 %8 = insertelement <4 x float> %x2, float %7, i64 0
6019 %9 = extractelement <4 x float> %x0, i64 0
6020 %10 = extractelement <4 x float> %x1, i64 0
6021 %11 = extractelement <4 x float> %x2, i64 0
6022 %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
6023 %13 = insertelement <4 x float> %x2, float %12, i64 0
6024 %14 = extractelement <4 x float> %x0, i64 0
6025 %15 = extractelement <4 x float> %x1, i64 0
6026 %16 = extractelement <4 x float> %x2, i64 0
6027 %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10)
6028 %18 = bitcast i8 %x3 to <8 x i1>
6029 %19 = extractelement <8 x i1> %18, i64 0
6030 %20 = select i1 %19, float %17, float %16
6031 %21 = insertelement <4 x float> %x2, float %20, i64 0
6032 %res3 = fadd <4 x float> %8, %13
6033 %res4 = fadd <4 x float> %21, %res3
6034 ret <4 x float> %res4
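; The memfold tests verify that a scalar load feeding an FMA is folded into
; the instruction's memory operand where profitable, and that the masked
; (merge) and maskz (zeroing) forms select or zero the low element before
; the result is stored back.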
define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) {
; X64-LABEL: fmadd_ss_mask_memfold:
; X64: # %bb.0:
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; X64-NEXT: kmovw %edx, %k1
; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; X64-NEXT: vmovss %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: fmadd_ss_mask_memfold:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; X86-NEXT: vmovss %xmm0, (%edx)
; X86-NEXT: retl
%a.val = load float, ptr %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
%av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
%av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

%b.val = load float, ptr %b
%bv0 = insertelement <4 x float> undef, float %b.val, i32 0
%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
%1 = extractelement <4 x float> %av, i64 0
%2 = extractelement <4 x float> %bv, i64 0
%3 = extractelement <4 x float> %av, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %c to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float %1
%8 = insertelement <4 x float> %av, float %7, i64 0
%sr = extractelement <4 x float> %8, i32 0
store float %sr, ptr %a
ret void
}

define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) {
; X64-LABEL: fmadd_ss_maskz_memfold:
; X64: # %bb.0:
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; X64-NEXT: kmovw %edx, %k1
; X64-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: vmovss %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: fmadd_ss_maskz_memfold:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: vmovss %xmm0, (%edx)
; X86-NEXT: retl
%a.val = load float, ptr %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
%av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
%av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

%b.val = load float, ptr %b
%bv0 = insertelement <4 x float> undef, float %b.val, i32 0
%bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
%bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
%bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
%1 = extractelement <4 x float> %av, i64 0
%2 = extractelement <4 x float> %bv, i64 0
%3 = extractelement <4 x float> %av, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %c to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float 0.000000e+00
%8 = insertelement <4 x float> %av, float %7, i64 0
%sr = extractelement <4 x float> %8, i32 0
store float %sr, ptr %a
ret void
}

define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) {
; X64-LABEL: fmadd_sd_mask_memfold:
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; X64-NEXT: kmovw %edx, %k1
; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; X64-NEXT: vmovsd %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: fmadd_sd_mask_memfold:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; X86-NEXT: vmovsd %xmm0, (%edx)
; X86-NEXT: retl
%a.val = load double, ptr %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

%b.val = load double, ptr %b
%bv0 = insertelement <2 x double> undef, double %b.val, i32 0
%bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
%1 = extractelement <2 x double> %av, i64 0
%2 = extractelement <2 x double> %bv, i64 0
%3 = extractelement <2 x double> %av, i64 0
%4 = call double @llvm.fma.f64(double %1, double %2, double %3)
%5 = bitcast i8 %c to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %1
%8 = insertelement <2 x double> %av, double %7, i64 0
%sr = extractelement <2 x double> %8, i32 0
store double %sr, ptr %a
ret void
}

define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) {
; X64-LABEL: fmadd_sd_maskz_memfold:
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; X64-NEXT: kmovw %edx, %k1
; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: vmovsd %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: fmadd_sd_maskz_memfold:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: vmovsd %xmm0, (%edx)
; X86-NEXT: retl
%a.val = load double, ptr %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

%b.val = load double, ptr %b
%bv0 = insertelement <2 x double> undef, double %b.val, i32 0
%bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
%1 = extractelement <2 x double> %av, i64 0
%2 = extractelement <2 x double> %bv, i64 0
%3 = extractelement <2 x double> %av, i64 0
%4 = call double @llvm.fma.f64(double %1, double %2, double %3)
%5 = bitcast i8 %c to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double 0.000000e+00
%8 = insertelement <2 x double> %av, double %7, i64 0
%sr = extractelement <2 x double> %8, i32 0
store double %sr, ptr %a
ret void
}

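; fmsub and fnmsub have no separate intrinsics here: the subtracted (and,
; for fnmsub, the negated multiplicand) operands are wrapped in fneg and
; fed to llvm.fma / llvm.x86.avx512.vfmadd, which should still select the
; vfmsub231 / vfnmsub231 instruction forms.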
define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vfmsub231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
; X64-NEXT: vmovapd %xmm2, %xmm4
; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X64-NEXT: vfmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vfmsub231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
; X86-NEXT: vmovapd %xmm2, %xmm4
; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X86-NEXT: vfmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X86-NEXT: retl
%1 = fneg <2 x double> %x2
%2 = extractelement <2 x double> %x0, i64 0
%3 = extractelement <2 x double> %x1, i64 0
%4 = extractelement <2 x double> %1, i64 0
%5 = call double @llvm.fma.f64(double %2, double %3, double %4)
%6 = extractelement <2 x double> %x2, i64 0
%7 = bitcast i8 %x3 to <8 x i1>
%8 = extractelement <8 x i1> %7, i64 0
%9 = select i1 %8, double %5, double %6
%10 = insertelement <2 x double> %x2, double %9, i64 0
%11 = fneg <2 x double> %x2
%12 = extractelement <2 x double> %x0, i64 0
%13 = extractelement <2 x double> %x1, i64 0
%14 = extractelement <2 x double> %11, i64 0
%15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11)
%16 = extractelement <2 x double> %x2, i64 0
%17 = insertelement <2 x double> %x2, double %15, i64 0
%18 = fneg <2 x double> %x2
%19 = extractelement <2 x double> %x0, i64 0
%20 = extractelement <2 x double> %x1, i64 0
%21 = extractelement <2 x double> %18, i64 0
%22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10)
%23 = extractelement <2 x double> %x2, i64 0
%24 = bitcast i8 %x3 to <8 x i1>
%25 = extractelement <8 x i1> %24, i64 0
%26 = select i1 %25, double %22, double %23
%27 = insertelement <2 x double> %x2, double %26, i64 0
%res3 = fadd <2 x double> %10, %17
%res4 = fadd <2 x double> %27, %res3
ret <2 x double> %res4
}

define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm2, %xmm3
; X64-NEXT: vfmsub231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
; X64-NEXT: vmovaps %xmm2, %xmm4
; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X64-NEXT: vfmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm2, %xmm3
; X86-NEXT: vfmsub231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
; X86-NEXT: vmovaps %xmm2, %xmm4
; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X86-NEXT: vfmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0
; X86-NEXT: retl
%1 = fneg <4 x float> %x2
%2 = extractelement <4 x float> %x0, i64 0
%3 = extractelement <4 x float> %x1, i64 0
%4 = extractelement <4 x float> %1, i64 0
%5 = call float @llvm.fma.f32(float %2, float %3, float %4)
%6 = extractelement <4 x float> %x2, i64 0
%7 = bitcast i8 %x3 to <8 x i1>
%8 = extractelement <8 x i1> %7, i64 0
%9 = select i1 %8, float %5, float %6
%10 = insertelement <4 x float> %x2, float %9, i64 0
%11 = fneg <4 x float> %x2
%12 = extractelement <4 x float> %x0, i64 0
%13 = extractelement <4 x float> %x1, i64 0
%14 = extractelement <4 x float> %11, i64 0
%15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11)
%16 = extractelement <4 x float> %x2, i64 0
%17 = insertelement <4 x float> %x2, float %15, i64 0
%18 = fneg <4 x float> %x2
%19 = extractelement <4 x float> %x0, i64 0
%20 = extractelement <4 x float> %x1, i64 0
%21 = extractelement <4 x float> %18, i64 0
%22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10)
%23 = extractelement <4 x float> %x2, i64 0
%24 = bitcast i8 %x3 to <8 x i1>
%25 = extractelement <8 x i1> %24, i64 0
%26 = select i1 %25, float %22, float %23
%27 = insertelement <4 x float> %x2, float %26, i64 0
%res3 = fadd <4 x float> %10, %17
%res4 = fadd <4 x float> %27, %res3
ret <4 x float> %res4
}

define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vfnmsub231sd {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
; X64-NEXT: vmovapd %xmm2, %xmm4
; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X64-NEXT: vfnmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vfnmsub231sd {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
; X86-NEXT: vmovapd %xmm2, %xmm4
; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
; X86-NEXT: vfnmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0
; X86-NEXT: retl
%1 = fneg <2 x double> %x0
%2 = fneg <2 x double> %x2
%3 = extractelement <2 x double> %1, i64 0
%4 = extractelement <2 x double> %x1, i64 0
%5 = extractelement <2 x double> %2, i64 0
%6 = call double @llvm.fma.f64(double %3, double %4, double %5)
%7 = extractelement <2 x double> %x2, i64 0
%8 = bitcast i8 %x3 to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, double %6, double %7
%11 = insertelement <2 x double> %x2, double %10, i64 0
%12 = fneg <2 x double> %x0
%13 = fneg <2 x double> %x2
%14 = extractelement <2 x double> %12, i64 0
%15 = extractelement <2 x double> %x1, i64 0
%16 = extractelement <2 x double> %13, i64 0
%17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
%18 = extractelement <2 x double> %x2, i64 0
%19 = insertelement <2 x double> %x2, double %17, i64 0
%20 = fneg <2 x double> %x0
%21 = fneg <2 x double> %x2
%22 = extractelement <2 x double> %20, i64 0
%23 = extractelement <2 x double> %x1, i64 0
%24 = extractelement <2 x double> %21, i64 0
%25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10)
%26 = extractelement <2 x double> %x2, i64 0
%27 = bitcast i8 %x3 to <8 x i1>
%28 = extractelement <8 x i1> %27, i64 0
%29 = select i1 %28, double %25, double %26
%30 = insertelement <2 x double> %x2, double %29, i64 0
%res3 = fadd <2 x double> %11, %19
%res4 = fadd <2 x double> %30, %res3
ret <2 x double> %res4
}

define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm2, %xmm3
; X64-NEXT: vfnmsub231ss {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
; X64-NEXT: vmovaps %xmm2, %xmm4
; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X64-NEXT: vfnmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm2, %xmm3
; X86-NEXT: vfnmsub231ss {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
; X86-NEXT: vmovaps %xmm2, %xmm4
; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
; X86-NEXT: vfnmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0
; X86-NEXT: retl
%1 = fneg <4 x float> %x0
%2 = fneg <4 x float> %x2
%3 = extractelement <4 x float> %1, i64 0
%4 = extractelement <4 x float> %x1, i64 0
%5 = extractelement <4 x float> %2, i64 0
%6 = call float @llvm.fma.f32(float %3, float %4, float %5)
%7 = extractelement <4 x float> %x2, i64 0
%8 = bitcast i8 %x3 to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %6, float %7
%11 = insertelement <4 x float> %x2, float %10, i64 0
%12 = fneg <4 x float> %x0
%13 = fneg <4 x float> %x2
%14 = extractelement <4 x float> %12, i64 0
%15 = extractelement <4 x float> %x1, i64 0
%16 = extractelement <4 x float> %13, i64 0
%17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
%18 = extractelement <4 x float> %x2, i64 0
%19 = insertelement <4 x float> %x2, float %17, i64 0
%20 = fneg <4 x float> %x0
%21 = fneg <4 x float> %x2
%22 = extractelement <4 x float> %20, i64 0
%23 = extractelement <4 x float> %x1, i64 0
%24 = extractelement <4 x float> %21, i64 0
%25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10)
%26 = extractelement <4 x float> %x2, i64 0
%27 = bitcast i8 %x3 to <8 x i1>
%28 = extractelement <8 x i1> %27, i64 0
%29 = select i1 %28, float %25, float %26
%30 = insertelement <4 x float> %x2, float %29, i64 0
%res3 = fadd <4 x float> %11, %19
%res4 = fadd <4 x float> %30, %res3
ret <4 x float> %res4
}

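; The _rm tests load one multiplicand from memory so the FMA's memory
; operand form is selected. In the maskz variant the select condition is
; the literal i1 false, so the computation folds away and only the zeroing
; of the low element remains.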
define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; X64: # %bb.0:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vfmadd231ss {{.*#+}} xmm1 {%k1} = (xmm0 * mem) + xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vfmadd231ss {{.*#+}} xmm1 {%k1} = (xmm0 * mem) + xmm1
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: retl
%q = load float, ptr %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %vecinit.i, i64 0
%3 = extractelement <4 x float> %x1, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float %3
%8 = insertelement <4 x float> %x1, float %7, i64 0
ret <4 x float> %8
}

define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) {
; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; X64: # %bb.0:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vfmadd132ss {{.*#+}} xmm0 {%k1} = (xmm0 * mem) + xmm1
; X64-NEXT: retq
;
; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 {%k1} = (xmm0 * mem) + xmm1
; X86-NEXT: retl
%q = load float, ptr %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %vecinit.i, i64 0
%3 = extractelement <4 x float> %x1, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, float %4, float %1
%8 = insertelement <4 x float> %x0, float %7, i64 0
ret <4 x float> %8
}

define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr %ptr_b, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; CHECK-NEXT: ret{{[l|q]}}
%q = load float, ptr %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%1 = extractelement <4 x float> %x0, i64 0
%2 = extractelement <4 x float> %x1, i64 0
%3 = extractelement <4 x float> %vecinit.i, i64 0
%4 = call float @llvm.fma.f32(float %1, float %2, float %3)
%5 = select i1 false, float %4, float 0.000000e+00
%6 = insertelement <4 x float> %x0, float %5, i64 0
ret <4 x float> %6
}

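; The shift tests below call the unmasked 512-bit shift intrinsics and
; model masking in IR as a bitcast of the integer mask followed by a
; select; the backend is expected to fold the select into the {%k1}
; (merge) or {%k1} {z} (zeroing) form of the shift instruction.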
define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psll_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psll_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psll_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psll_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psll_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psll_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psll_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psll_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone

define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_pslli_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_pslli_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_pslli_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_pslli_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone

define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_pslli_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_pslli_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_pslli_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_pslli_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone

define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psra_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psra_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psra_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psra_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone

define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psra_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psra_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psra_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psra_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrai_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrai_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrai_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrai_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone

define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrai_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrai_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrai_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrai_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone

define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrl_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrl_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrl_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrl_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrl_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrl_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrl_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrl_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone

define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrli_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrli_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrli_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrli_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone

define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrli_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrli_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrli_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrli_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone

define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %res
}

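; The _const variants feed the variable shifts constant vectors that
; include out-of-range and negative shift amounts (which the hardware
; shifts out to zero); the checks pin the current codegen, which keeps
; the shifts as instructions rather than constant-folding them.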
define <16 x i32> @test_x86_avx512_psllv_d_512_const() {
; X64-LABEL: test_x86_avx512_psllv_d_512_const:
; X64: # %bb.0:
; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X64-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_psllv_d_512_const:
; X86: # %bb.0:
; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X86-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; X86-NEXT: retl
%res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
%res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
%res2 = add <16 x i32> %res0, %res1
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psllv_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psllv_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psllv_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psllv_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_psllv_q_512_const() {
; X64-LABEL: test_x86_avx512_psllv_q_512_const:
; X64: # %bb.0:
; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
; X64-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_psllv_q_512_const:
; X86: # %bb.0:
; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
; X86-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; X86-NEXT: retl
%res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
%res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
%res2 = add <8 x i64> %res0, %res1
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psllv_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psllv_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psllv_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psllv_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone

define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrav_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrav_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrav_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrav_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrav_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrav_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrav_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrav_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone

define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_psrlv_d_512_const() {
; X64-LABEL: test_x86_avx512_psrlv_d_512_const:
; X64: # %bb.0:
; X64-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X64-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_psrlv_d_512_const:
; X86: # %bb.0:
; X86-NEXT: vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X86-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X86-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; X86-NEXT: retl
%res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
%res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
%res2 = add <16 x i32> %res0, %res1
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrlv_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrlv_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrlv_d_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrlv_d_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%mask.cast = bitcast i16 %mask to <16 x i1>
%res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone

define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_psrlv_q_512_const() {
; X64-LABEL: test_x86_avx512_psrlv_q_512_const:
; X64: # %bb.0:
; X64-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
; X64-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_psrlv_q_512_const:
; X86: # %bb.0:
; X86-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-NEXT: vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
; X86-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; X86-NEXT: retl
%res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
%res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
%res2 = add <8 x i64> %res0, %res1
ret <8 x i64> %res2
}

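; With an i8 mask the X86 side must zero-extend the stack byte (movzbl) before
; kmovw, whereas the i16 masks above are loaded with a single kmovw from memory.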
define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrlv_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_mask_psrlv_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrlv_q_512:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: test_x86_avx512_maskz_psrlv_q_512:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
%mask.cast = bitcast i8 %mask to <8 x i1>
%res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone

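; The cast*_freeze tests below model the _mm512_cast* intrinsics, whose upper
; elements are undefined. Freezing poison for the second shuffle operand lets
; the widening lower to a single self-move of the xmm register (which zeroes
; the upper bits) or, for ymm inputs, to no instruction at all.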
define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <2 x double> poison
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x double> %res
}

define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd256_pd256_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <4 x double> poison
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %res
}

define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps512_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <4 x float> poison
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <16 x float> %res
}

define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps256_ps512_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <8 x float> poison
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %res
}

define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_castsi128_si512_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <2 x i64> poison
%res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x i64> %res
}

define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_castsi256_si512_pd256_freeze:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: ret{{[l|q]}}
%a1 = freeze <4 x i64> poison
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %res
}

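; Regression test for mask-register handling: two 8-bit compare results are
; zero-extended, truncated back, and concatenated in IR. The masks should stay
; in k-registers and combine via kunpckbw rather than bouncing through GPRs.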
define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; X64-LABEL: bad_mask_transition:
; X64: # %bb.0: # %entry
; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X64-NEXT: vcmplt_oqpd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: bad_mask_transition:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovaps 72(%ebp), %zmm3
; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vmovaps 136(%ebp), %zmm3 {%k1}
; X86-NEXT: vmovaps %zmm3, %zmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
entry:
%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%1 = bitcast <8 x i1> %0 to i8
%2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%3 = bitcast <8 x i1> %2 to i8
%conv = zext i8 %1 to i16
%conv2 = zext i8 %3 to i16
%4 = bitcast i16 %conv to <16 x i1>
%5 = bitcast i16 %conv2 to <16 x i1>
%6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
ret <16 x float> %9
}

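; Same pattern with a single compare: the i8 to i16 zero extension of the mask
; is free because the compare already zeroes the upper bits of the k-register,
; so its result can feed vblendmps directly.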
define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; X64-LABEL: bad_mask_transition_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k1
; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: bad_mask_transition_2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovaps 72(%ebp), %zmm2
; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k1
; X86-NEXT: vmovaps 136(%ebp), %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
entry:
%0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
%1 = bitcast <8 x i1> %0 to i8
%conv = zext i8 %1 to i16
%2 = bitcast i16 %conv to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
ret <16 x float> %3
}

declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>)
declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>)
declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>)
declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>)
declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)