1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
3 ; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
6 define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
7 ; X64-LABEL: test_mask_compress_pd_512:
9 ; X64-NEXT: kmovw %edi, %k1
10 ; X64-NEXT: vcompresspd %zmm0, %zmm1 {%k1}
11 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
14 ; X86-LABEL: test_mask_compress_pd_512:
16 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
17 ; X86-NEXT: kmovw %eax, %k1
18 ; X86-NEXT: vcompresspd %zmm0, %zmm1 {%k1}
19 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
21 %1 = bitcast i8 %mask to <8 x i1>
22 %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
26 define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) {
27 ; X64-LABEL: test_maskz_compress_pd_512:
29 ; X64-NEXT: kmovw %edi, %k1
30 ; X64-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z}
33 ; X86-LABEL: test_maskz_compress_pd_512:
35 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
36 ; X86-NEXT: kmovw %eax, %k1
37 ; X86-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z}
39 %1 = bitcast i8 %mask to <8 x i1>
40 %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
44 define <8 x double> @test_compress_pd_512(<8 x double> %data) {
45 ; CHECK-LABEL: test_compress_pd_512:
47 ; CHECK-NEXT: ret{{[l|q]}}
48 %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
52 define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
53 ; X64-LABEL: test_mask_compress_ps_512:
55 ; X64-NEXT: kmovw %edi, %k1
56 ; X64-NEXT: vcompressps %zmm0, %zmm1 {%k1}
57 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
60 ; X86-LABEL: test_mask_compress_ps_512:
62 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
63 ; X86-NEXT: vcompressps %zmm0, %zmm1 {%k1}
64 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
66 %1 = bitcast i16 %mask to <16 x i1>
67 %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
71 define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) {
72 ; X64-LABEL: test_maskz_compress_ps_512:
74 ; X64-NEXT: kmovw %edi, %k1
75 ; X64-NEXT: vcompressps %zmm0, %zmm0 {%k1} {z}
78 ; X86-LABEL: test_maskz_compress_ps_512:
80 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
81 ; X86-NEXT: vcompressps %zmm0, %zmm0 {%k1} {z}
83 %1 = bitcast i16 %mask to <16 x i1>
84 %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
88 define <16 x float> @test_compress_ps_512(<16 x float> %data) {
89 ; CHECK-LABEL: test_compress_ps_512:
91 ; CHECK-NEXT: ret{{[l|q]}}
92 %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
96 define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
97 ; X64-LABEL: test_mask_compress_q_512:
99 ; X64-NEXT: kmovw %edi, %k1
100 ; X64-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
101 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
104 ; X86-LABEL: test_mask_compress_q_512:
106 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
107 ; X86-NEXT: kmovw %eax, %k1
108 ; X86-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
109 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
111 %1 = bitcast i8 %mask to <8 x i1>
112 %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
116 define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) {
117 ; X64-LABEL: test_maskz_compress_q_512:
119 ; X64-NEXT: kmovw %edi, %k1
120 ; X64-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
123 ; X86-LABEL: test_maskz_compress_q_512:
125 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
126 ; X86-NEXT: kmovw %eax, %k1
127 ; X86-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
129 %1 = bitcast i8 %mask to <8 x i1>
130 %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
134 define <8 x i64> @test_compress_q_512(<8 x i64> %data) {
135 ; CHECK-LABEL: test_compress_q_512:
137 ; CHECK-NEXT: ret{{[l|q]}}
138 %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
142 define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
143 ; X64-LABEL: test_mask_compress_d_512:
145 ; X64-NEXT: kmovw %edi, %k1
146 ; X64-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
147 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
150 ; X86-LABEL: test_mask_compress_d_512:
152 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
153 ; X86-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
154 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
156 %1 = bitcast i16 %mask to <16 x i1>
157 %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
161 define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) {
162 ; X64-LABEL: test_maskz_compress_d_512:
164 ; X64-NEXT: kmovw %edi, %k1
165 ; X64-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
168 ; X86-LABEL: test_maskz_compress_d_512:
170 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
171 ; X86-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
173 %1 = bitcast i16 %mask to <16 x i1>
174 %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
178 define <16 x i32> @test_compress_d_512(<16 x i32> %data) {
179 ; CHECK-LABEL: test_compress_d_512:
181 ; CHECK-NEXT: ret{{[l|q]}}
182 %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
186 define <8 x double> @test_expand_pd_512(<8 x double> %data) {
187 ; CHECK-LABEL: test_expand_pd_512:
189 ; CHECK-NEXT: ret{{[l|q]}}
190 %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
194 define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
195 ; X64-LABEL: test_mask_expand_pd_512:
197 ; X64-NEXT: kmovw %edi, %k1
198 ; X64-NEXT: vexpandpd %zmm0, %zmm1 {%k1}
199 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
202 ; X86-LABEL: test_mask_expand_pd_512:
204 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
205 ; X86-NEXT: kmovw %eax, %k1
206 ; X86-NEXT: vexpandpd %zmm0, %zmm1 {%k1}
207 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
209 %1 = bitcast i8 %mask to <8 x i1>
210 %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
214 define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) {
215 ; X64-LABEL: test_maskz_expand_pd_512:
217 ; X64-NEXT: kmovw %edi, %k1
218 ; X64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
221 ; X86-LABEL: test_maskz_expand_pd_512:
223 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
224 ; X86-NEXT: kmovw %eax, %k1
225 ; X86-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
227 %1 = bitcast i8 %mask to <8 x i1>
228 %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
232 define <16 x float> @test_expand_ps_512(<16 x float> %data) {
233 ; CHECK-LABEL: test_expand_ps_512:
235 ; CHECK-NEXT: ret{{[l|q]}}
236 %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
240 define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
241 ; X64-LABEL: test_mask_expand_ps_512:
243 ; X64-NEXT: kmovw %edi, %k1
244 ; X64-NEXT: vexpandps %zmm0, %zmm1 {%k1}
245 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
248 ; X86-LABEL: test_mask_expand_ps_512:
250 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
251 ; X86-NEXT: vexpandps %zmm0, %zmm1 {%k1}
252 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
254 %1 = bitcast i16 %mask to <16 x i1>
255 %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
259 define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) {
260 ; X64-LABEL: test_maskz_expand_ps_512:
262 ; X64-NEXT: kmovw %edi, %k1
263 ; X64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
266 ; X86-LABEL: test_maskz_expand_ps_512:
268 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
269 ; X86-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
271 %1 = bitcast i16 %mask to <16 x i1>
272 %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
276 define <8 x i64> @test_expand_q_512(<8 x i64> %data) {
277 ; CHECK-LABEL: test_expand_q_512:
279 ; CHECK-NEXT: ret{{[l|q]}}
280 %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
284 define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
285 ; X64-LABEL: test_mask_expand_q_512:
287 ; X64-NEXT: kmovw %edi, %k1
288 ; X64-NEXT: vpexpandq %zmm0, %zmm1 {%k1}
289 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
292 ; X86-LABEL: test_mask_expand_q_512:
294 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
295 ; X86-NEXT: kmovw %eax, %k1
296 ; X86-NEXT: vpexpandq %zmm0, %zmm1 {%k1}
297 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
299 %1 = bitcast i8 %mask to <8 x i1>
300 %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
304 define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) {
305 ; X64-LABEL: test_maskz_expand_q_512:
307 ; X64-NEXT: kmovw %edi, %k1
308 ; X64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
311 ; X86-LABEL: test_maskz_expand_q_512:
313 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
314 ; X86-NEXT: kmovw %eax, %k1
315 ; X86-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
317 %1 = bitcast i8 %mask to <8 x i1>
318 %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
322 define <16 x i32> @test_expand_d_512(<16 x i32> %data) {
323 ; CHECK-LABEL: test_expand_d_512:
325 ; CHECK-NEXT: ret{{[l|q]}}
326 %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
330 define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
331 ; X64-LABEL: test_mask_expand_d_512:
333 ; X64-NEXT: kmovw %edi, %k1
334 ; X64-NEXT: vpexpandd %zmm0, %zmm1 {%k1}
335 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
338 ; X86-LABEL: test_mask_expand_d_512:
340 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
341 ; X86-NEXT: vpexpandd %zmm0, %zmm1 {%k1}
342 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
344 %1 = bitcast i16 %mask to <16 x i1>
345 %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
349 define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
350 ; X64-LABEL: test_maskz_expand_d_512:
352 ; X64-NEXT: kmovw %edi, %k1
353 ; X64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
356 ; X86-LABEL: test_maskz_expand_d_512:
358 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
359 ; X86-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
361 %1 = bitcast i16 %mask to <16 x i1>
362 %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
366 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
367 ; CHECK-LABEL: test_rcp_ps_512:
369 ; CHECK-NEXT: vrcp14ps %zmm0, %zmm0
370 ; CHECK-NEXT: ret{{[l|q]}}
371 %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
372 ret <16 x float> %res
374 declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
376 define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
377 ; CHECK-LABEL: test_rcp_pd_512:
379 ; CHECK-NEXT: vrcp14pd %zmm0, %zmm0
380 ; CHECK-NEXT: ret{{[l|q]}}
381 %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
382 ret <8 x double> %res
384 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
386 declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
388 define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
389 ; CHECK-LABEL: test_rndscale_sd:
391 ; CHECK-NEXT: vroundsd $11, %xmm1, %xmm0, %xmm0
392 ; CHECK-NEXT: ret{{[l|q]}}
393 %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
397 define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
398 ; X64-LABEL: test_rndscale_sd_mask:
400 ; X64-NEXT: kmovw %edi, %k1
401 ; X64-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
402 ; X64-NEXT: vmovapd %xmm2, %xmm0
405 ; X86-LABEL: test_rndscale_sd_mask:
407 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
408 ; X86-NEXT: kmovw %eax, %k1
409 ; X86-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
410 ; X86-NEXT: vmovapd %xmm2, %xmm0
412 %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
416 define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, <2 x double>* %bptr, <2 x double> %c, i8 %mask) {
417 ; X64-LABEL: test_rndscale_sd_mask_load:
419 ; X64-NEXT: kmovw %esi, %k1
420 ; X64-NEXT: vrndscalesd $11, (%rdi), %xmm0, %xmm1 {%k1}
421 ; X64-NEXT: vmovapd %xmm1, %xmm0
424 ; X86-LABEL: test_rndscale_sd_mask_load:
426 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
427 ; X86-NEXT: kmovw %eax, %k1
428 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
429 ; X86-NEXT: vrndscalesd $11, (%eax), %xmm0, %xmm1 {%k1}
430 ; X86-NEXT: vmovapd %xmm1, %xmm0
432 %b = load <2 x double>, <2 x double>* %bptr
433 %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
437 define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) {
438 ; X64-LABEL: test_rndscale_sd_maskz:
440 ; X64-NEXT: kmovw %edi, %k1
441 ; X64-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
444 ; X86-LABEL: test_rndscale_sd_maskz:
446 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
447 ; X86-NEXT: kmovw %eax, %k1
448 ; X86-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
450 %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
454 declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
456 define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
457 ; CHECK-LABEL: test_rndscale_ss:
459 ; CHECK-NEXT: vroundss $11, %xmm1, %xmm0, %xmm0
460 ; CHECK-NEXT: ret{{[l|q]}}
461 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
465 define <4 x float> @test_rndscale_ss_load(<4 x float> %a, <4 x float>* %bptr) {
466 ; X64-LABEL: test_rndscale_ss_load:
468 ; X64-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0
471 ; X86-LABEL: test_rndscale_ss_load:
473 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
474 ; X86-NEXT: vroundss $11, (%eax), %xmm0, %xmm0
476 %b = load <4 x float>, <4 x float>* %bptr
477 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
481 define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
482 ; X64-LABEL: test_rndscale_ss_mask:
484 ; X64-NEXT: kmovw %edi, %k1
485 ; X64-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
486 ; X64-NEXT: vmovaps %xmm2, %xmm0
489 ; X86-LABEL: test_rndscale_ss_mask:
491 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
492 ; X86-NEXT: kmovw %eax, %k1
493 ; X86-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
494 ; X86-NEXT: vmovaps %xmm2, %xmm0
496 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
500 define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) {
501 ; X64-LABEL: test_rndscale_ss_maskz:
503 ; X64-NEXT: kmovw %edi, %k1
504 ; X64-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
507 ; X86-LABEL: test_rndscale_ss_maskz:
509 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
510 ; X86-NEXT: kmovw %eax, %k1
511 ; X86-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
513 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
517 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
519 define <8 x double> @test7(<8 x double> %a) {
520 ; CHECK-LABEL: test7:
522 ; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
523 ; CHECK-NEXT: ret{{[l|q]}}
524 %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
528 declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
530 define <16 x float> @test8(<16 x float> %a) {
531 ; CHECK-LABEL: test8:
533 ; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
534 ; CHECK-NEXT: ret{{[l|q]}}
535 %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
539 define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
540 ; CHECK-LABEL: test_rsqrt_ps_512:
542 ; CHECK-NEXT: vrsqrt14ps %zmm0, %zmm0
543 ; CHECK-NEXT: ret{{[l|q]}}
544 %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
545 ret <16 x float> %res
547 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
549 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
550 ; CHECK-LABEL: test_sqrt_pd_512:
552 ; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
553 ; CHECK-NEXT: ret{{[l|q]}}
554 %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
558 define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
559 ; X64-LABEL: test_mask_sqrt_pd_512:
561 ; X64-NEXT: kmovw %edi, %k1
562 ; X64-NEXT: vsqrtpd %zmm0, %zmm1 {%k1}
563 ; X64-NEXT: vmovapd %zmm1, %zmm0
566 ; X86-LABEL: test_mask_sqrt_pd_512:
568 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
569 ; X86-NEXT: kmovw %eax, %k1
570 ; X86-NEXT: vsqrtpd %zmm0, %zmm1 {%k1}
571 ; X86-NEXT: vmovapd %zmm1, %zmm0
573 %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
574 %2 = bitcast i8 %mask to <8 x i1>
575 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
579 define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
580 ; X64-LABEL: test_maskz_sqrt_pd_512:
582 ; X64-NEXT: kmovw %edi, %k1
583 ; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
586 ; X86-LABEL: test_maskz_sqrt_pd_512:
588 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
589 ; X86-NEXT: kmovw %eax, %k1
590 ; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
592 %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
593 %2 = bitcast i8 %mask to <8 x i1>
594 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
597 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
599 define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
600 ; CHECK-LABEL: test_sqrt_round_pd_512:
602 ; CHECK-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0
603 ; CHECK-NEXT: ret{{[l|q]}}
604 %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
608 define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
609 ; X64-LABEL: test_mask_sqrt_round_pd_512:
611 ; X64-NEXT: kmovw %edi, %k1
612 ; X64-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
613 ; X64-NEXT: vmovapd %zmm1, %zmm0
616 ; X86-LABEL: test_mask_sqrt_round_pd_512:
618 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
619 ; X86-NEXT: kmovw %eax, %k1
620 ; X86-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
621 ; X86-NEXT: vmovapd %zmm1, %zmm0
623 %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
624 %2 = bitcast i8 %mask to <8 x i1>
625 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
629 define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
630 ; X64-LABEL: test_maskz_sqrt_round_pd_512:
632 ; X64-NEXT: kmovw %edi, %k1
633 ; X64-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
636 ; X86-LABEL: test_maskz_sqrt_round_pd_512:
638 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
639 ; X86-NEXT: kmovw %eax, %k1
640 ; X86-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
642 %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
643 %2 = bitcast i8 %mask to <8 x i1>
644 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
647 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone
649 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
650 ; CHECK-LABEL: test_sqrt_ps_512:
652 ; CHECK-NEXT: vsqrtps %zmm0, %zmm0
653 ; CHECK-NEXT: ret{{[l|q]}}
654 %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
658 define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
659 ; X64-LABEL: test_mask_sqrt_ps_512:
661 ; X64-NEXT: kmovw %edi, %k1
662 ; X64-NEXT: vsqrtps %zmm0, %zmm1 {%k1}
663 ; X64-NEXT: vmovaps %zmm1, %zmm0
666 ; X86-LABEL: test_mask_sqrt_ps_512:
668 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
669 ; X86-NEXT: vsqrtps %zmm0, %zmm1 {%k1}
670 ; X86-NEXT: vmovaps %zmm1, %zmm0
672 %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
673 %2 = bitcast i16 %mask to <16 x i1>
674 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
678 define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
679 ; X64-LABEL: test_maskz_sqrt_ps_512:
681 ; X64-NEXT: kmovw %edi, %k1
682 ; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
685 ; X86-LABEL: test_maskz_sqrt_ps_512:
687 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
688 ; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
690 %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
691 %2 = bitcast i16 %mask to <16 x i1>
692 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
695 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
697 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
698 ; CHECK-LABEL: test_sqrt_round_ps_512:
700 ; CHECK-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0
701 ; CHECK-NEXT: ret{{[l|q]}}
702 %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
706 define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
707 ; X64-LABEL: test_mask_sqrt_round_ps_512:
709 ; X64-NEXT: kmovw %edi, %k1
710 ; X64-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
711 ; X64-NEXT: vmovaps %zmm1, %zmm0
714 ; X86-LABEL: test_mask_sqrt_round_ps_512:
716 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
717 ; X86-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
718 ; X86-NEXT: vmovaps %zmm1, %zmm0
720 %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
721 %2 = bitcast i16 %mask to <16 x i1>
722 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
726 define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
727 ; X64-LABEL: test_maskz_sqrt_round_ps_512:
729 ; X64-NEXT: kmovw %edi, %k1
730 ; X64-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
733 ; X86-LABEL: test_maskz_sqrt_round_ps_512:
735 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
736 ; X86-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
738 %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
739 %2 = bitcast i16 %mask to <16 x i1>
740 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
743 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone
745 define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
746 ; CHECK-LABEL: test_getexp_pd_512:
748 ; CHECK-NEXT: vgetexppd %zmm0, %zmm0
749 ; CHECK-NEXT: ret{{[l|q]}}
750 %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
751 ret <8 x double> %res
753 define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
754 ; CHECK-LABEL: test_getexp_round_pd_512:
756 ; CHECK-NEXT: vgetexppd {sae}, %zmm0, %zmm0
757 ; CHECK-NEXT: ret{{[l|q]}}
758 %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
759 ret <8 x double> %res
761 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
763 define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
764 ; CHECK-LABEL: test_getexp_ps_512:
766 ; CHECK-NEXT: vgetexpps %zmm0, %zmm0
767 ; CHECK-NEXT: ret{{[l|q]}}
768 %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
769 ret <16 x float> %res
772 define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
773 ; CHECK-LABEL: test_getexp_round_ps_512:
775 ; CHECK-NEXT: vgetexpps {sae}, %zmm0, %zmm0
776 ; CHECK-NEXT: ret{{[l|q]}}
777 %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
778 ret <16 x float> %res
780 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
782 declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
784 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
785 ; X64-LABEL: test_sqrt_ss:
787 ; X64-NEXT: kmovw %edi, %k1
788 ; X64-NEXT: vmovaps %xmm2, %xmm3
789 ; X64-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
790 ; X64-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
791 ; X64-NEXT: vaddps %xmm2, %xmm3, %xmm2
792 ; X64-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
793 ; X64-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
794 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
795 ; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
798 ; X86-LABEL: test_sqrt_ss:
800 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
801 ; X86-NEXT: kmovw %eax, %k1
802 ; X86-NEXT: vmovaps %xmm2, %xmm3
803 ; X86-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
804 ; X86-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
805 ; X86-NEXT: vaddps %xmm2, %xmm3, %xmm2
806 ; X86-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
807 ; X86-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
808 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
809 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
811 %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
812 %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
813 %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10)
814 %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11)
816 %res.1 = fadd <4 x float> %res0, %res1
817 %res.2 = fadd <4 x float> %res2, %res3
818 %res = fadd <4 x float> %res.1, %res.2
822 declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
824 define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
825 ; X64-LABEL: test_sqrt_sd:
827 ; X64-NEXT: kmovw %edi, %k1
828 ; X64-NEXT: vmovapd %xmm2, %xmm3
829 ; X64-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
830 ; X64-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
831 ; X64-NEXT: vaddpd %xmm2, %xmm3, %xmm2
832 ; X64-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
833 ; X64-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
834 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
835 ; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
838 ; X86-LABEL: test_sqrt_sd:
840 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
841 ; X86-NEXT: kmovw %eax, %k1
842 ; X86-NEXT: vmovapd %xmm2, %xmm3
843 ; X86-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
844 ; X86-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
845 ; X86-NEXT: vaddpd %xmm2, %xmm3, %xmm2
846 ; X86-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
847 ; X86-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
848 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
849 ; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
851 %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
852 %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
853 %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10)
854 %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11)
856 %res.1 = fadd <2 x double> %res0, %res1
857 %res.2 = fadd <2 x double> %res2, %res3
858 %res = fadd <2 x double> %res.1, %res.2
859 ret <2 x double> %res
862 define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
863 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
865 ; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx
866 ; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %eax
867 ; CHECK-NEXT: addl %ecx, %eax
868 ; CHECK-NEXT: ret{{[l|q]}}
869 %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
870 %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
871 %res2 = add i32 %res0, %res1
874 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
876 define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
877 ; CHECK-LABEL: test_x86_avx512_cvttsd2si:
879 ; CHECK-NEXT: vcvttsd2si %xmm0, %ecx
880 ; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %eax
881 ; CHECK-NEXT: addl %ecx, %eax
882 ; CHECK-NEXT: ret{{[l|q]}}
883 %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
884 %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
885 %res2 = add i32 %res0, %res1
888 declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
890 define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
891 ; CHECK-LABEL: test_x86_avx512_cvttss2si:
893 ; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %ecx
894 ; CHECK-NEXT: vcvttss2si %xmm0, %eax
895 ; CHECK-NEXT: addl %ecx, %eax
896 ; CHECK-NEXT: ret{{[l|q]}}
897 %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
898 %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
899 %res2 = add i32 %res0, %res1
902 declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
904 define i32 @test_x86_avx512_cvttss2si_load(<4 x float>* %a0) {
905 ; X64-LABEL: test_x86_avx512_cvttss2si_load:
907 ; X64-NEXT: vcvttss2si (%rdi), %eax
910 ; X86-LABEL: test_x86_avx512_cvttss2si_load:
912 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
913 ; X86-NEXT: vcvttss2si (%eax), %eax
915 %a1 = load <4 x float>, <4 x float>* %a0
916 %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ;
920 define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
921 ; CHECK-LABEL: test_x86_avx512_cvttss2usi:
923 ; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %ecx
924 ; CHECK-NEXT: vcvttss2usi %xmm0, %eax
925 ; CHECK-NEXT: addl %ecx, %eax
926 ; CHECK-NEXT: ret{{[l|q]}}
927 %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
928 %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
929 %res2 = add i32 %res0, %res1
932 declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
934 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
935 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
937 ; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
938 ; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
939 ; CHECK-NEXT: addl %eax, %ecx
940 ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
941 ; CHECK-NEXT: addl %ecx, %eax
942 ; CHECK-NEXT: ret{{[l|q]}}
944 %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
945 %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11)
946 %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9)
947 %res3 = add i32 %res, %res1
948 %res4 = add i32 %res3, %res2
951 declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
953 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
954 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
956 ; CHECK-NEXT: vcvtsd2si %xmm0, %eax
957 ; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
958 ; CHECK-NEXT: addl %eax, %ecx
959 ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
960 ; CHECK-NEXT: addl %ecx, %eax
961 ; CHECK-NEXT: ret{{[l|q]}}
963 %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
964 %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11)
965 %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9)
966 %res3 = add i32 %res, %res1
967 %res4 = add i32 %res3, %res2
970 declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
972 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
973 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
975 ; CHECK-NEXT: vcvtss2usi %xmm0, %eax
976 ; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
977 ; CHECK-NEXT: addl %eax, %ecx
978 ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
979 ; CHECK-NEXT: addl %ecx, %eax
980 ; CHECK-NEXT: ret{{[l|q]}}
982 %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
983 %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11)
984 %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9)
985 %res3 = add i32 %res, %res1
986 %res4 = add i32 %res3, %res2
989 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
991 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
992 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
994 ; CHECK-NEXT: vcvtss2si %xmm0, %eax
995 ; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
996 ; CHECK-NEXT: addl %eax, %ecx
997 ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
998 ; CHECK-NEXT: addl %ecx, %eax
999 ; CHECK-NEXT: ret{{[l|q]}}
1001 %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
1002 %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11)
1003 %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9)
1004 %res3 = add i32 %res, %res1
1005 %res4 = add i32 %res3, %res2
1008 declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
1010 define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
1011 ; CHECK-LABEL: test_x86_vcvtph2ps_512:
1013 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
1014 ; CHECK-NEXT: ret{{[l|q]}}
1015 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
1016 ret <16 x float> %res
1019 define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
1020 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
1022 ; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0
1023 ; CHECK-NEXT: ret{{[l|q]}}
1024 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
1025 ret <16 x float> %res
1028 define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
1029 ; X64-LABEL: test_x86_vcvtph2ps_512_rrk:
1031 ; X64-NEXT: kmovw %edi, %k1
1032 ; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
1033 ; X64-NEXT: vmovaps %zmm1, %zmm0
1036 ; X86-LABEL: test_x86_vcvtph2ps_512_rrk:
1038 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1039 ; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
1040 ; X86-NEXT: vmovaps %zmm1, %zmm0
1042 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
1043 ret <16 x float> %res
1046 define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
1047 ; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
1049 ; X64-NEXT: kmovw %edi, %k1
1050 ; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
1053 ; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
1055 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1056 ; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
1058 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
1059 ret <16 x float> %res
1062 define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
1063 ; X64-LABEL: test_x86_vcvtph2ps_512_rrkz:
1065 ; X64-NEXT: kmovw %edi, %k1
1066 ; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1069 ; X86-LABEL: test_x86_vcvtph2ps_512_rrkz:
1071 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1072 ; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1074 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
1075 ret <16 x float> %res
1078 declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
1080 define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
1081 ; X64-LABEL: test_x86_vcvtps2ph_256:
1083 ; X64-NEXT: kmovw %edi, %k1
1084 ; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
1085 ; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
1086 ; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
1087 ; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
1088 ; X64-NEXT: vmovdqa %ymm1, %ymm0
1091 ; X86-LABEL: test_x86_vcvtps2ph_256:
1093 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1094 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1095 ; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
1096 ; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
1097 ; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
1098 ; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax)
1099 ; X86-NEXT: vmovdqa %ymm1, %ymm0
1101 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
1102 %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
1103 %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
1104 store <16 x i16> %res1, <16 x i16> * %dst
1105 %res = add <16 x i16> %res2, %res3
1109 declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
1111 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
1112 ; CHECK-LABEL: test_cmpps:
1114 ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
1115 ; CHECK-NEXT: kmovw %k0, %eax
1116 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
1117 ; CHECK-NEXT: vzeroupper
1118 ; CHECK-NEXT: ret{{[l|q]}}
1119 %res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
1120 %1 = bitcast <16 x i1> %res to i16
1123 declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
1125 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
1126 ; CHECK-LABEL: test_cmppd:
1128 ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
1129 ; CHECK-NEXT: kmovw %k0, %eax
1130 ; CHECK-NEXT: # kill: def $al killed $al killed $eax
1131 ; CHECK-NEXT: vzeroupper
1132 ; CHECK-NEXT: ret{{[l|q]}}
1133 %res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
1134 %1 = bitcast <8 x i1> %res to i8
1137 declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
1139 ; Function Attrs: nounwind readnone
1142 define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
1143 ; CHECK-LABEL: test_vmaxpd:
1145 ; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
1146 ; CHECK-NEXT: ret{{[l|q]}}
1147 %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
1150 declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
1152 define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
1153 ; CHECK-LABEL: test_vminpd:
1155 ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
1156 ; CHECK-NEXT: ret{{[l|q]}}
1157 %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
1160 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
1162 define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
1163 ; X64-LABEL: test_mask_store_ss:
1165 ; X64-NEXT: kmovw %esi, %k1
1166 ; X64-NEXT: vmovss %xmm0, (%rdi) {%k1}
1169 ; X86-LABEL: test_mask_store_ss:
1171 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1172 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
1173 ; X86-NEXT: kmovw %ecx, %k1
1174 ; X86-NEXT: vmovss %xmm0, (%eax) {%k1}
1176 %1 = and i8 %mask, 1
1177 %2 = bitcast i8* %ptr to <4 x float>*
1178 %3 = bitcast i8 %1 to <8 x i1>
1179 %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1180 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %data, <4 x float>* %2, i32 1, <4 x i1> %extract)
1183 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) #1
1186 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
1187 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
1188 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
1190 define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
1191 ; CHECK-LABEL: test_vsubps_rn:
1193 ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
1194 ; CHECK-NEXT: ret{{[l|q]}}
1195 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1199 define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
1200 ; CHECK-LABEL: test_vsubps_rd:
1202 ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
1203 ; CHECK-NEXT: ret{{[l|q]}}
1204 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1208 define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
1209 ; CHECK-LABEL: test_vsubps_ru:
1211 ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
1212 ; CHECK-NEXT: ret{{[l|q]}}
1213 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1217 define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
1218 ; CHECK-LABEL: test_vsubps_rz:
1220 ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
1221 ; CHECK-NEXT: ret{{[l|q]}}
1222 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1226 define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
1227 ; CHECK-LABEL: test_vmulps_rn:
1229 ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
1230 ; CHECK-NEXT: ret{{[l|q]}}
1231 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1235 define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
1236 ; CHECK-LABEL: test_vmulps_rd:
1238 ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
1239 ; CHECK-NEXT: ret{{[l|q]}}
1240 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1244 define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
1245 ; CHECK-LABEL: test_vmulps_ru:
1247 ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
1248 ; CHECK-NEXT: ret{{[l|q]}}
1249 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1253 define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
1254 ; CHECK-LABEL: test_vmulps_rz:
1256 ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
1257 ; CHECK-NEXT: ret{{[l|q]}}
1258 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1263 define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1264 ; X64-LABEL: test_vmulps_mask_rn:
1266 ; X64-NEXT: kmovw %edi, %k1
1267 ; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1270 ; X86-LABEL: test_vmulps_mask_rn:
1272 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1273 ; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1275 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1276 %2 = bitcast i16 %mask to <16 x i1>
1277 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1281 define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1282 ; X64-LABEL: test_vmulps_mask_rd:
1284 ; X64-NEXT: kmovw %edi, %k1
1285 ; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1288 ; X86-LABEL: test_vmulps_mask_rd:
1290 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1291 ; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1293 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1294 %2 = bitcast i16 %mask to <16 x i1>
1295 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1299 define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1300 ; X64-LABEL: test_vmulps_mask_ru:
1302 ; X64-NEXT: kmovw %edi, %k1
1303 ; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1306 ; X86-LABEL: test_vmulps_mask_ru:
1308 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1309 ; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1311 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1312 %2 = bitcast i16 %mask to <16 x i1>
1313 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1317 define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1318 ; X64-LABEL: test_vmulps_mask_rz:
1320 ; X64-NEXT: kmovw %edi, %k1
1321 ; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1324 ; X86-LABEL: test_vmulps_mask_rz:
1326 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1327 ; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1329 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1330 %2 = bitcast i16 %mask to <16 x i1>
1331 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1335 ;; With Passthru value
1336 define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
1337 ; X64-LABEL: test_vmulps_mask_passthru_rn:
1339 ; X64-NEXT: kmovw %edi, %k1
1340 ; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1341 ; X64-NEXT: vmovaps %zmm2, %zmm0
1344 ; X86-LABEL: test_vmulps_mask_passthru_rn:
1346 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1347 ; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1348 ; X86-NEXT: vmovaps %zmm2, %zmm0
1350 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1351 %2 = bitcast i16 %mask to <16 x i1>
1352 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
1356 define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
1357 ; X64-LABEL: test_vmulps_mask_passthru_rd:
1359 ; X64-NEXT: kmovw %edi, %k1
1360 ; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1361 ; X64-NEXT: vmovaps %zmm2, %zmm0
1364 ; X86-LABEL: test_vmulps_mask_passthru_rd:
1366 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1367 ; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1368 ; X86-NEXT: vmovaps %zmm2, %zmm0
1370 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1371 %2 = bitcast i16 %mask to <16 x i1>
1372 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
1376 define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
1377 ; X64-LABEL: test_vmulps_mask_passthru_ru:
1379 ; X64-NEXT: kmovw %edi, %k1
1380 ; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1381 ; X64-NEXT: vmovaps %zmm2, %zmm0
1384 ; X86-LABEL: test_vmulps_mask_passthru_ru:
1386 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1387 ; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1388 ; X86-NEXT: vmovaps %zmm2, %zmm0
1390 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1391 %2 = bitcast i16 %mask to <16 x i1>
1392 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
1396 define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
1397 ; X64-LABEL: test_vmulps_mask_passthru_rz:
1399 ; X64-NEXT: kmovw %edi, %k1
1400 ; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1401 ; X64-NEXT: vmovaps %zmm2, %zmm0
1404 ; X86-LABEL: test_vmulps_mask_passthru_rz:
1406 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1407 ; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1408 ; X86-NEXT: vmovaps %zmm2, %zmm0
1410 %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1411 %2 = bitcast i16 %mask to <16 x i1>
1412 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
1417 define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
1418 ; X64-LABEL: test_vmulpd_mask_rn:
1420 ; X64-NEXT: kmovw %edi, %k1
1421 ; X64-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1424 ; X86-LABEL: test_vmulpd_mask_rn:
1426 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1427 ; X86-NEXT: kmovw %eax, %k1
1428 ; X86-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1430 %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
1431 %2 = bitcast i8 %mask to <8 x i1>
1432 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
1436 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
1437 ; X64-LABEL: test_vmulpd_mask_rd:
1439 ; X64-NEXT: kmovw %edi, %k1
1440 ; X64-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1443 ; X86-LABEL: test_vmulpd_mask_rd:
1445 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1446 ; X86-NEXT: kmovw %eax, %k1
1447 ; X86-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1449 %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
1450 %2 = bitcast i8 %mask to <8 x i1>
1451 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
1455 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
1456 ; X64-LABEL: test_vmulpd_mask_ru:
1458 ; X64-NEXT: kmovw %edi, %k1
1459 ; X64-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1462 ; X86-LABEL: test_vmulpd_mask_ru:
1464 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1465 ; X86-NEXT: kmovw %eax, %k1
1466 ; X86-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1468 %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
1469 %2 = bitcast i8 %mask to <8 x i1>
1470 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
1474 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
1475 ; X64-LABEL: test_vmulpd_mask_rz:
1477 ; X64-NEXT: kmovw %edi, %k1
1478 ; X64-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1481 ; X86-LABEL: test_vmulpd_mask_rz:
1483 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1484 ; X86-NEXT: kmovw %eax, %k1
1485 ; X86-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1487 %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
1488 %2 = bitcast i8 %mask to <8 x i1>
1489 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
1493 define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1494 ; X64-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
1496 ; X64-NEXT: kmovw %edi, %k1
1497 ; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1500 ; X86-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
1502 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1503 ; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1505 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1506 %2 = bitcast i16 %mask to <16 x i1>
1507 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1511 define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1512 ; X64-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
1514 ; X64-NEXT: kmovw %edi, %k1
1515 ; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1518 ; X86-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
1520 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1521 ; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1523 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1524 %2 = bitcast i16 %mask to <16 x i1>
1525 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1529 define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1530 ; X64-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
1532 ; X64-NEXT: kmovw %edi, %k1
1533 ; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1536 ; X86-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
1538 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1539 ; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1541 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1542 %2 = bitcast i16 %mask to <16 x i1>
1543 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1547 define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1548 ; X64-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
1550 ; X64-NEXT: kmovw %edi, %k1
1551 ; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1554 ; X86-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
1556 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1557 ; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1559 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1560 %2 = bitcast i16 %mask to <16 x i1>
1561 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1565 define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1566 ; X64-LABEL: test_mm512_maskz_add_round_ps_current:
1568 ; X64-NEXT: kmovw %edi, %k1
1569 ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
1572 ; X86-LABEL: test_mm512_maskz_add_round_ps_current:
1574 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1575 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
1577 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1578 %2 = bitcast i16 %mask to <16 x i1>
1579 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1583 define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1584 ; X64-LABEL: test_mm512_mask_add_round_ps_rn_sae:
1586 ; X64-NEXT: kmovw %edi, %k1
1587 ; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1588 ; X64-NEXT: vmovaps %zmm2, %zmm0
1591 ; X86-LABEL: test_mm512_mask_add_round_ps_rn_sae:
1593 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1594 ; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1595 ; X86-NEXT: vmovaps %zmm2, %zmm0
1597 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1598 %2 = bitcast i16 %mask to <16 x i1>
1599 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1603 define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1604 ; X64-LABEL: test_mm512_mask_add_round_ps_rd_sae:
1606 ; X64-NEXT: kmovw %edi, %k1
1607 ; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1608 ; X64-NEXT: vmovaps %zmm2, %zmm0
1611 ; X86-LABEL: test_mm512_mask_add_round_ps_rd_sae:
1613 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1614 ; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1615 ; X86-NEXT: vmovaps %zmm2, %zmm0
1617 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1618 %2 = bitcast i16 %mask to <16 x i1>
1619 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1623 define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1624 ; X64-LABEL: test_mm512_mask_add_round_ps_ru_sae:
1626 ; X64-NEXT: kmovw %edi, %k1
1627 ; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1628 ; X64-NEXT: vmovaps %zmm2, %zmm0
1631 ; X86-LABEL: test_mm512_mask_add_round_ps_ru_sae:
1633 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1634 ; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1635 ; X86-NEXT: vmovaps %zmm2, %zmm0
1637 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1638 %2 = bitcast i16 %mask to <16 x i1>
1639 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1643 define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1644 ; X64-LABEL: test_mm512_mask_add_round_ps_rz_sae:
1646 ; X64-NEXT: kmovw %edi, %k1
1647 ; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1648 ; X64-NEXT: vmovaps %zmm2, %zmm0
1651 ; X86-LABEL: test_mm512_mask_add_round_ps_rz_sae:
1653 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1654 ; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1655 ; X86-NEXT: vmovaps %zmm2, %zmm0
1657 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1658 %2 = bitcast i16 %mask to <16 x i1>
1659 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1663 define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1664 ; X64-LABEL: test_mm512_mask_add_round_ps_current:
1666 ; X64-NEXT: kmovw %edi, %k1
1667 ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1}
1668 ; X64-NEXT: vmovaps %zmm2, %zmm0
1671 ; X86-LABEL: test_mm512_mask_add_round_ps_current:
1673 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1674 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1}
1675 ; X86-NEXT: vmovaps %zmm2, %zmm0
1677 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1678 %2 = bitcast i16 %mask to <16 x i1>
1679 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1683 define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1684 ; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
1686 ; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
1687 ; CHECK-NEXT: ret{{[l|q]}}
1688 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1692 define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1693 ; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
1695 ; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
1696 ; CHECK-NEXT: ret{{[l|q]}}
1697 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1701 define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1702 ; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
1704 ; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
1705 ; CHECK-NEXT: ret{{[l|q]}}
1706 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1710 define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1711 ; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
1713 ; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
1714 ; CHECK-NEXT: ret{{[l|q]}}
1715 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1719 define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1720 ; CHECK-LABEL: test_mm512_add_round_ps_current:
1722 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
1723 ; CHECK-NEXT: ret{{[l|q]}}
1724 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1727 declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
1729 define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1730 ; X64-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
1732 ; X64-NEXT: kmovw %edi, %k1
1733 ; X64-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1734 ; X64-NEXT: vmovaps %zmm2, %zmm0
1737 ; X86-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
1739 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1740 ; X86-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1741 ; X86-NEXT: vmovaps %zmm2, %zmm0
1743 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1744 %2 = bitcast i16 %mask to <16 x i1>
1745 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1749 define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1750 ; X64-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
1752 ; X64-NEXT: kmovw %edi, %k1
1753 ; X64-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1754 ; X64-NEXT: vmovaps %zmm2, %zmm0
1757 ; X86-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
1759 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1760 ; X86-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1761 ; X86-NEXT: vmovaps %zmm2, %zmm0
1763 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1764 %2 = bitcast i16 %mask to <16 x i1>
1765 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1769 define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1770 ; X64-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
1772 ; X64-NEXT: kmovw %edi, %k1
1773 ; X64-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1774 ; X64-NEXT: vmovaps %zmm2, %zmm0
1777 ; X86-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
1779 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1780 ; X86-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1781 ; X86-NEXT: vmovaps %zmm2, %zmm0
1783 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1784 %2 = bitcast i16 %mask to <16 x i1>
1785 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1789 define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1790 ; X64-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
1792 ; X64-NEXT: kmovw %edi, %k1
1793 ; X64-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1794 ; X64-NEXT: vmovaps %zmm2, %zmm0
1797 ; X86-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
1799 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1800 ; X86-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1801 ; X86-NEXT: vmovaps %zmm2, %zmm0
1803 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1804 %2 = bitcast i16 %mask to <16 x i1>
1805 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1809 define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1810 ; X64-LABEL: test_mm512_mask_sub_round_ps_current:
1812 ; X64-NEXT: kmovw %edi, %k1
1813 ; X64-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
1814 ; X64-NEXT: vmovaps %zmm2, %zmm0
1817 ; X86-LABEL: test_mm512_mask_sub_round_ps_current:
1819 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1820 ; X86-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
1821 ; X86-NEXT: vmovaps %zmm2, %zmm0
1823 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1824 %2 = bitcast i16 %mask to <16 x i1>
1825 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1829 define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1830 ; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
1832 ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
1833 ; CHECK-NEXT: ret{{[l|q]}}
1834 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1838 define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1839 ; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
1841 ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
1842 ; CHECK-NEXT: ret{{[l|q]}}
1843 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1847 define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1848 ; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
1850 ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
1851 ; CHECK-NEXT: ret{{[l|q]}}
1852 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1856 define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1857 ; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
1859 ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
1860 ; CHECK-NEXT: ret{{[l|q]}}
1861 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1865 define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1866 ; CHECK-LABEL: test_mm512_sub_round_ps_current:
1868 ; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0
1869 ; CHECK-NEXT: ret{{[l|q]}}
1870 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1874 define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1875 ; X64-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
1877 ; X64-NEXT: kmovw %edi, %k1
1878 ; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1881 ; X86-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
1883 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1884 ; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1886 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1887 %2 = bitcast i16 %mask to <16 x i1>
1888 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1892 define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1893 ; X64-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
1895 ; X64-NEXT: kmovw %edi, %k1
1896 ; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1899 ; X86-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
1901 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1902 ; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1904 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1905 %2 = bitcast i16 %mask to <16 x i1>
1906 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1910 define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1911 ; X64-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
1913 ; X64-NEXT: kmovw %edi, %k1
1914 ; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1917 ; X86-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
1919 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1920 ; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1922 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1923 %2 = bitcast i16 %mask to <16 x i1>
1924 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1928 define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1929 ; X64-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
1931 ; X64-NEXT: kmovw %edi, %k1
1932 ; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1935 ; X86-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
1937 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1938 ; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1940 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1941 %2 = bitcast i16 %mask to <16 x i1>
1942 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1946 define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1947 ; X64-LABEL: test_mm512_maskz_div_round_ps_current:
1949 ; X64-NEXT: kmovw %edi, %k1
1950 ; X64-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
1953 ; X86-LABEL: test_mm512_maskz_div_round_ps_current:
1955 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1956 ; X86-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
1958 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1959 %2 = bitcast i16 %mask to <16 x i1>
1960 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1964 define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1965 ; X64-LABEL: test_mm512_mask_div_round_ps_rn_sae:
1967 ; X64-NEXT: kmovw %edi, %k1
1968 ; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1969 ; X64-NEXT: vmovaps %zmm2, %zmm0
1972 ; X86-LABEL: test_mm512_mask_div_round_ps_rn_sae:
1974 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1975 ; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1976 ; X86-NEXT: vmovaps %zmm2, %zmm0
1978 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1979 %2 = bitcast i16 %mask to <16 x i1>
1980 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1984 define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1985 ; X64-LABEL: test_mm512_mask_div_round_ps_rd_sae:
1987 ; X64-NEXT: kmovw %edi, %k1
1988 ; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1989 ; X64-NEXT: vmovaps %zmm2, %zmm0
1992 ; X86-LABEL: test_mm512_mask_div_round_ps_rd_sae:
1994 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1995 ; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1996 ; X86-NEXT: vmovaps %zmm2, %zmm0
1998 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1999 %2 = bitcast i16 %mask to <16 x i1>
2000 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2004 define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2005 ; X64-LABEL: test_mm512_mask_div_round_ps_ru_sae:
2007 ; X64-NEXT: kmovw %edi, %k1
2008 ; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2009 ; X64-NEXT: vmovaps %zmm2, %zmm0
2012 ; X86-LABEL: test_mm512_mask_div_round_ps_ru_sae:
2014 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2015 ; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2016 ; X86-NEXT: vmovaps %zmm2, %zmm0
2018 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
2019 %2 = bitcast i16 %mask to <16 x i1>
2020 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2024 define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2025 ; X64-LABEL: test_mm512_mask_div_round_ps_rz_sae:
2027 ; X64-NEXT: kmovw %edi, %k1
2028 ; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2029 ; X64-NEXT: vmovaps %zmm2, %zmm0
2032 ; X86-LABEL: test_mm512_mask_div_round_ps_rz_sae:
2034 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2035 ; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2036 ; X86-NEXT: vmovaps %zmm2, %zmm0
2038 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
2039 %2 = bitcast i16 %mask to <16 x i1>
2040 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2044 define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2045 ; X64-LABEL: test_mm512_mask_div_round_ps_current:
2047 ; X64-NEXT: kmovw %edi, %k1
2048 ; X64-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1}
2049 ; X64-NEXT: vmovaps %zmm2, %zmm0
2052 ; X86-LABEL: test_mm512_mask_div_round_ps_current:
2054 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2055 ; X86-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1}
2056 ; X86-NEXT: vmovaps %zmm2, %zmm0
2058 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2059 %2 = bitcast i16 %mask to <16 x i1>
2060 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2064 define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2065 ; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
2067 ; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
2068 ; CHECK-NEXT: ret{{[l|q]}}
2069 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2073 define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2074 ; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
2076 ; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
2077 ; CHECK-NEXT: ret{{[l|q]}}
2078 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
2082 define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2083 ; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
2085 ; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
2086 ; CHECK-NEXT: ret{{[l|q]}}
2087 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
2091 define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2092 ; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
2094 ; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
2095 ; CHECK-NEXT: ret{{[l|q]}}
2096 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
2100 define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2101 ; CHECK-LABEL: test_mm512_div_round_ps_current:
2103 ; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0
2104 ; CHECK-NEXT: ret{{[l|q]}}
2105 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2108 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
2110 define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2111 ; X64-LABEL: test_mm512_maskz_min_round_ps_sae:
2113 ; X64-NEXT: kmovw %edi, %k1
2114 ; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2117 ; X86-LABEL: test_mm512_maskz_min_round_ps_sae:
2119 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2120 ; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2122 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2123 %2 = bitcast i16 %mask to <16 x i1>
2124 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2128 define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2129 ; X64-LABEL: test_mm512_maskz_min_round_ps_current:
2131 ; X64-NEXT: kmovw %edi, %k1
2132 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
2135 ; X86-LABEL: test_mm512_maskz_min_round_ps_current:
2137 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2138 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
2140 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2141 %2 = bitcast i16 %mask to <16 x i1>
2142 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2146 define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2147 ; X64-LABEL: test_mm512_mask_min_round_ps_sae:
2149 ; X64-NEXT: kmovw %edi, %k1
2150 ; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2151 ; X64-NEXT: vmovaps %zmm2, %zmm0
2154 ; X86-LABEL: test_mm512_mask_min_round_ps_sae:
2156 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2157 ; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2158 ; X86-NEXT: vmovaps %zmm2, %zmm0
2160 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2161 %2 = bitcast i16 %mask to <16 x i1>
2162 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2166 define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2167 ; X64-LABEL: test_mm512_mask_min_round_ps_current:
2169 ; X64-NEXT: kmovw %edi, %k1
2170 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1}
2171 ; X64-NEXT: vmovaps %zmm2, %zmm0
2174 ; X86-LABEL: test_mm512_mask_min_round_ps_current:
2176 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2177 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1}
2178 ; X86-NEXT: vmovaps %zmm2, %zmm0
2180 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2181 %2 = bitcast i16 %mask to <16 x i1>
2182 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2186 define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2187 ; CHECK-LABEL: test_mm512_min_round_ps_sae:
2189 ; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0
2190 ; CHECK-NEXT: ret{{[l|q]}}
2191 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2195 define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2196 ; CHECK-LABEL: test_mm512_min_round_ps_current:
2198 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
2199 ; CHECK-NEXT: ret{{[l|q]}}
2200 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2203 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
2205 define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2206 ; X64-LABEL: test_mm512_maskz_max_round_ps_sae:
2208 ; X64-NEXT: kmovw %edi, %k1
2209 ; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2212 ; X86-LABEL: test_mm512_maskz_max_round_ps_sae:
2214 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2215 ; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2217 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2218 %2 = bitcast i16 %mask to <16 x i1>
2219 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2223 define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2224 ; X64-LABEL: test_mm512_maskz_max_round_ps_current:
2226 ; X64-NEXT: kmovw %edi, %k1
2227 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
2230 ; X86-LABEL: test_mm512_maskz_max_round_ps_current:
2232 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2233 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
2235 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2236 %2 = bitcast i16 %mask to <16 x i1>
2237 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2241 define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2242 ; X64-LABEL: test_mm512_mask_max_round_ps_sae:
2244 ; X64-NEXT: kmovw %edi, %k1
2245 ; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2246 ; X64-NEXT: vmovaps %zmm2, %zmm0
2249 ; X86-LABEL: test_mm512_mask_max_round_ps_sae:
2251 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2252 ; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2253 ; X86-NEXT: vmovaps %zmm2, %zmm0
2255 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2256 %2 = bitcast i16 %mask to <16 x i1>
2257 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2261 define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2262 ; X64-LABEL: test_mm512_mask_max_round_ps_current:
2264 ; X64-NEXT: kmovw %edi, %k1
2265 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
2266 ; X64-NEXT: vmovaps %zmm2, %zmm0
2269 ; X86-LABEL: test_mm512_mask_max_round_ps_current:
2271 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2272 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
2273 ; X86-NEXT: vmovaps %zmm2, %zmm0
2275 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2276 %2 = bitcast i16 %mask to <16 x i1>
2277 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2281 define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2282 ; CHECK-LABEL: test_mm512_max_round_ps_sae:
2284 ; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0
2285 ; CHECK-NEXT: ret{{[l|q]}}
2286 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2290 define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2291 ; CHECK-LABEL: test_mm512_max_round_ps_current:
2293 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
2294 ; CHECK-NEXT: ret{{[l|q]}}
2295 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2298 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
2300 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
2302 define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2303 ; X64-LABEL: test_mask_add_ss_rn:
2305 ; X64-NEXT: kmovw %edi, %k1
2306 ; X64-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2307 ; X64-NEXT: vmovaps %xmm2, %xmm0
2310 ; X86-LABEL: test_mask_add_ss_rn:
2312 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2313 ; X86-NEXT: kmovw %eax, %k1
2314 ; X86-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2315 ; X86-NEXT: vmovaps %xmm2, %xmm0
2317 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
2318 ret <4 x float> %res
2321 define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2322 ; X64-LABEL: test_mask_add_ss_rd:
2324 ; X64-NEXT: kmovw %edi, %k1
2325 ; X64-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2326 ; X64-NEXT: vmovaps %xmm2, %xmm0
2329 ; X86-LABEL: test_mask_add_ss_rd:
2331 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2332 ; X86-NEXT: kmovw %eax, %k1
2333 ; X86-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2334 ; X86-NEXT: vmovaps %xmm2, %xmm0
2336 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
2337 ret <4 x float> %res
2340 define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2341 ; X64-LABEL: test_mask_add_ss_ru:
2343 ; X64-NEXT: kmovw %edi, %k1
2344 ; X64-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2345 ; X64-NEXT: vmovaps %xmm2, %xmm0
2348 ; X86-LABEL: test_mask_add_ss_ru:
2350 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2351 ; X86-NEXT: kmovw %eax, %k1
2352 ; X86-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2353 ; X86-NEXT: vmovaps %xmm2, %xmm0
2355 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
2356 ret <4 x float> %res
2359 define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2360 ; X64-LABEL: test_mask_add_ss_rz:
2362 ; X64-NEXT: kmovw %edi, %k1
2363 ; X64-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2364 ; X64-NEXT: vmovaps %xmm2, %xmm0
2367 ; X86-LABEL: test_mask_add_ss_rz:
2369 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2370 ; X86-NEXT: kmovw %eax, %k1
2371 ; X86-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2372 ; X86-NEXT: vmovaps %xmm2, %xmm0
2374 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
2375 ret <4 x float> %res
2378 define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2379 ; X64-LABEL: test_mask_add_ss_current:
2381 ; X64-NEXT: kmovw %edi, %k1
2382 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
2383 ; X64-NEXT: vmovaps %xmm2, %xmm0
2386 ; X86-LABEL: test_mask_add_ss_current:
2388 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2389 ; X86-NEXT: kmovw %eax, %k1
2390 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
2391 ; X86-NEXT: vmovaps %xmm2, %xmm0
2393 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
2394 ret <4 x float> %res
2397 define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2398 ; X64-LABEL: test_maskz_add_ss_rn:
2400 ; X64-NEXT: kmovw %edi, %k1
2401 ; X64-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2404 ; X86-LABEL: test_maskz_add_ss_rn:
2406 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2407 ; X86-NEXT: kmovw %eax, %k1
2408 ; X86-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2410 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
2411 ret <4 x float> %res
2414 define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
2415 ; CHECK-LABEL: test_add_ss_rn:
2417 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
2418 ; CHECK-NEXT: ret{{[l|q]}}
2419 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
2420 ret <4 x float> %res
2423 define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
2424 ; X64-LABEL: test_mask_add_ss_current_memfold:
2426 ; X64-NEXT: kmovw %esi, %k1
2427 ; X64-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1}
2428 ; X64-NEXT: vmovaps %xmm1, %xmm0
2431 ; X86-LABEL: test_mask_add_ss_current_memfold:
2433 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2434 ; X86-NEXT: kmovw %eax, %k1
2435 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2436 ; X86-NEXT: vaddss (%eax), %xmm0, %xmm1 {%k1}
2437 ; X86-NEXT: vmovaps %xmm1, %xmm0
2439 %a1.val = load float, float* %a1
2440 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2441 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2442 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2443 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2444 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
2445 ret <4 x float> %res
2448 define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
2449 ; X64-LABEL: test_maskz_add_ss_current_memfold:
2451 ; X64-NEXT: kmovw %esi, %k1
2452 ; X64-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
2455 ; X86-LABEL: test_maskz_add_ss_current_memfold:
2457 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2458 ; X86-NEXT: kmovw %eax, %k1
2459 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2460 ; X86-NEXT: vaddss (%eax), %xmm0, %xmm0 {%k1} {z}
2462 %a1.val = load float, float* %a1
2463 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2464 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2465 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2466 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2467 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
2468 ret <4 x float> %res
2471 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
2473 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2474 ; X64-LABEL: test_mask_add_sd_rn:
2476 ; X64-NEXT: kmovw %edi, %k1
2477 ; X64-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2478 ; X64-NEXT: vmovapd %xmm2, %xmm0
2481 ; X86-LABEL: test_mask_add_sd_rn:
2483 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2484 ; X86-NEXT: kmovw %eax, %k1
2485 ; X86-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2486 ; X86-NEXT: vmovapd %xmm2, %xmm0
2488 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
2489 ret <2 x double> %res
2492 define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2493 ; X64-LABEL: test_mask_add_sd_rd:
2495 ; X64-NEXT: kmovw %edi, %k1
2496 ; X64-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2497 ; X64-NEXT: vmovapd %xmm2, %xmm0
2500 ; X86-LABEL: test_mask_add_sd_rd:
2502 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2503 ; X86-NEXT: kmovw %eax, %k1
2504 ; X86-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2505 ; X86-NEXT: vmovapd %xmm2, %xmm0
2507 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
2508 ret <2 x double> %res
2511 define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2512 ; X64-LABEL: test_mask_add_sd_ru:
2514 ; X64-NEXT: kmovw %edi, %k1
2515 ; X64-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2516 ; X64-NEXT: vmovapd %xmm2, %xmm0
2519 ; X86-LABEL: test_mask_add_sd_ru:
2521 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2522 ; X86-NEXT: kmovw %eax, %k1
2523 ; X86-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2524 ; X86-NEXT: vmovapd %xmm2, %xmm0
2526 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
2527 ret <2 x double> %res
2530 define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2531 ; X64-LABEL: test_mask_add_sd_rz:
2533 ; X64-NEXT: kmovw %edi, %k1
2534 ; X64-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2535 ; X64-NEXT: vmovapd %xmm2, %xmm0
2538 ; X86-LABEL: test_mask_add_sd_rz:
2540 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2541 ; X86-NEXT: kmovw %eax, %k1
2542 ; X86-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2543 ; X86-NEXT: vmovapd %xmm2, %xmm0
2545 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
2546 ret <2 x double> %res
2549 define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2550 ; X64-LABEL: test_mask_add_sd_current:
2552 ; X64-NEXT: kmovw %edi, %k1
2553 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
2554 ; X64-NEXT: vmovapd %xmm2, %xmm0
2557 ; X86-LABEL: test_mask_add_sd_current:
2559 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2560 ; X86-NEXT: kmovw %eax, %k1
2561 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
2562 ; X86-NEXT: vmovapd %xmm2, %xmm0
2564 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
2565 ret <2 x double> %res
2568 define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2569 ; X64-LABEL: test_maskz_add_sd_rn:
2571 ; X64-NEXT: kmovw %edi, %k1
2572 ; X64-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2575 ; X86-LABEL: test_maskz_add_sd_rn:
2577 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2578 ; X86-NEXT: kmovw %eax, %k1
2579 ; X86-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2581 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
2582 ret <2 x double> %res
2585 define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
2586 ; CHECK-LABEL: test_add_sd_rn:
2588 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
2589 ; CHECK-NEXT: ret{{[l|q]}}
2590 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
2591 ret <2 x double> %res
2594 define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
2595 ; X64-LABEL: test_mask_add_sd_current_memfold:
2597 ; X64-NEXT: kmovw %esi, %k1
2598 ; X64-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1}
2599 ; X64-NEXT: vmovapd %xmm1, %xmm0
2602 ; X86-LABEL: test_mask_add_sd_current_memfold:
2604 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2605 ; X86-NEXT: kmovw %eax, %k1
2606 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2607 ; X86-NEXT: vaddsd (%eax), %xmm0, %xmm1 {%k1}
2608 ; X86-NEXT: vmovapd %xmm1, %xmm0
2610 %a1.val = load double, double* %a1
2611 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2612 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2613 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
2614 ret <2 x double> %res
2617 define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
2618 ; X64-LABEL: test_maskz_add_sd_current_memfold:
2620 ; X64-NEXT: kmovw %esi, %k1
2621 ; X64-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
2624 ; X86-LABEL: test_maskz_add_sd_current_memfold:
2626 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2627 ; X86-NEXT: kmovw %eax, %k1
2628 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2629 ; X86-NEXT: vaddsd (%eax), %xmm0, %xmm0 {%k1} {z}
2631 %a1.val = load double, double* %a1
2632 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2633 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2634 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
2635 ret <2 x double> %res
2638 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
2640 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2641 ; X64-LABEL: test_mask_max_ss_sae:
2643 ; X64-NEXT: kmovw %edi, %k1
2644 ; X64-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2645 ; X64-NEXT: vmovaps %xmm2, %xmm0
2648 ; X86-LABEL: test_mask_max_ss_sae:
2650 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2651 ; X86-NEXT: kmovw %eax, %k1
2652 ; X86-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2653 ; X86-NEXT: vmovaps %xmm2, %xmm0
2655 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
2656 ret <4 x float> %res
2659 define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2660 ; X64-LABEL: test_maskz_max_ss_sae:
2662 ; X64-NEXT: kmovw %edi, %k1
2663 ; X64-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2666 ; X86-LABEL: test_maskz_max_ss_sae:
2668 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2669 ; X86-NEXT: kmovw %eax, %k1
2670 ; X86-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2672 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
2673 ret <4 x float> %res
2676 define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
2677 ; CHECK-LABEL: test_max_ss_sae:
2679 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0
2680 ; CHECK-NEXT: ret{{[l|q]}}
2681 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
2682 ret <4 x float> %res
2685 define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2686 ; X64-LABEL: test_mask_max_ss:
2688 ; X64-NEXT: kmovw %edi, %k1
2689 ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
2690 ; X64-NEXT: vmovaps %xmm2, %xmm0
2693 ; X86-LABEL: test_mask_max_ss:
2695 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2696 ; X86-NEXT: kmovw %eax, %k1
2697 ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
2698 ; X86-NEXT: vmovaps %xmm2, %xmm0
2700 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
2701 ret <4 x float> %res
2704 define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2705 ; X64-LABEL: test_maskz_max_ss:
2707 ; X64-NEXT: kmovw %edi, %k1
2708 ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
2711 ; X86-LABEL: test_maskz_max_ss:
2713 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2714 ; X86-NEXT: kmovw %eax, %k1
2715 ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
2717 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
2718 ret <4 x float> %res
2721 define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
2722 ; CHECK-LABEL: test_max_ss:
2724 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0
2725 ; CHECK-NEXT: ret{{[l|q]}}
2726 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
2727 ret <4 x float> %res
2730 define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
2731 ; X64-LABEL: test_mask_max_ss_memfold:
2733 ; X64-NEXT: kmovw %esi, %k1
2734 ; X64-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1}
2735 ; X64-NEXT: vmovaps %xmm1, %xmm0
2738 ; X86-LABEL: test_mask_max_ss_memfold:
2740 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2741 ; X86-NEXT: kmovw %eax, %k1
2742 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2743 ; X86-NEXT: vmaxss (%eax), %xmm0, %xmm1 {%k1}
2744 ; X86-NEXT: vmovaps %xmm1, %xmm0
2746 %a1.val = load float, float* %a1
2747 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2748 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2749 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2750 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2751 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
2752 ret <4 x float> %res
2755 define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
2756 ; X64-LABEL: test_maskz_max_ss_memfold:
2758 ; X64-NEXT: kmovw %esi, %k1
2759 ; X64-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
2762 ; X86-LABEL: test_maskz_max_ss_memfold:
2764 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2765 ; X86-NEXT: kmovw %eax, %k1
2766 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2767 ; X86-NEXT: vmaxss (%eax), %xmm0, %xmm0 {%k1} {z}
2769 %a1.val = load float, float* %a1
2770 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2771 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2772 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2773 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2774 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
2775 ret <4 x float> %res
2777 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
2779 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2780 ; X64-LABEL: test_mask_max_sd_sae:
2782 ; X64-NEXT: kmovw %edi, %k1
2783 ; X64-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2784 ; X64-NEXT: vmovapd %xmm2, %xmm0
2787 ; X86-LABEL: test_mask_max_sd_sae:
2789 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2790 ; X86-NEXT: kmovw %eax, %k1
2791 ; X86-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2792 ; X86-NEXT: vmovapd %xmm2, %xmm0
2794 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
2795 ret <2 x double> %res
2798 define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2799 ; X64-LABEL: test_maskz_max_sd_sae:
2801 ; X64-NEXT: kmovw %edi, %k1
2802 ; X64-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2805 ; X86-LABEL: test_maskz_max_sd_sae:
2807 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2808 ; X86-NEXT: kmovw %eax, %k1
2809 ; X86-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2811 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
2812 ret <2 x double> %res
2815 define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
2816 ; CHECK-LABEL: test_max_sd_sae:
2818 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0
2819 ; CHECK-NEXT: ret{{[l|q]}}
2820 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
2821 ret <2 x double> %res
2824 define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2825 ; X64-LABEL: test_mask_max_sd:
2827 ; X64-NEXT: kmovw %edi, %k1
2828 ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
2829 ; X64-NEXT: vmovapd %xmm2, %xmm0
2832 ; X86-LABEL: test_mask_max_sd:
2834 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2835 ; X86-NEXT: kmovw %eax, %k1
2836 ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
2837 ; X86-NEXT: vmovapd %xmm2, %xmm0
2839 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
2840 ret <2 x double> %res
2843 define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2844 ; X64-LABEL: test_maskz_max_sd:
2846 ; X64-NEXT: kmovw %edi, %k1
2847 ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2850 ; X86-LABEL: test_maskz_max_sd:
2852 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2853 ; X86-NEXT: kmovw %eax, %k1
2854 ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2856 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
2857 ret <2 x double> %res
2860 define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
2861 ; CHECK-LABEL: test_max_sd:
2863 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
2864 ; CHECK-NEXT: ret{{[l|q]}}
2865 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
2866 ret <2 x double> %res
2869 define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
2870 ; X64-LABEL: test_mask_max_sd_memfold:
2872 ; X64-NEXT: kmovw %esi, %k1
2873 ; X64-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
2874 ; X64-NEXT: vmovapd %xmm1, %xmm0
2877 ; X86-LABEL: test_mask_max_sd_memfold:
2879 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2880 ; X86-NEXT: kmovw %eax, %k1
2881 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2882 ; X86-NEXT: vmaxsd (%eax), %xmm0, %xmm1 {%k1}
2883 ; X86-NEXT: vmovapd %xmm1, %xmm0
2885 %a1.val = load double, double* %a1
2886 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2887 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2888 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
2889 ret <2 x double> %res
2892 define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
2893 ; X64-LABEL: test_maskz_max_sd_memfold:
2895 ; X64-NEXT: kmovw %esi, %k1
2896 ; X64-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
2899 ; X86-LABEL: test_maskz_max_sd_memfold:
2901 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2902 ; X86-NEXT: kmovw %eax, %k1
2903 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2904 ; X86-NEXT: vmaxsd (%eax), %xmm0, %xmm0 {%k1} {z}
2906 %a1.val = load double, double* %a1
2907 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2908 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2909 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
2910 ret <2 x double> %res
2913 define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
2914 ; X64-LABEL: test_x86_avx512_cvtsi2ss32:
2916 ; X64-NEXT: vcvtsi2ss %edi, {rz-sae}, %xmm0, %xmm0
2919 ; X86-LABEL: test_x86_avx512_cvtsi2ss32:
2921 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2922 ; X86-NEXT: vcvtsi2ss %eax, {rz-sae}, %xmm0, %xmm0
2924 %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<<4 x float>> [#uses=1]
2925 ret <4 x float> %res
2927 declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
2929 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) {
2930 ; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
2932 ; X64-NEXT: vcvtusi2ss %edi, {rd-sae}, %xmm0, %xmm0
2935 ; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
2937 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2938 ; X86-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2940 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
2941 ret <4 x float> %res
2944 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr) {
2945 ; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
2947 ; X64-NEXT: movl (%rdi), %eax
2948 ; X64-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2951 ; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
2953 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2954 ; X86-NEXT: movl (%eax), %eax
2955 ; X86-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2957 %b = load i32, i32* %ptr
2958 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
2959 ret <4 x float> %res
2962 define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) {
2963 ; X64-LABEL: test_x86_avx512__mm_cvtu32_ss:
2965 ; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
2968 ; X86-LABEL: test_x86_avx512__mm_cvtu32_ss:
2970 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
2972 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
2973 ret <4 x float> %res
2976 define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr) {
2977 ; X64-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
2979 ; X64-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0
2982 ; X86-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
2984 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2985 ; X86-NEXT: vcvtusi2ssl (%eax), %xmm0, %xmm0
2987 %b = load i32, i32* %ptr
2988 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
2989 ret <4 x float> %res
2991 declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
2993 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2995 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
2996 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
2998 ; X64-NEXT: kmovw %esi, %k1
2999 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3000 ; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
3001 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
3002 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0
3005 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
3007 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3008 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3009 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3010 ; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1}
3011 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
3012 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0
3014 %x2 = load <16 x i32>, <16 x i32>* %x2p
3015 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
3016 %2 = bitcast i16 %x3 to <16 x i1>
3017 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
3018 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
3019 %res2 = add <16 x i32> %3, %4
3020 ret <16 x i32> %res2
3023 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
3025 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
3026 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
3028 ; X64-NEXT: vmovapd %zmm0, %zmm3
3029 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
3030 ; X64-NEXT: kmovw %edi, %k1
3031 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
3032 ; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0
3035 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
3037 ; X86-NEXT: vmovapd %zmm0, %zmm3
3038 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
3039 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3040 ; X86-NEXT: kmovw %eax, %k1
3041 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
3042 ; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0
3044 %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
3045 %2 = bitcast <8 x i64> %x1 to <8 x double>
3046 %3 = bitcast i8 %x3 to <8 x i1>
3047 %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
3048 %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
3049 %6 = bitcast <8 x i64> %x1 to <8 x double>
3050 %res2 = fadd <8 x double> %4, %5
3051 ret <8 x double> %res2
3054 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
3056 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
3057 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
3059 ; X64-NEXT: vmovaps %zmm0, %zmm3
3060 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
3061 ; X64-NEXT: kmovw %edi, %k1
3062 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
3063 ; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0
3066 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
3068 ; X86-NEXT: vmovaps %zmm0, %zmm3
3069 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
3070 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3071 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
3072 ; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0
3074 %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
3075 %2 = bitcast <16 x i32> %x1 to <16 x float>
3076 %3 = bitcast i16 %x3 to <16 x i1>
3077 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
3078 %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
3079 %6 = bitcast <16 x i32> %x1 to <16 x float>
3080 %res2 = fadd <16 x float> %4, %5
3081 ret <16 x float> %res2
3084 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
3086 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
3087 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3089 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
3090 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
3091 ; X64-NEXT: kmovw %edi, %k1
3092 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3093 ; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3096 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3098 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
3099 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
3100 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3101 ; X86-NEXT: kmovw %eax, %k1
3102 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3103 ; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3105 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3106 %2 = bitcast i8 %x3 to <8 x i1>
3107 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
3108 %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3109 %res2 = add <8 x i64> %3, %4
3113 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
3114 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3116 ; X64-NEXT: kmovw %esi, %k1
3117 ; X64-NEXT: vmovdqa64 %zmm1, %zmm2
3118 ; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
3119 ; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
3120 ; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0
3123 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3126 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3127 ; X86-NEXT: vmovdqa64 %zmm1, %zmm2
3128 ; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z}
3129 ; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
3130 ; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0
3132 %x2 = load <16 x i32>, <16 x i32>* %x2p
3133 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3134 %2 = bitcast i16 %x3 to <16 x i1>
3135 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
3136 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1)
3137 %res2 = add <16 x i32> %3, %4
3138 ret <16 x i32> %res2
3141 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
3142 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3144 ; X64-NEXT: kmovw %esi, %k1
3145 ; X64-NEXT: vmovapd %zmm1, %zmm2
3146 ; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
3147 ; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
3148 ; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0
3151 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3153 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3154 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
3155 ; X86-NEXT: kmovw %ecx, %k1
3156 ; X86-NEXT: vmovapd %zmm1, %zmm2
3157 ; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z}
3158 ; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
3159 ; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0
3161 %x2s = load double, double* %x2ptr
3162 %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
3163 %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
3164 %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
3165 %2 = bitcast i8 %x3 to <8 x i1>
3166 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
3167 %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1)
3168 %res2 = fadd <8 x double> %3, %4
3169 ret <8 x double> %res2
3172 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
3173 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
3175 ; X64-NEXT: vmovaps %zmm1, %zmm3
3176 ; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
3177 ; X64-NEXT: kmovw %edi, %k1
3178 ; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
3179 ; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0
3182 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
3184 ; X86-NEXT: vmovaps %zmm1, %zmm3
3185 ; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
3186 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3187 ; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
3188 ; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0
3190 %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
3191 %2 = bitcast i16 %x3 to <16 x i1>
3192 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
3193 %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
3194 %res2 = fadd <16 x float> %3, %4
3195 ret <16 x float> %res2
3198 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
3199 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
3201 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3202 ; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
3203 ; X64-NEXT: kmovw %edi, %k1
3204 ; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
3205 ; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3208 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
3210 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3211 ; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
3212 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3213 ; X86-NEXT: kmovw %eax, %k1
3214 ; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
3215 ; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3217 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
3218 %2 = bitcast i8 %x3 to <8 x i1>
3219 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
3220 %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
3221 %res2 = add <8 x i64> %3, %4
3225 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
3226 ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
3228 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3229 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
3230 ; X64-NEXT: kmovw %edi, %k1
3231 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
3232 ; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0
3235 ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
3237 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3238 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
3239 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3240 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
3241 ; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0
3243 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3244 %2 = bitcast i16 %x3 to <16 x i1>
3245 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
3246 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3247 %res2 = add <16 x i32> %3, %4
3248 ret <16 x i32> %res2
3251 declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
3252 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
3253 ; X64-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
3255 ; X64-NEXT: kmovw %edi, %k1
3256 ; X64-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3257 ; X64-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
3258 ; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0
3261 ; X86-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
3263 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3264 ; X86-NEXT: kmovw %eax, %k1
3265 ; X86-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3266 ; X86-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
3267 ; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0
3269 %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11)
3270 %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8)
3271 %res2 = fadd <8 x double> %res, %res1
3272 ret <8 x double> %res2
3275 declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3276 define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
3277 ; X64-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
3279 ; X64-NEXT: kmovw %edi, %k1
3280 ; X64-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3281 ; X64-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
3282 ; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0
3285 ; X86-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
3287 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3288 ; X86-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3289 ; X86-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
3290 ; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0
3292 %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10)
3293 %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8)
3294 %res2 = fadd <16 x float> %res, %res1
3295 ret <16 x float> %res2
3298 declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
3300 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3301 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
3303 ; X64-NEXT: kmovw %edi, %k1
3304 ; X64-NEXT: vpmovqb %zmm0, %xmm2
3305 ; X64-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
3306 ; X64-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
3307 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3308 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3309 ; X64-NEXT: vzeroupper
3312 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
3314 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3315 ; X86-NEXT: kmovw %eax, %k1
3316 ; X86-NEXT: vpmovqb %zmm0, %xmm2
3317 ; X86-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
3318 ; X86-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
3319 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3320 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3321 ; X86-NEXT: vzeroupper
3323 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3324 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3325 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3326 %res3 = add <16 x i8> %res0, %res1
3327 %res4 = add <16 x i8> %res3, %res2
3331 declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3333 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3334 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
3336 ; X64-NEXT: kmovw %esi, %k1
3337 ; X64-NEXT: vpmovqb %zmm0, (%rdi)
3338 ; X64-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
3339 ; X64-NEXT: vzeroupper
3342 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
3344 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3345 ; X86-NEXT: kmovw %eax, %k1
3346 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3347 ; X86-NEXT: vpmovqb %zmm0, (%eax)
3348 ; X86-NEXT: vpmovqb %zmm0, (%eax) {%k1}
3349 ; X86-NEXT: vzeroupper
3351 call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3352 call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3356 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
3358 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3359 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
3361 ; X64-NEXT: kmovw %edi, %k1
3362 ; X64-NEXT: vpmovsqb %zmm0, %xmm2
3363 ; X64-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
3364 ; X64-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
3365 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3366 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3367 ; X64-NEXT: vzeroupper
3370 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
3372 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3373 ; X86-NEXT: kmovw %eax, %k1
3374 ; X86-NEXT: vpmovsqb %zmm0, %xmm2
3375 ; X86-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
3376 ; X86-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
3377 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3378 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3379 ; X86-NEXT: vzeroupper
3381 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3382 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3383 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3384 %res3 = add <16 x i8> %res0, %res1
3385 %res4 = add <16 x i8> %res3, %res2
3389 declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3391 define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3392 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
3394 ; X64-NEXT: kmovw %esi, %k1
3395 ; X64-NEXT: vpmovsqb %zmm0, (%rdi)
3396 ; X64-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
3397 ; X64-NEXT: vzeroupper
3400 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
3402 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3403 ; X86-NEXT: kmovw %eax, %k1
3404 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3405 ; X86-NEXT: vpmovsqb %zmm0, (%eax)
3406 ; X86-NEXT: vpmovsqb %zmm0, (%eax) {%k1}
3407 ; X86-NEXT: vzeroupper
3409 call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3410 call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3414 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
3416 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3417 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
3419 ; X64-NEXT: kmovw %edi, %k1
3420 ; X64-NEXT: vpmovusqb %zmm0, %xmm2
3421 ; X64-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
3422 ; X64-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
3423 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3424 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3425 ; X64-NEXT: vzeroupper
3428 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
3430 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3431 ; X86-NEXT: kmovw %eax, %k1
3432 ; X86-NEXT: vpmovusqb %zmm0, %xmm2
3433 ; X86-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
3434 ; X86-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
3435 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3436 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3437 ; X86-NEXT: vzeroupper
3439 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3440 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3441 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3442 %res3 = add <16 x i8> %res0, %res1
3443 %res4 = add <16 x i8> %res3, %res2
3447 declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3449 define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3450 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
3452 ; X64-NEXT: kmovw %esi, %k1
3453 ; X64-NEXT: vpmovusqb %zmm0, (%rdi)
3454 ; X64-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
3455 ; X64-NEXT: vzeroupper
3458 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
3460 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3461 ; X86-NEXT: kmovw %eax, %k1
3462 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3463 ; X86-NEXT: vpmovusqb %zmm0, (%eax)
3464 ; X86-NEXT: vpmovusqb %zmm0, (%eax) {%k1}
3465 ; X86-NEXT: vzeroupper
3467 call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3468 call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3472 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
3474 define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3475 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
3477 ; X64-NEXT: kmovw %edi, %k1
3478 ; X64-NEXT: vpmovqw %zmm0, %xmm2
3479 ; X64-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
3480 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
3481 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3482 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3483 ; X64-NEXT: vzeroupper
3486 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
3488 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3489 ; X86-NEXT: kmovw %eax, %k1
3490 ; X86-NEXT: vpmovqw %zmm0, %xmm2
3491 ; X86-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
3492 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
3493 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3494 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3495 ; X86-NEXT: vzeroupper
3497 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3498 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3499 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3500 %res3 = add <8 x i16> %res0, %res1
3501 %res4 = add <8 x i16> %res3, %res2
3505 declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3507 define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3508 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
3510 ; X64-NEXT: kmovw %esi, %k1
3511 ; X64-NEXT: vpmovqw %zmm0, (%rdi)
3512 ; X64-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
3513 ; X64-NEXT: vzeroupper
3516 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
3518 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3519 ; X86-NEXT: kmovw %eax, %k1
3520 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3521 ; X86-NEXT: vpmovqw %zmm0, (%eax)
3522 ; X86-NEXT: vpmovqw %zmm0, (%eax) {%k1}
3523 ; X86-NEXT: vzeroupper
3525 call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3526 call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3530 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
3532 define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3533 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
3535 ; X64-NEXT: kmovw %edi, %k1
3536 ; X64-NEXT: vpmovsqw %zmm0, %xmm2
3537 ; X64-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
3538 ; X64-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
3539 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3540 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3541 ; X64-NEXT: vzeroupper
3544 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
3546 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3547 ; X86-NEXT: kmovw %eax, %k1
3548 ; X86-NEXT: vpmovsqw %zmm0, %xmm2
3549 ; X86-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
3550 ; X86-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
3551 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3552 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3553 ; X86-NEXT: vzeroupper
3555 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3556 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3557 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3558 %res3 = add <8 x i16> %res0, %res1
3559 %res4 = add <8 x i16> %res3, %res2
3563 declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3565 define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3566 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
3568 ; X64-NEXT: kmovw %esi, %k1
3569 ; X64-NEXT: vpmovsqw %zmm0, (%rdi)
3570 ; X64-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
3571 ; X64-NEXT: vzeroupper
3574 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
3576 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3577 ; X86-NEXT: kmovw %eax, %k1
3578 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3579 ; X86-NEXT: vpmovsqw %zmm0, (%eax)
3580 ; X86-NEXT: vpmovsqw %zmm0, (%eax) {%k1}
3581 ; X86-NEXT: vzeroupper
3583 call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3584 call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3588 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
3590 define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3591 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
3593 ; X64-NEXT: kmovw %edi, %k1
3594 ; X64-NEXT: vpmovusqw %zmm0, %xmm2
3595 ; X64-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
3596 ; X64-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
3597 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3598 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3599 ; X64-NEXT: vzeroupper
3602 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
3604 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3605 ; X86-NEXT: kmovw %eax, %k1
3606 ; X86-NEXT: vpmovusqw %zmm0, %xmm2
3607 ; X86-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
3608 ; X86-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
3609 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3610 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3611 ; X86-NEXT: vzeroupper
3613 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3614 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3615 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3616 %res3 = add <8 x i16> %res0, %res1
3617 %res4 = add <8 x i16> %res3, %res2
3621 declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3623 define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3624 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
3626 ; X64-NEXT: kmovw %esi, %k1
3627 ; X64-NEXT: vpmovusqw %zmm0, (%rdi)
3628 ; X64-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
3629 ; X64-NEXT: vzeroupper
3632 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
3634 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3635 ; X86-NEXT: kmovw %eax, %k1
3636 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3637 ; X86-NEXT: vpmovusqw %zmm0, (%eax)
3638 ; X86-NEXT: vpmovusqw %zmm0, (%eax) {%k1}
3639 ; X86-NEXT: vzeroupper
3641 call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3642 call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3646 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3647 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
3649 ; X64-NEXT: vpmovqd %zmm0, %ymm2
3650 ; X64-NEXT: kmovw %edi, %k1
3651 ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
3652 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
3653 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
3654 ; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0
3657 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
3659 ; X86-NEXT: vpmovqd %zmm0, %ymm2
3660 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3661 ; X86-NEXT: kmovw %eax, %k1
3662 ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
3663 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
3664 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
3665 ; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0
3667 %1 = trunc <8 x i64> %x0 to <8 x i32>
3668 %2 = trunc <8 x i64> %x0 to <8 x i32>
3669 %3 = bitcast i8 %x2 to <8 x i1>
3670 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %x1
3671 %5 = trunc <8 x i64> %x0 to <8 x i32>
3672 %6 = bitcast i8 %x2 to <8 x i1>
3673 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> zeroinitializer
3674 %res3 = add <8 x i32> %1, %4
3675 %res4 = add <8 x i32> %res3, %7
3679 declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3681 define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3682 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
3684 ; X64-NEXT: kmovw %esi, %k1
3685 ; X64-NEXT: vpmovqd %zmm0, (%rdi)
3686 ; X64-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
3687 ; X64-NEXT: vzeroupper
3690 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
3692 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3693 ; X86-NEXT: kmovw %eax, %k1
3694 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3695 ; X86-NEXT: vpmovqd %zmm0, (%eax)
3696 ; X86-NEXT: vpmovqd %zmm0, (%eax) {%k1}
3697 ; X86-NEXT: vzeroupper
3699 call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3700 call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3704 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
3706 define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3707 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
3709 ; X64-NEXT: kmovw %edi, %k1
3710 ; X64-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
3711 ; X64-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
3712 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3713 ; X64-NEXT: vpmovsqd %zmm0, %ymm0
3714 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3717 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
3719 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3720 ; X86-NEXT: kmovw %eax, %k1
3721 ; X86-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
3722 ; X86-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
3723 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3724 ; X86-NEXT: vpmovsqd %zmm0, %ymm0
3725 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3727 %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
3728 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
3729 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
3730 %res3 = add <8 x i32> %res0, %res1
3731 %res4 = add <8 x i32> %res3, %res2
3735 declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3737 define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3738 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
3740 ; X64-NEXT: kmovw %esi, %k1
3741 ; X64-NEXT: vpmovsqd %zmm0, (%rdi)
3742 ; X64-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
3743 ; X64-NEXT: vzeroupper
3746 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
3748 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3749 ; X86-NEXT: kmovw %eax, %k1
3750 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3751 ; X86-NEXT: vpmovsqd %zmm0, (%eax)
3752 ; X86-NEXT: vpmovsqd %zmm0, (%eax) {%k1}
3753 ; X86-NEXT: vzeroupper
3755 call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3756 call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3760 declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
3762 define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3763 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
3765 ; X64-NEXT: kmovw %edi, %k1
3766 ; X64-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
3767 ; X64-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
3768 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3769 ; X64-NEXT: vpmovusqd %zmm0, %ymm0
3770 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3773 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
3775 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3776 ; X86-NEXT: kmovw %eax, %k1
3777 ; X86-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
3778 ; X86-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
3779 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3780 ; X86-NEXT: vpmovusqd %zmm0, %ymm0
3781 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3783 %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
3784 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
3785 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
3786 %res3 = add <8 x i32> %res0, %res1
3787 %res4 = add <8 x i32> %res3, %res2
3791 declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3793 define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3794 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
3796 ; X64-NEXT: kmovw %esi, %k1
3797 ; X64-NEXT: vpmovusqd %zmm0, (%rdi)
3798 ; X64-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
3799 ; X64-NEXT: vzeroupper
3802 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
3804 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3805 ; X86-NEXT: kmovw %eax, %k1
3806 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3807 ; X86-NEXT: vpmovusqd %zmm0, (%eax)
3808 ; X86-NEXT: vpmovusqd %zmm0, (%eax) {%k1}
3809 ; X86-NEXT: vzeroupper
3811 call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3812 call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3816 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
3818 define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3819 ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_512:
3821 ; X64-NEXT: kmovw %edi, %k1
3822 ; X64-NEXT: vpmovdb %zmm0, %xmm2
3823 ; X64-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
3824 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
3825 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3826 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3827 ; X64-NEXT: vzeroupper
3830 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_512:
3832 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3833 ; X86-NEXT: vpmovdb %zmm0, %xmm2
3834 ; X86-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
3835 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
3836 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3837 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3838 ; X86-NEXT: vzeroupper
3840 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3841 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3842 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3843 %res3 = add <16 x i8> %res0, %res1
3844 %res4 = add <16 x i8> %res3, %res2
3848 declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
3850 define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3851 ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
3853 ; X64-NEXT: kmovw %esi, %k1
3854 ; X64-NEXT: vpmovdb %zmm0, (%rdi)
3855 ; X64-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
3856 ; X64-NEXT: vzeroupper
3859 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
3861 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3862 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3863 ; X86-NEXT: vpmovdb %zmm0, (%eax)
3864 ; X86-NEXT: vpmovdb %zmm0, (%eax) {%k1}
3865 ; X86-NEXT: vzeroupper
3867 call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3868 call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3872 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
3874 define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3875 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
3877 ; X64-NEXT: kmovw %edi, %k1
3878 ; X64-NEXT: vpmovsdb %zmm0, %xmm2
3879 ; X64-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
3880 ; X64-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
3881 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3882 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3883 ; X64-NEXT: vzeroupper
3886 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
3888 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3889 ; X86-NEXT: vpmovsdb %zmm0, %xmm2
3890 ; X86-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
3891 ; X86-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
3892 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3893 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3894 ; X86-NEXT: vzeroupper
3896 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3897 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3898 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3899 %res3 = add <16 x i8> %res0, %res1
3900 %res4 = add <16 x i8> %res3, %res2
3904 declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
3906 define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3907 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
3909 ; X64-NEXT: kmovw %esi, %k1
3910 ; X64-NEXT: vpmovsdb %zmm0, (%rdi)
3911 ; X64-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
3912 ; X64-NEXT: vzeroupper
3915 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
3917 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3918 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3919 ; X86-NEXT: vpmovsdb %zmm0, (%eax)
3920 ; X86-NEXT: vpmovsdb %zmm0, (%eax) {%k1}
3921 ; X86-NEXT: vzeroupper
3923 call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3924 call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3928 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
3930 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3931 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
3933 ; X64-NEXT: kmovw %edi, %k1
3934 ; X64-NEXT: vpmovusdb %zmm0, %xmm2
3935 ; X64-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
3936 ; X64-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
3937 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3938 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3939 ; X64-NEXT: vzeroupper
3942 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
3944 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3945 ; X86-NEXT: vpmovusdb %zmm0, %xmm2
3946 ; X86-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
3947 ; X86-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
3948 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3949 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3950 ; X86-NEXT: vzeroupper
3952 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3953 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3954 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3955 %res3 = add <16 x i8> %res0, %res1
3956 %res4 = add <16 x i8> %res3, %res2
3960 declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
3962 define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3963 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
3965 ; X64-NEXT: kmovw %esi, %k1
3966 ; X64-NEXT: vpmovusdb %zmm0, (%rdi)
3967 ; X64-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
3968 ; X64-NEXT: vzeroupper
3971 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
3973 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3974 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3975 ; X86-NEXT: vpmovusdb %zmm0, (%eax)
3976 ; X86-NEXT: vpmovusdb %zmm0, (%eax) {%k1}
3977 ; X86-NEXT: vzeroupper
3979 call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3980 call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3984 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
3986 define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
3987 ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
3989 ; X64-NEXT: kmovw %edi, %k1
3990 ; X64-NEXT: vpmovdw %zmm0, %ymm2
3991 ; X64-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
3992 ; X64-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
3993 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
3994 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
3997 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
3999 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4000 ; X86-NEXT: vpmovdw %zmm0, %ymm2
4001 ; X86-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
4002 ; X86-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
4003 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4004 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4006 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4007 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4008 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4009 %res3 = add <16 x i16> %res0, %res1
4010 %res4 = add <16 x i16> %res3, %res2
4011 ret <16 x i16> %res4
4014 declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4016 define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4017 ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
4019 ; X64-NEXT: kmovw %esi, %k1
4020 ; X64-NEXT: vpmovdw %zmm0, (%rdi)
4021 ; X64-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
4022 ; X64-NEXT: vzeroupper
4025 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
4027 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4028 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4029 ; X86-NEXT: vpmovdw %zmm0, (%eax)
4030 ; X86-NEXT: vpmovdw %zmm0, (%eax) {%k1}
4031 ; X86-NEXT: vzeroupper
4033 call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4034 call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4038 declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4040 define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4041 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
4043 ; X64-NEXT: kmovw %edi, %k1
4044 ; X64-NEXT: vpmovsdw %zmm0, %ymm2
4045 ; X64-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
4046 ; X64-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
4047 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4048 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4051 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
4053 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4054 ; X86-NEXT: vpmovsdw %zmm0, %ymm2
4055 ; X86-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
4056 ; X86-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
4057 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4058 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4060 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4061 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4062 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4063 %res3 = add <16 x i16> %res0, %res1
4064 %res4 = add <16 x i16> %res3, %res2
4065 ret <16 x i16> %res4
4068 declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4070 define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4071 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
4073 ; X64-NEXT: kmovw %esi, %k1
4074 ; X64-NEXT: vpmovsdw %zmm0, (%rdi)
4075 ; X64-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
4076 ; X64-NEXT: vzeroupper
4079 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
4081 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4082 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4083 ; X86-NEXT: vpmovsdw %zmm0, (%eax)
4084 ; X86-NEXT: vpmovsdw %zmm0, (%eax) {%k1}
4085 ; X86-NEXT: vzeroupper
4087 call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4088 call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4092 declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
4094 define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4095 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
4097 ; X64-NEXT: kmovw %edi, %k1
4098 ; X64-NEXT: vpmovusdw %zmm0, %ymm2
4099 ; X64-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
4100 ; X64-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
4101 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4102 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4105 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
4107 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4108 ; X86-NEXT: vpmovusdw %zmm0, %ymm2
4109 ; X86-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
4110 ; X86-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
4111 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4112 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4114 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4115 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4116 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4117 %res3 = add <16 x i16> %res0, %res1
4118 %res4 = add <16 x i16> %res3, %res2
4119 ret <16 x i16> %res4
4122 declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4124 define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4125 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
4127 ; X64-NEXT: kmovw %esi, %k1
4128 ; X64-NEXT: vpmovusdw %zmm0, (%rdi)
4129 ; X64-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
4130 ; X64-NEXT: vzeroupper
4133 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
4135 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4136 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4137 ; X86-NEXT: vpmovusdw %zmm0, (%eax)
4138 ; X86-NEXT: vpmovusdw %zmm0, (%eax) {%k1}
4139 ; X86-NEXT: vzeroupper
4141 call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4142 call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4146 declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)
4148 define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4149 ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
4151 ; X64-NEXT: kmovw %edi, %k1
4152 ; X64-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
4153 ; X64-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
4154 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4157 ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
4159 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4160 ; X86-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
4161 ; X86-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
4162 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4164 %cvt = sitofp <16 x i32> %x0 to <16 x float>
4165 %1 = bitcast i16 %x2 to <16 x i1>
4166 %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
4167 %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
4168 %res2 = fadd <16 x float> %2, %3
4169 ret <16 x float> %res2
4172 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4174 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4175 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
4177 ; X64-NEXT: kmovw %edi, %k1
4178 ; X64-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
4179 ; X64-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
4180 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4183 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
4185 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4186 ; X86-NEXT: kmovw %eax, %k1
4187 ; X86-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
4188 ; X86-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
4189 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4191 %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4192 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4193 %res2 = add <8 x i32> %res, %res1
4197 declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
4199 define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
4200 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
4202 ; X64-NEXT: kmovw %edi, %k1
4203 ; X64-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
4204 ; X64-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
4205 ; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0
4208 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
4210 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4211 ; X86-NEXT: kmovw %eax, %k1
4212 ; X86-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
4213 ; X86-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
4214 ; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0
4216 %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
4217 %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
4218 %res2 = fadd <8 x float> %res, %res1
4219 ret <8 x float> %res2
4222 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4224 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4225 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
4227 ; X64-NEXT: kmovw %edi, %k1
4228 ; X64-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
4229 ; X64-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
4230 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4233 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
4235 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4236 ; X86-NEXT: kmovw %eax, %k1
4237 ; X86-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
4238 ; X86-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
4239 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4241 %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
4242 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4243 %res2 = add <8 x i32> %res, %res1
4247 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4249 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4250 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
4252 ; X64-NEXT: kmovw %edi, %k1
4253 ; X64-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
4254 ; X64-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
4255 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4258 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
4260 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4261 ; X86-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
4262 ; X86-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
4263 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4265 %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
4266 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4267 %res2 = add <16 x i32> %res, %res1
4268 ret <16 x i32> %res2
4271 declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
4273 define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
4274 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
4276 ; X64-NEXT: kmovw %edi, %k1
4277 ; X64-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
4278 ; X64-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
4279 ; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4282 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
4284 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4285 ; X86-NEXT: kmovw %eax, %k1
4286 ; X86-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
4287 ; X86-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
4288 ; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4290 %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
4291 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
4292 %res2 = fadd <8 x double> %res, %res1
4293 ret <8 x double> %res2
4296 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
4298 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4299 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
4301 ; X64-NEXT: kmovw %edi, %k1
4302 ; X64-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
4303 ; X64-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
4304 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4307 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
4309 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4310 ; X86-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
4311 ; X86-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
4312 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4314 %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
4315 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4316 %res2 = add <16 x i32> %res, %res1
4317 ret <16 x i32> %res2
4320 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4322 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4323 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
4325 ; X64-NEXT: kmovw %edi, %k1
4326 ; X64-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
4327 ; X64-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
4328 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4331 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
4333 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4334 ; X86-NEXT: kmovw %eax, %k1
4335 ; X86-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
4336 ; X86-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
4337 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4339 %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4340 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4341 %res2 = add <8 x i32> %res, %res1
4345 declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)
4347 define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4348 ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
4350 ; X64-NEXT: kmovw %edi, %k1
4351 ; X64-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
4352 ; X64-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
4353 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4356 ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
4358 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4359 ; X86-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
4360 ; X86-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
4361 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4363 %cvt = uitofp <16 x i32> %x0 to <16 x float>
4364 %1 = bitcast i16 %x2 to <16 x i1>
4365 %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
4366 %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
4367 %res2 = fadd <16 x float> %2, %3
4368 ret <16 x float> %res2
4371 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4373 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4374 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
4376 ; X64-NEXT: kmovw %edi, %k1
4377 ; X64-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
4378 ; X64-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
4379 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4382 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
4384 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4385 ; X86-NEXT: kmovw %eax, %k1
4386 ; X86-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
4387 ; X86-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
4388 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4390 %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4391 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4392 %res2 = add <8 x i32> %res, %res1
4396 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4398 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4399 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
4401 ; X64-NEXT: kmovw %edi, %k1
4402 ; X64-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
4403 ; X64-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
4404 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4407 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
4409 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4410 ; X86-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
4411 ; X86-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
4412 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4414 %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
4415 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4416 %res2 = add <16 x i32> %res, %res1
4417 ret <16 x i32> %res2
4420 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
4422 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4423 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
4425 ; X64-NEXT: kmovw %edi, %k1
4426 ; X64-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
4427 ; X64-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
4428 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4431 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
4433 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4434 ; X86-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
4435 ; X86-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
4436 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4438 %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
4439 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4440 %res2 = add <16 x i32> %res, %res1
4441 ret <16 x i32> %res2
4444 declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
4446 define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
4447 ; X64-LABEL: test_getexp_ss:
4449 ; X64-NEXT: kmovw %edi, %k1
4450 ; X64-NEXT: vmovaps %xmm2, %xmm3
4451 ; X64-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
4452 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
4453 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5
4454 ; X64-NEXT: vaddps %xmm5, %xmm4, %xmm4
4455 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4456 ; X64-NEXT: vaddps %xmm2, %xmm3, %xmm0
4457 ; X64-NEXT: vaddps %xmm4, %xmm0, %xmm0
4460 ; X86-LABEL: test_getexp_ss:
4462 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4463 ; X86-NEXT: kmovw %eax, %k1
4464 ; X86-NEXT: vmovaps %xmm2, %xmm3
4465 ; X86-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
4466 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4467 ; X86-NEXT: vaddps %xmm2, %xmm3, %xmm2
4468 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
4469 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
4470 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
4471 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
4473 %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
4474 %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
4475 %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
4476 %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
4478 %res.1 = fadd <4 x float> %res0, %res1
4479 %res.2 = fadd <4 x float> %res2, %res3
4480 %res = fadd <4 x float> %res.1, %res.2
4481 ret <4 x float> %res
4484 declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
4486 define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
4487 ; X64-LABEL: test_getexp_sd:
4489 ; X64-NEXT: kmovw %edi, %k1
4490 ; X64-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3
4491 ; X64-NEXT: vmovapd %xmm2, %xmm4
4492 ; X64-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
4493 ; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
4494 ; X64-NEXT: vaddpd %xmm3, %xmm5, %xmm3
4495 ; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4496 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
4497 ; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0
4500 ; X86-LABEL: test_getexp_sd:
4502 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4503 ; X86-NEXT: kmovw %eax, %k1
4504 ; X86-NEXT: vmovapd %xmm2, %xmm3
4505 ; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm3 {%k1}
4506 ; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
4507 ; X86-NEXT: vgetexpsd %xmm1, %xmm0, %xmm2 {%k1}
4508 ; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm2
4509 ; X86-NEXT: vgetexpsd %xmm1, %xmm0, %xmm0
4510 ; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0
4511 ; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4513 %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
4514 %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
4515 %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
4516 %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
4518 %res.1 = fadd <2 x double> %res0, %res1
4519 %res.2 = fadd <2 x double> %res2, %res3
4520 %res = fadd <2 x double> %res.1, %res.2
4521 ret <2 x double> %res
4524 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
4526 define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
4527 ; X64-LABEL: test_int_x86_avx512_mask_cmp_sd:
4529 ; X64-NEXT: kmovw %edi, %k1
4530 ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4531 ; X64-NEXT: kmovw %k0, %eax
4532 ; X64-NEXT: # kill: def $al killed $al killed $eax
4535 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd:
4537 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4538 ; X86-NEXT: kmovw %eax, %k1
4539 ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4540 ; X86-NEXT: kmovw %k0, %eax
4541 ; X86-NEXT: # kill: def $al killed $al killed $eax
4544 %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
4548 define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
4549 ; X64-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
4551 ; X64-NEXT: kmovw %edi, %k1
4552 ; X64-NEXT: vcmplesd %xmm1, %xmm0, %k0
4553 ; X64-NEXT: kmovw %k0, %ecx
4554 ; X64-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
4555 ; X64-NEXT: kmovw %k0, %edx
4556 ; X64-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
4557 ; X64-NEXT: kmovw %k0, %esi
4558 ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4559 ; X64-NEXT: kmovw %k0, %eax
4560 ; X64-NEXT: orb %sil, %al
4561 ; X64-NEXT: orb %dl, %al
4562 ; X64-NEXT: orb %cl, %al
4563 ; X64-NEXT: # kill: def $al killed $al killed $eax
4566 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
4568 ; X86-NEXT: pushl %ebx
4569 ; X86-NEXT: .cfi_def_cfa_offset 8
4570 ; X86-NEXT: .cfi_offset %ebx, -8
4571 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4572 ; X86-NEXT: kmovw %eax, %k1
4573 ; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0
4574 ; X86-NEXT: kmovw %k0, %ecx
4575 ; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
4576 ; X86-NEXT: kmovw %k0, %edx
4577 ; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
4578 ; X86-NEXT: kmovw %k0, %ebx
4579 ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4580 ; X86-NEXT: kmovw %k0, %eax
4581 ; X86-NEXT: orb %bl, %al
4582 ; X86-NEXT: orb %dl, %al
4583 ; X86-NEXT: orb %cl, %al
4584 ; X86-NEXT: # kill: def $al killed $al killed $eax
4585 ; X86-NEXT: popl %ebx
4586 ; X86-NEXT: .cfi_def_cfa_offset 4
4589 %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
4590 %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
4591 %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
4592 %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
4594 %res11 = or i8 %res1, %res2
4595 %res12 = or i8 %res3, %res4
4596 %res13 = or i8 %res11, %res12
4600 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
4602 define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
4603 ; X64-LABEL: test_int_x86_avx512_mask_cmp_ss:
4605 ; X64-NEXT: kmovw %edi, %k1
4606 ; X64-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
4607 ; X64-NEXT: kmovw %k0, %eax
4608 ; X64-NEXT: # kill: def $al killed $al killed $eax
4611 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss:
4613 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4614 ; X86-NEXT: kmovw %eax, %k1
4615 ; X86-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
4616 ; X86-NEXT: kmovw %k0, %eax
4617 ; X86-NEXT: # kill: def $al killed $al killed $eax
4620 %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
4625 define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
4626 ; X64-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
4628 ; X64-NEXT: kmovw %edi, %k1
4629 ; X64-NEXT: vcmpless %xmm1, %xmm0, %k0
4630 ; X64-NEXT: kmovw %k0, %ecx
4631 ; X64-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
4632 ; X64-NEXT: kmovw %k0, %edx
4633 ; X64-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
4634 ; X64-NEXT: kmovw %k0, %esi
4635 ; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
4636 ; X64-NEXT: kmovw %k0, %eax
4637 ; X64-NEXT: andb %sil, %al
4638 ; X64-NEXT: andb %dl, %al
4639 ; X64-NEXT: andb %cl, %al
4640 ; X64-NEXT: # kill: def $al killed $al killed $eax
4643 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
4645 ; X86-NEXT: pushl %ebx
4646 ; X86-NEXT: .cfi_def_cfa_offset 8
4647 ; X86-NEXT: .cfi_offset %ebx, -8
4648 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4649 ; X86-NEXT: kmovw %eax, %k1
4650 ; X86-NEXT: vcmpless %xmm1, %xmm0, %k0
4651 ; X86-NEXT: kmovw %k0, %ecx
4652 ; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
4653 ; X86-NEXT: kmovw %k0, %edx
4654 ; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
4655 ; X86-NEXT: kmovw %k0, %ebx
4656 ; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
4657 ; X86-NEXT: kmovw %k0, %eax
4658 ; X86-NEXT: andb %bl, %al
4659 ; X86-NEXT: andb %dl, %al
4660 ; X86-NEXT: andb %cl, %al
4661 ; X86-NEXT: # kill: def $al killed $al killed $eax
4662 ; X86-NEXT: popl %ebx
4663 ; X86-NEXT: .cfi_def_cfa_offset 4
4665 %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
4666 %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
4667 %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
4668 %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
4670 %res11 = and i8 %res1, %res2
4671 %res12 = and i8 %res3, %res4
4672 %res13 = and i8 %res11, %res12
4676 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
4678 define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
4679 ; X64-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
4681 ; X64-NEXT: kmovw %edi, %k1
4682 ; X64-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
4683 ; X64-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
4684 ; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4687 ; X86-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
4689 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4690 ; X86-NEXT: kmovw %eax, %k1
4691 ; X86-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
4692 ; X86-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
4693 ; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4695 %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
4696 %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
4697 %res2 = fadd <8 x double> %res, %res1
4698 ret <8 x double> %res2
4701 declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
4703 define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
4704 ; X64-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
4706 ; X64-NEXT: kmovw %edi, %k1
4707 ; X64-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
4708 ; X64-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
4709 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4712 ; X86-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
4714 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4715 ; X86-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
4716 ; X86-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
4717 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4719 %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
4720 %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
4721 %res2 = fadd <16 x float> %res, %res1
4722 ret <16 x float> %res2
4725 declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
4727 define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
4728 ; X64-LABEL: test_int_x86_avx512_mask_getmant_sd:
4730 ; X64-NEXT: kmovw %edi, %k1
4731 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3
4732 ; X64-NEXT: vmovapd %xmm2, %xmm4
4733 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
4734 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
4735 ; X64-NEXT: vaddpd %xmm5, %xmm4, %xmm4
4736 ; X64-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4737 ; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0
4738 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
4741 ; X86-LABEL: test_int_x86_avx512_mask_getmant_sd:
4743 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4744 ; X86-NEXT: kmovw %eax, %k1
4745 ; X86-NEXT: vmovapd %xmm2, %xmm3
4746 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
4747 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
4748 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
4749 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
4750 ; X86-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4751 ; X86-NEXT: vaddpd %xmm4, %xmm2, %xmm0
4752 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
4754 %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
4755 %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
4756 %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
4757 %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
4758 %res11 = fadd <2 x double> %res, %res1
4759 %res12 = fadd <2 x double> %res2, %res3
4760 %res13 = fadd <2 x double> %res11, %res12
4761 ret <2 x double> %res13
4764 declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
4766 define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
4767 ; X64-LABEL: test_int_x86_avx512_mask_getmant_ss:
4769 ; X64-NEXT: kmovw %edi, %k1
4770 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
4771 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
4772 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
4773 ; X64-NEXT: vaddps %xmm4, %xmm2, %xmm2
4774 ; X64-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
4775 ; X64-NEXT: vaddps %xmm3, %xmm0, %xmm0
4776 ; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
4779 ; X86-LABEL: test_int_x86_avx512_mask_getmant_ss:
4781 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4782 ; X86-NEXT: kmovw %eax, %k1
4783 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
4784 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
4785 ; X86-NEXT: vaddps %xmm3, %xmm2, %xmm2
4786 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
4787 ; X86-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
4788 ; X86-NEXT: vaddps %xmm3, %xmm0, %xmm0
4789 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
4791 %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
4792 %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
4793 %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
4794 %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
4795 %res11 = fadd <4 x float> %res, %res1
4796 %res12 = fadd <4 x float> %res2, %res3
4797 %res13 = fadd <4 x float> %res11, %res12
4798 ret <4 x float> %res13
4801 declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
4803 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) {
4804 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512:
4806 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
4807 ; CHECK-NEXT: ret{{[l|q]}}
4808 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4809 ret <8 x double> %res
4812 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) {
4813 ; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
4815 ; X64-NEXT: kmovw %edi, %k1
4816 ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
4817 ; X64-NEXT: vmovapd %zmm2, %zmm0
4820 ; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
4822 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4823 ; X86-NEXT: kmovw %eax, %k1
4824 ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
4825 ; X86-NEXT: vmovapd %zmm2, %zmm0
4827 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4828 %mask.cast = bitcast i8 %mask to <8 x i1>
4829 %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
4830 ret <8 x double> %res2
4833 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) {
4834 ; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
4836 ; X64-NEXT: kmovw %edi, %k1
4837 ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
4840 ; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
4842 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4843 ; X86-NEXT: kmovw %eax, %k1
4844 ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
4846 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4847 %mask.cast = bitcast i8 %mask to <8 x i1>
4848 %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
4849 ret <8 x double> %res2
4852 declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
4854 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) {
4855 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512:
4857 ; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0
4858 ; CHECK-NEXT: ret{{[l|q]}}
4859 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4860 ret <16 x float> %res
4863 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
4864 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
4866 ; X64-NEXT: kmovw %edi, %k1
4867 ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
4868 ; X64-NEXT: vmovaps %zmm2, %zmm0
4871 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
4873 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4874 ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
4875 ; X86-NEXT: vmovaps %zmm2, %zmm0
4877 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4878 %mask.cast = bitcast i16 %mask to <16 x i1>
4879 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
4880 ret <16 x float> %res2
4883 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
4884 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
4886 ; X64-NEXT: kmovw %edi, %k1
4887 ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
4890 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
4892 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4893 ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
4895 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4896 %mask.cast = bitcast i16 %mask to <16 x i1>
4897 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
4898 ret <16 x float> %res2
4901 ; Test case to make sure we can print shuffle decode comments for constant pool loads.
4902 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) {
4903 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool:
4905 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4906 ; CHECK-NEXT: ret{{[l|q]}}
4907 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4908 ret <16 x float> %res
4911 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
4912 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
4914 ; X64-NEXT: kmovw %edi, %k1
4915 ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4916 ; X64-NEXT: vmovaps %zmm2, %zmm0
4919 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
4921 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4922 ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4923 ; X86-NEXT: vmovaps %zmm2, %zmm0
4925 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4926 %mask.cast = bitcast i16 %mask to <16 x i1>
4927 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
4928 ret <16 x float> %res2
4931 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
4932 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
4934 ; X64-NEXT: kmovw %edi, %k1
4935 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4938 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
4940 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4941 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4943 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4944 %mask.cast = bitcast i16 %mask to <16 x i1>
4945 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
4946 ret <16 x float> %res2
4949 declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
4951 define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
4952 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
4954 ; X64-NEXT: kmovw %edi, %k1
4955 ; X64-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
4956 ; X64-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
4957 ; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4960 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
4962 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4963 ; X86-NEXT: kmovw %eax, %k1
4964 ; X86-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
4965 ; X86-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
4966 ; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4968 %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
4969 %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
4970 %res2 = fadd <2 x double> %res, %res1
4971 ret <2 x double> %res2
4974 declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
4976 define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
4977 ; X64-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
4979 ; X64-NEXT: kmovw %edi, %k1
4980 ; X64-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
4981 ; X64-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
4982 ; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
4985 ; X86-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
4987 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4988 ; X86-NEXT: kmovw %eax, %k1
4989 ; X86-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
4990 ; X86-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
4991 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
4993 %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
4994 %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
4995 %res2 = fadd <4 x float> %res, %res1
4996 ret <4 x float> %res2
4999 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
5001 define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
5002 ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
5004 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5005 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5006 ; X64-NEXT: kmovw %edi, %k1
5007 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
5008 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5011 ; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
5013 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5014 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5015 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5016 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
5017 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5019 %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5020 %2 = bitcast i16 %x4 to <16 x i1>
5021 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
5022 %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5023 %res2 = add <16 x i32> %3, %4
5024 ret <16 x i32> %res2
5027 define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
5028 ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
5030 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5031 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5032 ; X64-NEXT: kmovw %edi, %k1
5033 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5034 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5037 ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
5039 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5040 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5041 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5042 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5043 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5045 %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5046 %2 = bitcast i16 %x4 to <16 x i1>
5047 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
5048 %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5049 %res2 = add <16 x i32> %3, %4
5050 ret <16 x i32> %res2
5053 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
5055 define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
5056 ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
5058 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5059 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5060 ; X64-NEXT: kmovw %edi, %k1
5061 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
5062 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5065 ; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
5067 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5068 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5069 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5070 ; X86-NEXT: kmovw %eax, %k1
5071 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
5072 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5074 %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5075 %2 = bitcast i8 %x4 to <8 x i1>
5076 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
5077 %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5078 %res2 = add <8 x i64> %3, %4
5082 define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
5083 ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
5085 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5086 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5087 ; X64-NEXT: kmovw %edi, %k1
5088 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5089 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5092 ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
5094 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5095 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5096 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5097 ; X86-NEXT: kmovw %eax, %k1
5098 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5099 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5101 %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5102 %2 = bitcast i8 %x4 to <8 x i1>
5103 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
5104 %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5105 %res2 = add <8 x i64> %3, %4
5109 define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
5110 ; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
5112 ; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
5113 ; CHECK-NEXT: kmovw %k0, %eax
5114 ; CHECK-NEXT: ret{{[l|q]}}
5115 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
5119 define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
5120 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
5122 ; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
5123 ; CHECK-NEXT: kmovw %k0, %eax
5124 ; CHECK-NEXT: ret{{[l|q]}}
5125 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
5129 define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
5130 ; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
5132 ; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
5133 ; CHECK-NEXT: kmovw %k0, %eax
5134 ; CHECK-NEXT: ret{{[l|q]}}
5135 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
5139 define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
5140 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
5142 ; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
5143 ; CHECK-NEXT: kmovw %k0, %eax
5144 ; CHECK-NEXT: ret{{[l|q]}}
5145 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
5149 define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
5150 ; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
5152 ; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
5153 ; CHECK-NEXT: kmovw %k0, %eax
5154 ; CHECK-NEXT: ret{{[l|q]}}
5155 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
5159 define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
5160 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
5162 ; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
5163 ; CHECK-NEXT: kmovw %k0, %eax
5164 ; CHECK-NEXT: ret{{[l|q]}}
5165 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
5169 define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
5170 ; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
5172 ; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
5173 ; CHECK-NEXT: kmovw %k0, %eax
5174 ; CHECK-NEXT: ret{{[l|q]}}
5175 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
5179 define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
5180 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
5182 ; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
5183 ; CHECK-NEXT: kmovw %k0, %eax
5184 ; CHECK-NEXT: ret{{[l|q]}}
5185 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
5189 declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
5191 define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
5192 ; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
5194 ; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
5195 ; CHECK-NEXT: kmovw %k0, %eax
5196 ; CHECK-NEXT: ret{{[l|q]}}
5197 %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
5201 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
5203 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
5205 define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
5206 ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512:
5208 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3
5209 ; X64-NEXT: kmovw %edi, %k1
5210 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
5211 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
5212 ; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0
5213 ; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0
5216 ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512:
5218 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3
5219 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5220 ; X86-NEXT: kmovw %eax, %k1
5221 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
5222 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
5223 ; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0
5224 ; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0
5226 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5227 %2 = bitcast i8 %x3 to <8 x i1>
5228 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
5229 %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5230 %5 = bitcast i8 %x3 to <8 x i1>
5231 %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer
5232 %7 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5233 %res3 = fadd <8 x double> %3, %6
5234 %res4 = fadd <8 x double> %res3, %7
5235 ret <8 x double> %res4
5238 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
5240 define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
5241 ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512:
5243 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3
5244 ; X64-NEXT: kmovw %edi, %k1
5245 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
5246 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
5247 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5248 ; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
5251 ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512:
5253 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3
5254 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5255 ; X86-NEXT: kmovw %eax, %k1
5256 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
5257 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
5258 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5259 ; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0
5261 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5262 %2 = bitcast i8 %x3 to <8 x i1>
5263 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
5264 %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5265 %5 = bitcast i8 %x3 to <8 x i1>
5266 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5267 %7 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5268 %res3 = add <8 x i64> %3, %6
5269 %res4 = add <8 x i64> %res3, %7
5273 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
5275 define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
5276 ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
5278 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3
5279 ; X64-NEXT: kmovw %edi, %k1
5280 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
5281 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
5282 ; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0
5283 ; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0
5286 ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
5288 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3
5289 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5290 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
5291 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
5292 ; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0
5293 ; X86-NEXT: vaddps %zmm3, %zmm0, %zmm0
5295 %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5296 %2 = bitcast i16 %x3 to <16 x i1>
5297 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
5298 %4 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5299 %5 = bitcast i16 %x3 to <16 x i1>
5300 %6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer
5301 %7 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5302 %res3 = fadd <16 x float> %3, %6
5303 %res4 = fadd <16 x float> %res3, %7
5304 ret <16 x float> %res4
5307 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
5309 define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
5310 ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512:
5312 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3
5313 ; X64-NEXT: kmovw %edi, %k1
5314 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
5315 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
5316 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5317 ; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0
5320 ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512:
5322 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3
5323 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5324 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
5325 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
5326 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5327 ; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0
5329 %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5330 %2 = bitcast i16 %x3 to <16 x i1>
5331 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
5332 %4 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5333 %5 = bitcast i16 %x3 to <16 x i1>
5334 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
5335 %7 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5336 %res3 = add <16 x i32> %3, %6
5337 %res4 = add <16 x i32> %res3, %7
5338 ret <16 x i32> %res4
5341 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
5343 define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
5344 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
5346 ; X64-NEXT: kmovw %edi, %k1
5347 ; X64-NEXT: vmovapd %zmm0, %zmm3
5348 ; X64-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
5349 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5350 ; X64-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5351 ; X64-NEXT: vaddpd %zmm4, %zmm3, %zmm3
5352 ; X64-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
5353 ; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5356 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
5358 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5359 ; X86-NEXT: kmovw %eax, %k1
5360 ; X86-NEXT: vmovapd %zmm0, %zmm3
5361 ; X86-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
5362 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5363 ; X86-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5364 ; X86-NEXT: vaddpd %zmm4, %zmm3, %zmm3
5365 ; X86-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
5366 ; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5368 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
5369 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
5370 %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
5371 %res3 = fadd <8 x double> %res, %res1
5372 %res4 = fadd <8 x double> %res3, %res2
5373 ret <8 x double> %res4
5376 define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, <8 x i64>* %x2ptr) {
5377 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
5379 ; X64-NEXT: vfixupimmpd $3, (%rdi), %zmm1, %zmm0
5382 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
5384 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5385 ; X86-NEXT: vfixupimmpd $3, (%eax), %zmm1, %zmm0
5387 %x2 = load <8 x i64>, <8 x i64>* %x2ptr
5388 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
5389 ret <8 x double> %res
5392 declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
5394 define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
5395 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
5397 ; X64-NEXT: kmovw %edi, %k1
5398 ; X64-NEXT: vmovapd %zmm0, %zmm3
5399 ; X64-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
5400 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5401 ; X64-NEXT: vmovapd %zmm0, %zmm5
5402 ; X64-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
5403 ; X64-NEXT: vaddpd %zmm5, %zmm3, %zmm3
5404 ; X64-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
5405 ; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5408 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
5410 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5411 ; X86-NEXT: kmovw %eax, %k1
5412 ; X86-NEXT: vmovapd %zmm0, %zmm3
5413 ; X86-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
5414 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5415 ; X86-NEXT: vmovapd %zmm0, %zmm5
5416 ; X86-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
5417 ; X86-NEXT: vaddpd %zmm5, %zmm3, %zmm3
5418 ; X86-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
5419 ; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5421 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
5422 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
5423 %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
5424 %res3 = fadd <8 x double> %res, %res1
5425 %res4 = fadd <8 x double> %res3, %res2
5426 ret <8 x double> %res4
5429 declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
5431 define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
5432 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
5434 ; X64-NEXT: kmovw %edi, %k1
5435 ; X64-NEXT: vmovaps %xmm0, %xmm3
5436 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
5437 ; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
5438 ; X64-NEXT: vmovaps %xmm0, %xmm5
5439 ; X64-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
5440 ; X64-NEXT: vaddps %xmm5, %xmm3, %xmm3
5441 ; X64-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
5442 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5445 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
5447 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5448 ; X86-NEXT: kmovw %eax, %k1
5449 ; X86-NEXT: vmovaps %xmm0, %xmm3
5450 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
5451 ; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
5452 ; X86-NEXT: vmovaps %xmm0, %xmm5
5453 ; X86-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
5454 ; X86-NEXT: vaddps %xmm5, %xmm3, %xmm3
5455 ; X86-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
5456 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5458 %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
5459 %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
5460 %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
5461 %res3 = fadd <4 x float> %res, %res1
5462 %res4 = fadd <4 x float> %res3, %res2
5463 ret <4 x float> %res4
5466 declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
5468 define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
5469 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
5471 ; X64-NEXT: kmovw %edi, %k1
5472 ; X64-NEXT: vmovaps %xmm0, %xmm3
5473 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3
5474 ; X64-NEXT: vmovaps %xmm0, %xmm4
5475 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z}
5476 ; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
5477 ; X64-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5478 ; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0
5479 ; X64-NEXT: vaddps %xmm3, %xmm0, %xmm0
5482 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
5484 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5485 ; X86-NEXT: kmovw %eax, %k1
5486 ; X86-NEXT: vmovaps %xmm0, %xmm3
5487 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5488 ; X86-NEXT: vmovaps %xmm0, %xmm4
5489 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
5490 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
5491 ; X86-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5492 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5493 ; X86-NEXT: vaddps %xmm4, %xmm0, %xmm0
5495 %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
5496 %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
5497 %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
5498 %res3 = fadd <4 x float> %res, %res1
5499 %res4 = fadd <4 x float> %res3, %res2
5500 ret <4 x float> %res4
5503 declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
5505 define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
5506 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
5508 ; X64-NEXT: kmovw %edi, %k1
5509 ; X64-NEXT: vmovaps %zmm0, %zmm3
5510 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
5511 ; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
5512 ; X64-NEXT: vmovaps %zmm0, %zmm5
5513 ; X64-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
5514 ; X64-NEXT: vaddps %zmm5, %zmm3, %zmm3
5515 ; X64-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
5516 ; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0
5519 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
5521 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5522 ; X86-NEXT: vmovaps %zmm0, %zmm3
5523 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
5524 ; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
5525 ; X86-NEXT: vmovaps %zmm0, %zmm5
5526 ; X86-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
5527 ; X86-NEXT: vaddps %zmm5, %zmm3, %zmm3
5528 ; X86-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
5529 ; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
5531 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
5532 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
5533 %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
5534 %res3 = fadd <16 x float> %res, %res1
5535 %res4 = fadd <16 x float> %res3, %res2
5536 ret <16 x float> %res4
5539 define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, <16 x i32>* %x2ptr) {
5540 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
5542 ; X64-NEXT: vfixupimmps $5, (%rdi), %zmm1, %zmm0
5545 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
5547 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5548 ; X86-NEXT: vfixupimmps $5, (%eax), %zmm1, %zmm0
5550 %x2 = load <16 x i32>, <16 x i32>* %x2ptr
5551 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
5552 ret <16 x float> %res
5555 declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
5557 define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
5558 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
5560 ; X64-NEXT: kmovw %edi, %k1
5561 ; X64-NEXT: vmovaps %zmm0, %zmm3
5562 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3
5563 ; X64-NEXT: vmovaps %zmm0, %zmm4
5564 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5565 ; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
5566 ; X64-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
5567 ; X64-NEXT: vaddps %zmm0, %zmm4, %zmm0
5568 ; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0
5571 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
5573 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5574 ; X86-NEXT: vmovaps %zmm0, %zmm3
5575 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
5576 ; X86-NEXT: vmovaps %zmm0, %zmm4
5577 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4
5578 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
5579 ; X86-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
5580 ; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
5581 ; X86-NEXT: vaddps %zmm4, %zmm0, %zmm0
5583 %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
5584 %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
5585 %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
5586 %res3 = fadd <16 x float> %res, %res1
5587 %res4 = fadd <16 x float> %res3, %res2
5588 ret <16 x float> %res4
5591 declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
5593 define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
5594 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
5596 ; X64-NEXT: kmovw %edi, %k1
5597 ; X64-NEXT: vmovapd %xmm0, %xmm3
5598 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3
5599 ; X64-NEXT: vmovapd %xmm0, %xmm4
5600 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1}
5601 ; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
5602 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
5603 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5604 ; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0
5607 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
5609 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5610 ; X86-NEXT: kmovw %eax, %k1
5611 ; X86-NEXT: vmovapd %xmm0, %xmm3
5612 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
5613 ; X86-NEXT: vmovapd %xmm0, %xmm4
5614 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
5615 ; X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2
5616 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
5617 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5618 ; X86-NEXT: vaddpd %xmm4, %xmm0, %xmm0
5620 %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
5621 %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
5622 %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
5623 %res3 = fadd <2 x double> %res, %res1
5624 %res4 = fadd <2 x double> %res3, %res2
5625 ret <2 x double> %res4
5628 declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
5630 define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
5631 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
5633 ; X64-NEXT: kmovw %edi, %k1
5634 ; X64-NEXT: vmovapd %xmm0, %xmm3
5635 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5636 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5637 ; X64-NEXT: vmovapd %xmm0, %xmm5
5638 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
5639 ; X64-NEXT: vaddpd %xmm5, %xmm3, %xmm3
5640 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5641 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5644 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
5646 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5647 ; X86-NEXT: kmovw %eax, %k1
5648 ; X86-NEXT: vmovapd %xmm0, %xmm3
5649 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5650 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5651 ; X86-NEXT: vmovapd %xmm0, %xmm5
5652 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
5653 ; X86-NEXT: vaddpd %xmm5, %xmm3, %xmm3
5654 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5655 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5657 %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
5658 %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
5659 %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
5660 %res3 = fadd <2 x double> %res, %res1
5661 %res4 = fadd <2 x double> %res3, %res2
5662 ret <2 x double> %res4
5665 declare double @llvm.fma.f64(double, double, double) #1
5666 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0
5668 define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5669 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
5671 ; X64-NEXT: vmovapd %xmm0, %xmm3
5672 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5673 ; X64-NEXT: kmovw %edi, %k1
5674 ; X64-NEXT: vmovapd %xmm0, %xmm4
5675 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5676 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5677 ; X64-NEXT: vmovapd %xmm0, %xmm4
5678 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
5679 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5680 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5681 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5684 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
5686 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5687 ; X86-NEXT: vmovapd %xmm0, %xmm3
5688 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5689 ; X86-NEXT: kmovw %eax, %k1
5690 ; X86-NEXT: vmovapd %xmm0, %xmm4
5691 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5692 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5693 ; X86-NEXT: vmovapd %xmm0, %xmm4
5694 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
5695 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5696 ; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5697 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5699 %1 = extractelement <2 x double> %x0, i64 0
5700 %2 = extractelement <2 x double> %x1, i64 0
5701 %3 = extractelement <2 x double> %x2, i64 0
5702 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5703 %5 = insertelement <2 x double> %x0, double %4, i64 0
5704 %6 = extractelement <2 x double> %x0, i64 0
5705 %7 = extractelement <2 x double> %x1, i64 0
5706 %8 = extractelement <2 x double> %x2, i64 0
5707 %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
5708 %10 = bitcast i8 %x3 to <8 x i1>
5709 %11 = extractelement <8 x i1> %10, i64 0
5710 %12 = select i1 %11, double %9, double %6
5711 %13 = insertelement <2 x double> %x0, double %12, i64 0
5712 %14 = extractelement <2 x double> %x0, i64 0
5713 %15 = extractelement <2 x double> %x1, i64 0
5714 %16 = extractelement <2 x double> %x2, i64 0
5715 %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
5716 %18 = insertelement <2 x double> %x0, double %17, i64 0
5717 %19 = extractelement <2 x double> %x0, i64 0
5718 %20 = extractelement <2 x double> %x1, i64 0
5719 %21 = extractelement <2 x double> %x2, i64 0
5720 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
5721 %23 = bitcast i8 %x3 to <8 x i1>
5722 %24 = extractelement <8 x i1> %23, i64 0
5723 %25 = select i1 %24, double %22, double %19
5724 %26 = insertelement <2 x double> %x0, double %25, i64 0
5725 %res4 = fadd <2 x double> %5, %13
5726 %res5 = fadd <2 x double> %18, %26
5727 %res6 = fadd <2 x double> %res4, %res5
5728 ret <2 x double> %res6
5731 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5732 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
5734 ; X64-NEXT: vmovaps %xmm0, %xmm3
5735 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5736 ; X64-NEXT: kmovw %edi, %k1
5737 ; X64-NEXT: vmovaps %xmm0, %xmm4
5738 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5739 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
5740 ; X64-NEXT: vmovaps %xmm0, %xmm4
5741 ; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
5742 ; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5743 ; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0
5744 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5747 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
5749 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5750 ; X86-NEXT: vmovaps %xmm0, %xmm3
5751 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5752 ; X86-NEXT: kmovw %eax, %k1
5753 ; X86-NEXT: vmovaps %xmm0, %xmm4
5754 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5755 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
5756 ; X86-NEXT: vmovaps %xmm0, %xmm4
5757 ; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
5758 ; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5759 ; X86-NEXT: vaddps %xmm0, %xmm4, %xmm0
5760 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5762 %1 = extractelement <4 x float> %x0, i64 0
5763 %2 = extractelement <4 x float> %x1, i64 0
5764 %3 = extractelement <4 x float> %x2, i64 0
5765 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5766 %5 = insertelement <4 x float> %x0, float %4, i64 0
5767 %6 = extractelement <4 x float> %x0, i64 0
5768 %7 = extractelement <4 x float> %x1, i64 0
5769 %8 = extractelement <4 x float> %x2, i64 0
5770 %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
5771 %10 = bitcast i8 %x3 to <8 x i1>
5772 %11 = extractelement <8 x i1> %10, i64 0
5773 %12 = select i1 %11, float %9, float %6
5774 %13 = insertelement <4 x float> %x0, float %12, i64 0
5775 %14 = extractelement <4 x float> %x0, i64 0
5776 %15 = extractelement <4 x float> %x1, i64 0
5777 %16 = extractelement <4 x float> %x2, i64 0
5778 %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
5779 %18 = insertelement <4 x float> %x0, float %17, i64 0
5780 %19 = extractelement <4 x float> %x0, i64 0
5781 %20 = extractelement <4 x float> %x1, i64 0
5782 %21 = extractelement <4 x float> %x2, i64 0
5783 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
5784 %23 = bitcast i8 %x3 to <8 x i1>
5785 %24 = extractelement <8 x i1> %23, i64 0
5786 %25 = select i1 %24, float %22, float %19
5787 %26 = insertelement <4 x float> %x0, float %25, i64 0
5788 %res4 = fadd <4 x float> %5, %13
5789 %res5 = fadd <4 x float> %18, %26
5790 %res6 = fadd <4 x float> %res4, %res5
5791 ret <4 x float> %res6
5794 define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5795 ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
5797 ; X64-NEXT: kmovw %edi, %k1
5798 ; X64-NEXT: vmovapd %xmm0, %xmm3
5799 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5800 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5801 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5804 ; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
5806 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5807 ; X86-NEXT: kmovw %eax, %k1
5808 ; X86-NEXT: vmovapd %xmm0, %xmm3
5809 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5810 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5811 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5813 %1 = extractelement <2 x double> %x0, i64 0
5814 %2 = extractelement <2 x double> %x1, i64 0
5815 %3 = extractelement <2 x double> %x2, i64 0
5816 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5817 %5 = bitcast i8 %x3 to <8 x i1>
5818 %6 = extractelement <8 x i1> %5, i64 0
5819 %7 = select i1 %6, double %4, double 0.000000e+00
5820 %8 = insertelement <2 x double> %x0, double %7, i64 0
5821 %9 = extractelement <2 x double> %x0, i64 0
5822 %10 = extractelement <2 x double> %x1, i64 0
5823 %11 = extractelement <2 x double> %x2, i64 0
5824 %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
5825 %13 = bitcast i8 %x3 to <8 x i1>
5826 %14 = extractelement <8 x i1> %13, i64 0
5827 %15 = select i1 %14, double %12, double 0.000000e+00
5828 %16 = insertelement <2 x double> %x0, double %15, i64 0
5829 %res2 = fadd <2 x double> %8, %16
5830 ret <2 x double> %res2
5833 declare float @llvm.fma.f32(float, float, float) #1
5834 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0
5836 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5837 ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
5839 ; X64-NEXT: kmovw %edi, %k1
5840 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5843 ; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
5845 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5846 ; X86-NEXT: kmovw %eax, %k1
5847 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5849 %1 = extractelement <4 x float> %x0, i64 0
5850 %2 = extractelement <4 x float> %x1, i64 0
5851 %3 = extractelement <4 x float> %x2, i64 0
5852 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5853 %5 = bitcast i8 %x3 to <8 x i1>
5854 %6 = extractelement <8 x i1> %5, i64 0
5855 %7 = select i1 %6, float %4, float 0.000000e+00
5856 %8 = insertelement <4 x float> %x0, float %7, i64 0
5857 %9 = extractelement <4 x float> %x0, i64 0
5858 %10 = extractelement <4 x float> %x1, i64 0
5859 %11 = extractelement <4 x float> %x2, i64 0
5860 %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
5861 %13 = bitcast i8 %x3 to <8 x i1>
5862 %14 = extractelement <8 x i1> %13, i64 0
5863 %15 = select i1 %14, float %12, float 0.000000e+00
5864 %16 = insertelement <4 x float> %x0, float %15, i64 0
5865 %res2 = fadd <4 x float> %8, %16
5869 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5870 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
5872 ; X64-NEXT: vmovapd %xmm2, %xmm3
5873 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5874 ; X64-NEXT: kmovw %edi, %k1
5875 ; X64-NEXT: vmovapd %xmm2, %xmm4
5876 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5877 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5878 ; X64-NEXT: vmovapd %xmm2, %xmm4
5879 ; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
5880 ; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5881 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
5882 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5885 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
5887 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5888 ; X86-NEXT: vmovapd %xmm2, %xmm3
5889 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5890 ; X86-NEXT: kmovw %eax, %k1
5891 ; X86-NEXT: vmovapd %xmm2, %xmm4
5892 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5893 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5894 ; X86-NEXT: vmovapd %xmm2, %xmm4
5895 ; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
5896 ; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5897 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
5898 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5900 %1 = extractelement <2 x double> %x0, i64 0
5901 %2 = extractelement <2 x double> %x1, i64 0
5902 %3 = extractelement <2 x double> %x2, i64 0
5903 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5904 %5 = insertelement <2 x double> %x2, double %4, i64 0
5905 %6 = extractelement <2 x double> %x0, i64 0
5906 %7 = extractelement <2 x double> %x1, i64 0
5907 %8 = extractelement <2 x double> %x2, i64 0
5908 %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
5909 %10 = bitcast i8 %x3 to <8 x i1>
5910 %11 = extractelement <8 x i1> %10, i64 0
5911 %12 = select i1 %11, double %9, double %8
5912 %13 = insertelement <2 x double> %x2, double %12, i64 0
5913 %14 = extractelement <2 x double> %x0, i64 0
5914 %15 = extractelement <2 x double> %x1, i64 0
5915 %16 = extractelement <2 x double> %x2, i64 0
5916 %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
5917 %18 = insertelement <2 x double> %x2, double %17, i64 0
5918 %19 = extractelement <2 x double> %x0, i64 0
5919 %20 = extractelement <2 x double> %x1, i64 0
5920 %21 = extractelement <2 x double> %x2, i64 0
5921 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
5922 %23 = bitcast i8 %x3 to <8 x i1>
5923 %24 = extractelement <8 x i1> %23, i64 0
5924 %25 = select i1 %24, double %22, double %21
5925 %26 = insertelement <2 x double> %x2, double %25, i64 0
5926 %res4 = fadd <2 x double> %5, %13
5927 %res5 = fadd <2 x double> %18, %26
5928 %res6 = fadd <2 x double> %res4, %res5
5929 ret <2 x double> %res6
5932 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5933 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
5935 ; X64-NEXT: vmovaps %xmm2, %xmm3
5936 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5937 ; X64-NEXT: kmovw %edi, %k1
5938 ; X64-NEXT: vmovaps %xmm2, %xmm4
5939 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5940 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
5941 ; X64-NEXT: vmovaps %xmm2, %xmm4
5942 ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
5943 ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5944 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
5945 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5948 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
5950 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5951 ; X86-NEXT: vmovaps %xmm2, %xmm3
5952 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5953 ; X86-NEXT: kmovw %eax, %k1
5954 ; X86-NEXT: vmovaps %xmm2, %xmm4
5955 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5956 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
5957 ; X86-NEXT: vmovaps %xmm2, %xmm4
5958 ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
5959 ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5960 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
5961 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5963 %1 = extractelement <4 x float> %x0, i64 0
5964 %2 = extractelement <4 x float> %x1, i64 0
5965 %3 = extractelement <4 x float> %x2, i64 0
5966 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5967 %5 = insertelement <4 x float> %x2, float %4, i64 0
5968 %6 = extractelement <4 x float> %x0, i64 0
5969 %7 = extractelement <4 x float> %x1, i64 0
5970 %8 = extractelement <4 x float> %x2, i64 0
5971 %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
5972 %10 = bitcast i8 %x3 to <8 x i1>
5973 %11 = extractelement <8 x i1> %10, i64 0
5974 %12 = select i1 %11, float %9, float %8
5975 %13 = insertelement <4 x float> %x2, float %12, i64 0
5976 %14 = extractelement <4 x float> %x0, i64 0
5977 %15 = extractelement <4 x float> %x1, i64 0
5978 %16 = extractelement <4 x float> %x2, i64 0
5979 %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
5980 %18 = insertelement <4 x float> %x2, float %17, i64 0
5981 %19 = extractelement <4 x float> %x0, i64 0
5982 %20 = extractelement <4 x float> %x1, i64 0
5983 %21 = extractelement <4 x float> %x2, i64 0
5984 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
5985 %23 = bitcast i8 %x3 to <8 x i1>
5986 %24 = extractelement <8 x i1> %23, i64 0
5987 %25 = select i1 %24, float %22, float %21
5988 %26 = insertelement <4 x float> %x2, float %25, i64 0
5989 %res4 = fadd <4 x float> %5, %13
5990 %res5 = fadd <4 x float> %18, %26
5991 %res6 = fadd <4 x float> %res4, %res5
5992 ret <4 x float> %res6
5995 define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
5996 ; X64-LABEL: fmadd_ss_mask_memfold:
5998 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
5999 ; X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6000 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6001 ; X64-NEXT: kmovw %edx, %k1
6002 ; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
6003 ; X64-NEXT: vmovss %xmm0, (%rdi)
6006 ; X86-LABEL: fmadd_ss_mask_memfold:
6008 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6009 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6010 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6011 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6012 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6013 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6014 ; X86-NEXT: kmovw %eax, %k1
6015 ; X86-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
6016 ; X86-NEXT: vmovss %xmm0, (%edx)
6018 %a.val = load float, float* %a
6019 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
6020 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
6021 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
6022 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
6024 %b.val = load float, float* %b
6025 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
6026 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
6027 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
6028 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
6029 %1 = extractelement <4 x float> %av, i64 0
6030 %2 = extractelement <4 x float> %bv, i64 0
6031 %3 = extractelement <4 x float> %av, i64 0
6032 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6033 %5 = bitcast i8 %c to <8 x i1>
6034 %6 = extractelement <8 x i1> %5, i64 0
6035 %7 = select i1 %6, float %4, float %1
6036 %8 = insertelement <4 x float> %av, float %7, i64 0
6037 %sr = extractelement <4 x float> %8, i32 0
6038 store float %sr, float* %a
6042 define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
6043 ; X64-LABEL: fmadd_ss_maskz_memfold:
6045 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6046 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6047 ; X64-NEXT: kmovw %edx, %k1
6048 ; X64-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
6049 ; X64-NEXT: vmovss %xmm0, (%rdi)
6052 ; X86-LABEL: fmadd_ss_maskz_memfold:
6054 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6055 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6056 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6057 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6058 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6059 ; X86-NEXT: kmovw %eax, %k1
6060 ; X86-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
6061 ; X86-NEXT: vmovss %xmm0, (%edx)
6063 %a.val = load float, float* %a
6064 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
6065 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
6066 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
6067 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
6069 %b.val = load float, float* %b
6070 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
6071 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
6072 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
6073 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
6074 %1 = extractelement <4 x float> %av, i64 0
6075 %2 = extractelement <4 x float> %bv, i64 0
6076 %3 = extractelement <4 x float> %av, i64 0
6077 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6078 %5 = bitcast i8 %c to <8 x i1>
6079 %6 = extractelement <8 x i1> %5, i64 0
6080 %7 = select i1 %6, float %4, float 0.000000e+00
6081 %8 = insertelement <4 x float> %av, float %7, i64 0
6082 %sr = extractelement <4 x float> %8, i32 0
6083 store float %sr, float* %a
6087 define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
6088 ; X64-LABEL: fmadd_sd_mask_memfold:
6090 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6091 ; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
6092 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6093 ; X64-NEXT: kmovw %edx, %k1
6094 ; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
6095 ; X64-NEXT: vmovsd %xmm0, (%rdi)
6098 ; X86-LABEL: fmadd_sd_mask_memfold:
6100 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6101 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6102 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6103 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6104 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
6105 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6106 ; X86-NEXT: kmovw %eax, %k1
6107 ; X86-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
6108 ; X86-NEXT: vmovsd %xmm0, (%edx)
6110 %a.val = load double, double* %a
6111 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
6112 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
6114 %b.val = load double, double* %b
6115 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
6116 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
6117 %1 = extractelement <2 x double> %av, i64 0
6118 %2 = extractelement <2 x double> %bv, i64 0
6119 %3 = extractelement <2 x double> %av, i64 0
6120 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
6121 %5 = bitcast i8 %c to <8 x i1>
6122 %6 = extractelement <8 x i1> %5, i64 0
6123 %7 = select i1 %6, double %4, double %1
6124 %8 = insertelement <2 x double> %av, double %7, i64 0
6125 %sr = extractelement <2 x double> %8, i32 0
6126 store double %sr, double* %a
6130 define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
6131 ; X64-LABEL: fmadd_sd_maskz_memfold:
6133 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6134 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6135 ; X64-NEXT: kmovw %edx, %k1
6136 ; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
6137 ; X64-NEXT: vmovsd %xmm0, (%rdi)
6140 ; X86-LABEL: fmadd_sd_maskz_memfold:
6142 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6143 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6144 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6145 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6146 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6147 ; X86-NEXT: kmovw %eax, %k1
6148 ; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
6149 ; X86-NEXT: vmovsd %xmm0, (%edx)
6151 %a.val = load double, double* %a
6152 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
6153 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
6155 %b.val = load double, double* %b
6156 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
6157 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
6158 %1 = extractelement <2 x double> %av, i64 0
6159 %2 = extractelement <2 x double> %bv, i64 0
6160 %3 = extractelement <2 x double> %av, i64 0
6161 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
6162 %5 = bitcast i8 %c to <8 x i1>
6163 %6 = extractelement <8 x i1> %5, i64 0
6164 %7 = select i1 %6, double %4, double 0.000000e+00
6165 %8 = insertelement <2 x double> %av, double %7, i64 0
6166 %sr = extractelement <2 x double> %8, i32 0
6167 store double %sr, double* %a
6171 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6172 ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
6174 ; X64-NEXT: vmovapd %xmm2, %xmm3
6175 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6176 ; X64-NEXT: kmovw %edi, %k1
6177 ; X64-NEXT: vmovapd %xmm2, %xmm4
6178 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6179 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6180 ; X64-NEXT: vmovapd %xmm2, %xmm4
6181 ; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6182 ; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6183 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6184 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6187 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
6189 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6190 ; X86-NEXT: vmovapd %xmm2, %xmm3
6191 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6192 ; X86-NEXT: kmovw %eax, %k1
6193 ; X86-NEXT: vmovapd %xmm2, %xmm4
6194 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6195 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6196 ; X86-NEXT: vmovapd %xmm2, %xmm4
6197 ; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6198 ; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6199 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6200 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6202 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6203 %2 = extractelement <2 x double> %x0, i64 0
6204 %3 = extractelement <2 x double> %x1, i64 0
6205 %4 = extractelement <2 x double> %1, i64 0
6206 %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
6207 %6 = extractelement <2 x double> %x2, i64 0
6208 %7 = insertelement <2 x double> %x2, double %5, i64 0
6209 %8 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6210 %9 = extractelement <2 x double> %x0, i64 0
6211 %10 = extractelement <2 x double> %x1, i64 0
6212 %11 = extractelement <2 x double> %8, i64 0
6213 %12 = call double @llvm.fma.f64(double %9, double %10, double %11)
6214 %13 = extractelement <2 x double> %x2, i64 0
6215 %14 = bitcast i8 %x3 to <8 x i1>
6216 %15 = extractelement <8 x i1> %14, i64 0
6217 %16 = select i1 %15, double %12, double %13
6218 %17 = insertelement <2 x double> %x2, double %16, i64 0
6219 %18 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6220 %19 = extractelement <2 x double> %x0, i64 0
6221 %20 = extractelement <2 x double> %x1, i64 0
6222 %21 = extractelement <2 x double> %18, i64 0
6223 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
6224 %23 = extractelement <2 x double> %x2, i64 0
6225 %24 = insertelement <2 x double> %x2, double %22, i64 0
6226 %25 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6227 %26 = extractelement <2 x double> %x0, i64 0
6228 %27 = extractelement <2 x double> %x1, i64 0
6229 %28 = extractelement <2 x double> %25, i64 0
6230 %29 = call double @llvm.x86.avx512.vfmadd.f64(double %26, double %27, double %28, i32 11)
6231 %30 = extractelement <2 x double> %x2, i64 0
6232 %31 = bitcast i8 %x3 to <8 x i1>
6233 %32 = extractelement <8 x i1> %31, i64 0
6234 %33 = select i1 %32, double %29, double %30
6235 %34 = insertelement <2 x double> %x2, double %33, i64 0
6236 %res4 = fadd <2 x double> %7, %17
6237 %res5 = fadd <2 x double> %24, %34
6238 %res6 = fadd <2 x double> %res4, %res5
6239 ret <2 x double> %res6
6242 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6243 ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
6245 ; X64-NEXT: vmovaps %xmm2, %xmm3
6246 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6247 ; X64-NEXT: kmovw %edi, %k1
6248 ; X64-NEXT: vmovaps %xmm2, %xmm4
6249 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6250 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
6251 ; X64-NEXT: vmovaps %xmm2, %xmm4
6252 ; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6253 ; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6254 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
6255 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
6258 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
6260 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6261 ; X86-NEXT: vmovaps %xmm2, %xmm3
6262 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6263 ; X86-NEXT: kmovw %eax, %k1
6264 ; X86-NEXT: vmovaps %xmm2, %xmm4
6265 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6266 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
6267 ; X86-NEXT: vmovaps %xmm2, %xmm4
6268 ; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6269 ; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6270 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
6271 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
6273 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6274 %2 = extractelement <4 x float> %x0, i64 0
6275 %3 = extractelement <4 x float> %x1, i64 0
6276 %4 = extractelement <4 x float> %1, i64 0
6277 %5 = call float @llvm.fma.f32(float %2, float %3, float %4)
6278 %6 = extractelement <4 x float> %x2, i64 0
6279 %7 = insertelement <4 x float> %x2, float %5, i64 0
6280 %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6281 %9 = extractelement <4 x float> %x0, i64 0
6282 %10 = extractelement <4 x float> %x1, i64 0
6283 %11 = extractelement <4 x float> %8, i64 0
6284 %12 = call float @llvm.fma.f32(float %9, float %10, float %11)
6285 %13 = extractelement <4 x float> %x2, i64 0
6286 %14 = bitcast i8 %x3 to <8 x i1>
6287 %15 = extractelement <8 x i1> %14, i64 0
6288 %16 = select i1 %15, float %12, float %13
6289 %17 = insertelement <4 x float> %x2, float %16, i64 0
6290 %18 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6291 %19 = extractelement <4 x float> %x0, i64 0
6292 %20 = extractelement <4 x float> %x1, i64 0
6293 %21 = extractelement <4 x float> %18, i64 0
6294 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
6295 %23 = extractelement <4 x float> %x2, i64 0
6296 %24 = insertelement <4 x float> %x2, float %22, i64 0
6297 %25 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6298 %26 = extractelement <4 x float> %x0, i64 0
6299 %27 = extractelement <4 x float> %x1, i64 0
6300 %28 = extractelement <4 x float> %25, i64 0
6301 %29 = call float @llvm.x86.avx512.vfmadd.f32(float %26, float %27, float %28, i32 11)
6302 %30 = extractelement <4 x float> %x2, i64 0
6303 %31 = bitcast i8 %x3 to <8 x i1>
6304 %32 = extractelement <8 x i1> %31, i64 0
6305 %33 = select i1 %32, float %29, float %30
6306 %34 = insertelement <4 x float> %x2, float %33, i64 0
6307 %res4 = fadd <4 x float> %7, %17
6308 %res5 = fadd <4 x float> %24, %34
6309 %res6 = fadd <4 x float> %res4, %res5
6310 ret <4 x float> %res6
6313 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6314 ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
6316 ; X64-NEXT: vmovapd %xmm2, %xmm3
6317 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6318 ; X64-NEXT: kmovw %edi, %k1
6319 ; X64-NEXT: vmovapd %xmm2, %xmm4
6320 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6321 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6322 ; X64-NEXT: vmovapd %xmm2, %xmm4
6323 ; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6324 ; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6325 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6326 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6329 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
6331 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6332 ; X86-NEXT: vmovapd %xmm2, %xmm3
6333 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6334 ; X86-NEXT: kmovw %eax, %k1
6335 ; X86-NEXT: vmovapd %xmm2, %xmm4
6336 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6337 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6338 ; X86-NEXT: vmovapd %xmm2, %xmm4
6339 ; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6340 ; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6341 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6342 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6344 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6345 %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6346 %3 = extractelement <2 x double> %1, i64 0
6347 %4 = extractelement <2 x double> %x1, i64 0
6348 %5 = extractelement <2 x double> %2, i64 0
6349 %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
6350 %7 = extractelement <2 x double> %x2, i64 0
6351 %8 = insertelement <2 x double> %x2, double %6, i64 0
6352 %9 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6353 %10 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6354 %11 = extractelement <2 x double> %9, i64 0
6355 %12 = extractelement <2 x double> %x1, i64 0
6356 %13 = extractelement <2 x double> %10, i64 0
6357 %14 = call double @llvm.fma.f64(double %11, double %12, double %13)
6358 %15 = extractelement <2 x double> %x2, i64 0
6359 %16 = bitcast i8 %x3 to <8 x i1>
6360 %17 = extractelement <8 x i1> %16, i64 0
6361 %18 = select i1 %17, double %14, double %15
6362 %19 = insertelement <2 x double> %x2, double %18, i64 0
6363 %20 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6364 %21 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6365 %22 = extractelement <2 x double> %20, i64 0
6366 %23 = extractelement <2 x double> %x1, i64 0
6367 %24 = extractelement <2 x double> %21, i64 0
6368 %25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 11)
6369 %26 = extractelement <2 x double> %x2, i64 0
6370 %27 = insertelement <2 x double> %x2, double %25, i64 0
6371 %28 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6372 %29 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6373 %30 = extractelement <2 x double> %28, i64 0
6374 %31 = extractelement <2 x double> %x1, i64 0
6375 %32 = extractelement <2 x double> %29, i64 0
6376 %33 = call double @llvm.x86.avx512.vfmadd.f64(double %30, double %31, double %32, i32 11)
6377 %34 = extractelement <2 x double> %x2, i64 0
6378 %35 = bitcast i8 %x3 to <8 x i1>
6379 %36 = extractelement <8 x i1> %35, i64 0
6380 %37 = select i1 %36, double %33, double %34
6381 %38 = insertelement <2 x double> %x2, double %37, i64 0
6382 %res4 = fadd <2 x double> %8, %19
6383 %res5 = fadd <2 x double> %27, %38
6384 %res6 = fadd <2 x double> %res4, %res5
6385 ret <2 x double> %res6
6388 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6389 ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
6391 ; X64-NEXT: vmovaps %xmm2, %xmm3
6392 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6393 ; X64-NEXT: kmovw %edi, %k1
6394 ; X64-NEXT: vmovaps %xmm2, %xmm4
6395 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6396 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
6397 ; X64-NEXT: vmovaps %xmm2, %xmm4
6398 ; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6399 ; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6400 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
6401 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
6404 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
6406 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6407 ; X86-NEXT: vmovaps %xmm2, %xmm3
6408 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6409 ; X86-NEXT: kmovw %eax, %k1
6410 ; X86-NEXT: vmovaps %xmm2, %xmm4
6411 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6412 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
6413 ; X86-NEXT: vmovaps %xmm2, %xmm4
6414 ; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6415 ; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6416 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
6417 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
6419 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6420 %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6421 %3 = extractelement <4 x float> %1, i64 0
6422 %4 = extractelement <4 x float> %x1, i64 0
6423 %5 = extractelement <4 x float> %2, i64 0
6424 %6 = call float @llvm.fma.f32(float %3, float %4, float %5)
6425 %7 = extractelement <4 x float> %x2, i64 0
6426 %8 = insertelement <4 x float> %x2, float %6, i64 0
6427 %9 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6428 %10 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6429 %11 = extractelement <4 x float> %9, i64 0
6430 %12 = extractelement <4 x float> %x1, i64 0
6431 %13 = extractelement <4 x float> %10, i64 0
6432 %14 = call float @llvm.fma.f32(float %11, float %12, float %13)
6433 %15 = extractelement <4 x float> %x2, i64 0
6434 %16 = bitcast i8 %x3 to <8 x i1>
6435 %17 = extractelement <8 x i1> %16, i64 0
6436 %18 = select i1 %17, float %14, float %15
6437 %19 = insertelement <4 x float> %x2, float %18, i64 0
6438 %20 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6439 %21 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6440 %22 = extractelement <4 x float> %20, i64 0
6441 %23 = extractelement <4 x float> %x1, i64 0
6442 %24 = extractelement <4 x float> %21, i64 0
6443 %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 11)
6444 %26 = extractelement <4 x float> %x2, i64 0
6445 %27 = insertelement <4 x float> %x2, float %25, i64 0
6446 %28 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6447 %29 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6448 %30 = extractelement <4 x float> %28, i64 0
6449 %31 = extractelement <4 x float> %x1, i64 0
6450 %32 = extractelement <4 x float> %29, i64 0
6451 %33 = call float @llvm.x86.avx512.vfmadd.f32(float %30, float %31, float %32, i32 11)
6452 %34 = extractelement <4 x float> %x2, i64 0
6453 %35 = bitcast i8 %x3 to <8 x i1>
6454 %36 = extractelement <8 x i1> %35, i64 0
6455 %37 = select i1 %36, float %33, float %34
6456 %38 = insertelement <4 x float> %x2, float %37, i64 0
6457 %res4 = fadd <4 x float> %8, %19
6458 %res5 = fadd <4 x float> %27, %38
6459 %res6 = fadd <4 x float> %res4, %res5
6460 ret <4 x float> %res6
6463 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
6464 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
6466 ; X64-NEXT: kmovw %esi, %k1
6467 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
6468 ; X64-NEXT: vmovaps %xmm1, %xmm0
6471 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
6473 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6474 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6475 ; X86-NEXT: kmovw %ecx, %k1
6476 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
6477 ; X86-NEXT: vmovaps %xmm1, %xmm0
6479 %q = load float, float* %ptr_b
6480 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6481 %1 = extractelement <4 x float> %x0, i64 0
6482 %2 = extractelement <4 x float> %vecinit.i, i64 0
6483 %3 = extractelement <4 x float> %x1, i64 0
6484 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6485 %5 = bitcast i8 %x3 to <8 x i1>
6486 %6 = extractelement <8 x i1> %5, i64 0
6487 %7 = select i1 %6, float %4, float %3
6488 %8 = insertelement <4 x float> %x1, float %7, i64 0
6492 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6493 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
6495 ; X64-NEXT: kmovw %esi, %k1
6496 ; X64-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
6499 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
6501 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6502 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6503 ; X86-NEXT: kmovw %ecx, %k1
6504 ; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
6506 %q = load float, float* %ptr_b
6507 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6508 %1 = extractelement <4 x float> %x0, i64 0
6509 %2 = extractelement <4 x float> %vecinit.i, i64 0
6510 %3 = extractelement <4 x float> %x1, i64 0
6511 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6512 %5 = bitcast i8 %x3 to <8 x i1>
6513 %6 = extractelement <8 x i1> %5, i64 0
6514 %7 = select i1 %6, float %4, float %1
6515 %8 = insertelement <4 x float> %x0, float %7, i64 0
6520 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6521 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
6523 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
6524 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6525 ; CHECK-NEXT: ret{{[l|q]}}
6526 %q = load float, float* %ptr_b
6527 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6528 %1 = extractelement <4 x float> %x0, i64 0
6529 %2 = extractelement <4 x float> %x1, i64 0
6530 %3 = extractelement <4 x float> %vecinit.i, i64 0
6531 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6532 %5 = select i1 false, float %4, float 0.000000e+00
6533 %6 = insertelement <4 x float> %x0, float %5, i64 0
6537 define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6538 ; CHECK-LABEL: test_x86_avx512_psll_d_512:
6540 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
6541 ; CHECK-NEXT: ret{{[l|q]}}
6542 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6545 define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6546 ; X64-LABEL: test_x86_avx512_mask_psll_d_512:
6548 ; X64-NEXT: kmovw %edi, %k1
6549 ; X64-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
6550 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6553 ; X86-LABEL: test_x86_avx512_mask_psll_d_512:
6555 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6556 ; X86-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
6557 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6559 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6560 %mask.cast = bitcast i16 %mask to <16 x i1>
6561 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6562 ret <16 x i32> %res2
6564 define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6565 ; X64-LABEL: test_x86_avx512_maskz_psll_d_512:
6567 ; X64-NEXT: kmovw %edi, %k1
6568 ; X64-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
6571 ; X86-LABEL: test_x86_avx512_maskz_psll_d_512:
6573 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6574 ; X86-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
6576 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6577 %mask.cast = bitcast i16 %mask to <16 x i1>
6578 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6579 ret <16 x i32> %res2
6581 declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6584 define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6585 ; CHECK-LABEL: test_x86_avx512_psll_q_512:
6587 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
6588 ; CHECK-NEXT: ret{{[l|q]}}
6589 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6592 define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6593 ; X64-LABEL: test_x86_avx512_mask_psll_q_512:
6595 ; X64-NEXT: kmovw %edi, %k1
6596 ; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
6597 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6600 ; X86-LABEL: test_x86_avx512_mask_psll_q_512:
6602 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6603 ; X86-NEXT: kmovw %eax, %k1
6604 ; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
6605 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6607 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6608 %mask.cast = bitcast i8 %mask to <8 x i1>
6609 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6612 define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
6613 ; X64-LABEL: test_x86_avx512_maskz_psll_q_512:
6615 ; X64-NEXT: kmovw %edi, %k1
6616 ; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
6619 ; X86-LABEL: test_x86_avx512_maskz_psll_q_512:
6621 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6622 ; X86-NEXT: kmovw %eax, %k1
6623 ; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
6625 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6626 %mask.cast = bitcast i8 %mask to <8 x i1>
6627 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6630 declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6633 define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
6634 ; CHECK-LABEL: test_x86_avx512_pslli_d_512:
6636 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
6637 ; CHECK-NEXT: ret{{[l|q]}}
6638 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6641 define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
6642 ; X64-LABEL: test_x86_avx512_mask_pslli_d_512:
6644 ; X64-NEXT: kmovw %edi, %k1
6645 ; X64-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
6646 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6649 ; X86-LABEL: test_x86_avx512_mask_pslli_d_512:
6651 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6652 ; X86-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
6653 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6655 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6656 %mask.cast = bitcast i16 %mask to <16 x i1>
6657 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6658 ret <16 x i32> %res2
6660 define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) {
6661 ; X64-LABEL: test_x86_avx512_maskz_pslli_d_512:
6663 ; X64-NEXT: kmovw %edi, %k1
6664 ; X64-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
6667 ; X86-LABEL: test_x86_avx512_maskz_pslli_d_512:
6669 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6670 ; X86-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
6672 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6673 %mask.cast = bitcast i16 %mask to <16 x i1>
6674 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6675 ret <16 x i32> %res2
6677 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6680 define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
6681 ; CHECK-LABEL: test_x86_avx512_pslli_q_512:
6683 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
6684 ; CHECK-NEXT: ret{{[l|q]}}
6685 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6688 define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6689 ; X64-LABEL: test_x86_avx512_mask_pslli_q_512:
6691 ; X64-NEXT: kmovw %edi, %k1
6692 ; X64-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
6693 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6696 ; X86-LABEL: test_x86_avx512_mask_pslli_q_512:
6698 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6699 ; X86-NEXT: kmovw %eax, %k1
6700 ; X86-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
6701 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6703 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6704 %mask.cast = bitcast i8 %mask to <8 x i1>
6705 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6708 define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6709 ; X64-LABEL: test_x86_avx512_maskz_pslli_q_512:
6711 ; X64-NEXT: kmovw %edi, %k1
6712 ; X64-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
6715 ; X86-LABEL: test_x86_avx512_maskz_pslli_q_512:
6717 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6718 ; X86-NEXT: kmovw %eax, %k1
6719 ; X86-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
6721 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6722 %mask.cast = bitcast i8 %mask to <8 x i1>
6723 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6726 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6729 define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6730 ; CHECK-LABEL: test_x86_avx512_psra_q_512:
6732 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
6733 ; CHECK-NEXT: ret{{[l|q]}}
6734 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6737 define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6738 ; X64-LABEL: test_x86_avx512_mask_psra_q_512:
6740 ; X64-NEXT: kmovw %edi, %k1
6741 ; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
6742 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6745 ; X86-LABEL: test_x86_avx512_mask_psra_q_512:
6747 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6748 ; X86-NEXT: kmovw %eax, %k1
6749 ; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
6750 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6752 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6753 %mask.cast = bitcast i8 %mask to <8 x i1>
6754 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6757 define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
6758 ; X64-LABEL: test_x86_avx512_maskz_psra_q_512:
6760 ; X64-NEXT: kmovw %edi, %k1
6761 ; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
6764 ; X86-LABEL: test_x86_avx512_maskz_psra_q_512:
6766 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6767 ; X86-NEXT: kmovw %eax, %k1
6768 ; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
6770 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6771 %mask.cast = bitcast i8 %mask to <8 x i1>
6772 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6775 declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6778 define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6779 ; CHECK-LABEL: test_x86_avx512_psra_d_512:
6781 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
6782 ; CHECK-NEXT: ret{{[l|q]}}
6783 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6786 define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6787 ; X64-LABEL: test_x86_avx512_mask_psra_d_512:
6789 ; X64-NEXT: kmovw %edi, %k1
6790 ; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
6791 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6794 ; X86-LABEL: test_x86_avx512_mask_psra_d_512:
6796 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6797 ; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
6798 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6800 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6801 %mask.cast = bitcast i16 %mask to <16 x i1>
6802 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6803 ret <16 x i32> %res2
6805 define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6806 ; X64-LABEL: test_x86_avx512_maskz_psra_d_512:
6808 ; X64-NEXT: kmovw %edi, %k1
6809 ; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
6812 ; X86-LABEL: test_x86_avx512_maskz_psra_d_512:
6814 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6815 ; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
6817 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6818 %mask.cast = bitcast i16 %mask to <16 x i1>
6819 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6820 ret <16 x i32> %res2
6822 declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6826 define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
6827 ; CHECK-LABEL: test_x86_avx512_psrai_q_512:
6829 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
6830 ; CHECK-NEXT: ret{{[l|q]}}
6831 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6834 define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6835 ; X64-LABEL: test_x86_avx512_mask_psrai_q_512:
6837 ; X64-NEXT: kmovw %edi, %k1
6838 ; X64-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
6839 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6842 ; X86-LABEL: test_x86_avx512_mask_psrai_q_512:
6844 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6845 ; X86-NEXT: kmovw %eax, %k1
6846 ; X86-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
6847 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6849 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6850 %mask.cast = bitcast i8 %mask to <8 x i1>
6851 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6854 define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) {
6855 ; X64-LABEL: test_x86_avx512_maskz_psrai_q_512:
6857 ; X64-NEXT: kmovw %edi, %k1
6858 ; X64-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
6861 ; X86-LABEL: test_x86_avx512_maskz_psrai_q_512:
6863 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6864 ; X86-NEXT: kmovw %eax, %k1
6865 ; X86-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
6867 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6868 %mask.cast = bitcast i8 %mask to <8 x i1>
6869 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6872 declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6875 define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
6876 ; CHECK-LABEL: test_x86_avx512_psrai_d_512:
6878 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
6879 ; CHECK-NEXT: ret{{[l|q]}}
6880 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6883 define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
6884 ; X64-LABEL: test_x86_avx512_mask_psrai_d_512:
6886 ; X64-NEXT: kmovw %edi, %k1
6887 ; X64-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
6888 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6891 ; X86-LABEL: test_x86_avx512_mask_psrai_d_512:
6893 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6894 ; X86-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
6895 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6897 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6898 %mask.cast = bitcast i16 %mask to <16 x i1>
6899 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6900 ret <16 x i32> %res2
6902 define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) {
6903 ; X64-LABEL: test_x86_avx512_maskz_psrai_d_512:
6905 ; X64-NEXT: kmovw %edi, %k1
6906 ; X64-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
6909 ; X86-LABEL: test_x86_avx512_maskz_psrai_d_512:
6911 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6912 ; X86-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
6914 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6915 %mask.cast = bitcast i16 %mask to <16 x i1>
6916 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6917 ret <16 x i32> %res2
6919 declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6923 define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6924 ; CHECK-LABEL: test_x86_avx512_psrl_d_512:
6926 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
6927 ; CHECK-NEXT: ret{{[l|q]}}
6928 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6931 define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6932 ; X64-LABEL: test_x86_avx512_mask_psrl_d_512:
6934 ; X64-NEXT: kmovw %edi, %k1
6935 ; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
6936 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6939 ; X86-LABEL: test_x86_avx512_mask_psrl_d_512:
6941 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6942 ; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
6943 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6945 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6946 %mask.cast = bitcast i16 %mask to <16 x i1>
6947 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6948 ret <16 x i32> %res2
6950 define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6951 ; X64-LABEL: test_x86_avx512_maskz_psrl_d_512:
6953 ; X64-NEXT: kmovw %edi, %k1
6954 ; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
6957 ; X86-LABEL: test_x86_avx512_maskz_psrl_d_512:
6959 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6960 ; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
6962 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6963 %mask.cast = bitcast i16 %mask to <16 x i1>
6964 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6965 ret <16 x i32> %res2
6967 declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6970 define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6971 ; CHECK-LABEL: test_x86_avx512_psrl_q_512:
6973 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
6974 ; CHECK-NEXT: ret{{[l|q]}}
6975 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6978 define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6979 ; X64-LABEL: test_x86_avx512_mask_psrl_q_512:
6981 ; X64-NEXT: kmovw %edi, %k1
6982 ; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
6983 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6986 ; X86-LABEL: test_x86_avx512_mask_psrl_q_512:
6988 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6989 ; X86-NEXT: kmovw %eax, %k1
6990 ; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
6991 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6993 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6994 %mask.cast = bitcast i8 %mask to <8 x i1>
6995 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6998 define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
6999 ; X64-LABEL: test_x86_avx512_maskz_psrl_q_512:
7001 ; X64-NEXT: kmovw %edi, %k1
7002 ; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
7005 ; X86-LABEL: test_x86_avx512_maskz_psrl_q_512:
7007 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7008 ; X86-NEXT: kmovw %eax, %k1
7009 ; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
7011 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
7012 %mask.cast = bitcast i8 %mask to <8 x i1>
7013 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7016 declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
7019 define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
7020 ; CHECK-LABEL: test_x86_avx512_psrli_d_512:
7022 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
7023 ; CHECK-NEXT: ret{{[l|q]}}
7024 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7027 define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
7028 ; X64-LABEL: test_x86_avx512_mask_psrli_d_512:
7030 ; X64-NEXT: kmovw %edi, %k1
7031 ; X64-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
7032 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
7035 ; X86-LABEL: test_x86_avx512_mask_psrli_d_512:
7037 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7038 ; X86-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
7039 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
7041 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7042 %mask.cast = bitcast i16 %mask to <16 x i1>
7043 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
7044 ret <16 x i32> %res2
7046 define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) {
7047 ; X64-LABEL: test_x86_avx512_maskz_psrli_d_512:
7049 ; X64-NEXT: kmovw %edi, %k1
7050 ; X64-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
7053 ; X86-LABEL: test_x86_avx512_maskz_psrli_d_512:
7055 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7056 ; X86-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
7058 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7059 %mask.cast = bitcast i16 %mask to <16 x i1>
7060 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7061 ret <16 x i32> %res2
7063 declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
7066 define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
7067 ; CHECK-LABEL: test_x86_avx512_psrli_q_512:
7069 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
7070 ; CHECK-NEXT: ret{{[l|q]}}
7071 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7074 define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
7075 ; X64-LABEL: test_x86_avx512_mask_psrli_q_512:
7077 ; X64-NEXT: kmovw %edi, %k1
7078 ; X64-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
7079 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
7082 ; X86-LABEL: test_x86_avx512_mask_psrli_q_512:
7084 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7085 ; X86-NEXT: kmovw %eax, %k1
7086 ; X86-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
7087 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
7089 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7090 %mask.cast = bitcast i8 %mask to <8 x i1>
7091 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
7094 define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) {
7095 ; X64-LABEL: test_x86_avx512_maskz_psrli_q_512:
7097 ; X64-NEXT: kmovw %edi, %k1
7098 ; X64-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
7101 ; X86-LABEL: test_x86_avx512_maskz_psrli_q_512:
7103 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7104 ; X86-NEXT: kmovw %eax, %k1
7105 ; X86-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
7107 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7108 %mask.cast = bitcast i8 %mask to <8 x i1>
7109 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7112 declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
7114 define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
7115 ; CHECK-LABEL: test_x86_avx512_psllv_d_512:
7117 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
7118 ; CHECK-NEXT: ret{{[l|q]}}
7119 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7123 define <16 x i32> @test_x86_avx512_psllv_d_512_const() {
7124 ; X64-LABEL: test_x86_avx512_psllv_d_512_const:
7126 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7127 ; X64-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
7128 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7129 ; X64-NEXT: vpsllvd {{.*}}(%rip), %zmm1, %zmm1
7130 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7133 ; X86-LABEL: test_x86_avx512_psllv_d_512_const:
7135 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7136 ; X86-NEXT: vpsllvd {{\.LCPI.*}}, %zmm0, %zmm0
7137 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7138 ; X86-NEXT: vpsllvd {{\.LCPI.*}}, %zmm1, %zmm1
7139 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7141 %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
7142 %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
7143 %res2 = add <16 x i32> %res0, %res1
7144 ret <16 x i32> %res2
7147 define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
7148 ; X64-LABEL: test_x86_avx512_mask_psllv_d_512:
7150 ; X64-NEXT: kmovw %edi, %k1
7151 ; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
7152 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7155 ; X86-LABEL: test_x86_avx512_mask_psllv_d_512:
7157 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7158 ; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
7159 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7161 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7162 %mask.cast = bitcast i16 %mask to <16 x i1>
7163 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
7164 ret <16 x i32> %res2
7167 define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7168 ; X64-LABEL: test_x86_avx512_maskz_psllv_d_512:
7170 ; X64-NEXT: kmovw %edi, %k1
7171 ; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7174 ; X86-LABEL: test_x86_avx512_maskz_psllv_d_512:
7176 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7177 ; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7179 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7180 %mask.cast = bitcast i16 %mask to <16 x i1>
7181 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7182 ret <16 x i32> %res2
7185 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
7187 define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
7188 ; CHECK-LABEL: test_x86_avx512_psllv_q_512:
7190 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
7191 ; CHECK-NEXT: ret{{[l|q]}}
7192 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7196 define <8 x i64> @test_x86_avx512_psllv_q_512_const() {
7197 ; X64-LABEL: test_x86_avx512_psllv_q_512_const:
7199 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
7200 ; X64-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
7201 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
7202 ; X64-NEXT: vpsllvq {{.*}}(%rip), %zmm1, %zmm1
7203 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7206 ; X86-LABEL: test_x86_avx512_psllv_q_512_const:
7208 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0]
7209 ; X86-NEXT: vpsllvq {{\.LCPI.*}}, %zmm0, %zmm0
7210 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295]
7211 ; X86-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
7212 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7214 %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1,i64 2, i64 0, i64 34, i64 -2>)
7215 %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
7216 %res2 = add <8 x i64> %res0, %res1
7220 define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
7221 ; X64-LABEL: test_x86_avx512_mask_psllv_q_512:
7223 ; X64-NEXT: kmovw %edi, %k1
7224 ; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
7225 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7228 ; X86-LABEL: test_x86_avx512_mask_psllv_q_512:
7230 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7231 ; X86-NEXT: kmovw %eax, %k1
7232 ; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
7233 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7235 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7236 %mask.cast = bitcast i8 %mask to <8 x i1>
7237 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
7241 define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7242 ; X64-LABEL: test_x86_avx512_maskz_psllv_q_512:
7244 ; X64-NEXT: kmovw %edi, %k1
7245 ; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7248 ; X86-LABEL: test_x86_avx512_maskz_psllv_q_512:
7250 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7251 ; X86-NEXT: kmovw %eax, %k1
7252 ; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7254 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7255 %mask.cast = bitcast i8 %mask to <8 x i1>
7256 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7260 declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
7262 define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
7263 ; CHECK-LABEL: test_x86_avx512_psrav_d_512:
7265 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
7266 ; CHECK-NEXT: ret{{[l|q]}}
7267 %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
7271 define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
7272 ; X64-LABEL: test_x86_avx512_mask_psrav_d_512:
7274 ; X64-NEXT: kmovw %edi, %k1
7275 ; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
7276 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7279 ; X86-LABEL: test_x86_avx512_mask_psrav_d_512:
7281 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7282 ; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
7283 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7285 %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
7286 %mask.cast = bitcast i16 %mask to <16 x i1>
7287 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
7288 ret <16 x i32> %res2
7291 define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7292 ; X64-LABEL: test_x86_avx512_maskz_psrav_d_512:
7294 ; X64-NEXT: kmovw %edi, %k1
7295 ; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
7298 ; X86-LABEL: test_x86_avx512_maskz_psrav_d_512:
7300 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7301 ; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
7303 %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
7304 %mask.cast = bitcast i16 %mask to <16 x i1>
7305 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7306 ret <16 x i32> %res2
7309 declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
7311 define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
7312 ; CHECK-LABEL: test_x86_avx512_psrav_q_512:
7314 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
7315 ; CHECK-NEXT: ret{{[l|q]}}
7316 %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
7320 define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
7321 ; X64-LABEL: test_x86_avx512_mask_psrav_q_512:
7323 ; X64-NEXT: kmovw %edi, %k1
7324 ; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
7325 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7328 ; X86-LABEL: test_x86_avx512_mask_psrav_q_512:
7330 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7331 ; X86-NEXT: kmovw %eax, %k1
7332 ; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
7333 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7335 %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
7336 %mask.cast = bitcast i8 %mask to <8 x i1>
7337 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
7341 define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7342 ; X64-LABEL: test_x86_avx512_maskz_psrav_q_512:
7344 ; X64-NEXT: kmovw %edi, %k1
7345 ; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
7348 ; X86-LABEL: test_x86_avx512_maskz_psrav_q_512:
7350 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7351 ; X86-NEXT: kmovw %eax, %k1
7352 ; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
7354 %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
7355 %mask.cast = bitcast i8 %mask to <8 x i1>
7356 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7360 declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
7362 define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
7363 ; CHECK-LABEL: test_x86_avx512_psrlv_d_512:
7365 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
7366 ; CHECK-NEXT: ret{{[l|q]}}
7367 %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7371 define <16 x i32> @test_x86_avx512_psrlv_d_512_const() {
7372 ; X64-LABEL: test_x86_avx512_psrlv_d_512_const:
7374 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7375 ; X64-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
7376 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7377 ; X64-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
7378 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7381 ; X86-LABEL: test_x86_avx512_psrlv_d_512_const:
7383 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7384 ; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %zmm0, %zmm0
7385 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7386 ; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %zmm1, %zmm1
7387 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7389 %res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
7390 %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1 >)
7391 %res2 = add <16 x i32> %res0, %res1
7392 ret <16 x i32> %res2
7395 define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
7396 ; X64-LABEL: test_x86_avx512_mask_psrlv_d_512:
7398 ; X64-NEXT: kmovw %edi, %k1
7399 ; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
7400 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7403 ; X86-LABEL: test_x86_avx512_mask_psrlv_d_512:
7405 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7406 ; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
7407 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7409 %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7410 %mask.cast = bitcast i16 %mask to <16 x i1>
7411 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
7412 ret <16 x i32> %res2
7415 define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7416 ; X64-LABEL: test_x86_avx512_maskz_psrlv_d_512:
7418 ; X64-NEXT: kmovw %edi, %k1
7419 ; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7422 ; X86-LABEL: test_x86_avx512_maskz_psrlv_d_512:
7424 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7425 ; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7427 %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7428 %mask.cast = bitcast i16 %mask to <16 x i1>
7429 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7430 ret <16 x i32> %res2
7433 declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
7435 define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
7436 ; CHECK-LABEL: test_x86_avx512_psrlv_q_512:
7438 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
7439 ; CHECK-NEXT: ret{{[l|q]}}
7440 %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7444 define <8 x i64> @test_x86_avx512_psrlv_q_512_const() {
7445 ; X64-LABEL: test_x86_avx512_psrlv_q_512_const:
7447 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
7448 ; X64-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0
7449 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
7450 ; X64-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
7451 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7454 ; X86-LABEL: test_x86_avx512_psrlv_q_512_const:
7456 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0]
7457 ; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %zmm0, %zmm0
7458 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295]
7459 ; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %zmm1, %zmm1
7460 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7462 %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1,i64 2, i64 0, i64 34, i64 -2>)
7463 %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
7464 %res2 = add <8 x i64> %res0, %res1
7468 define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
7469 ; X64-LABEL: test_x86_avx512_mask_psrlv_q_512:
7471 ; X64-NEXT: kmovw %edi, %k1
7472 ; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
7473 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7476 ; X86-LABEL: test_x86_avx512_mask_psrlv_q_512:
7478 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7479 ; X86-NEXT: kmovw %eax, %k1
7480 ; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
7481 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7483 %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7484 %mask.cast = bitcast i8 %mask to <8 x i1>
7485 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
7489 define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7490 ; X64-LABEL: test_x86_avx512_maskz_psrlv_q_512:
7492 ; X64-NEXT: kmovw %edi, %k1
7493 ; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7496 ; X86-LABEL: test_x86_avx512_maskz_psrlv_q_512:
7498 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7499 ; X86-NEXT: kmovw %eax, %k1
7500 ; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7502 %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7503 %mask.cast = bitcast i8 %mask to <8 x i1>
7504 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7508 declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
7510 define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
7511 ; X64-LABEL: bad_mask_transition:
7512 ; X64: # %bb.0: # %entry
7513 ; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
7514 ; X64-NEXT: kmovw %k0, %eax
7515 ; X64-NEXT: vcmplt_oqpd %zmm3, %zmm2, %k0
7516 ; X64-NEXT: kmovw %k0, %ecx
7517 ; X64-NEXT: movzbl %al, %eax
7518 ; X64-NEXT: movzbl %cl, %ecx
7519 ; X64-NEXT: kmovw %eax, %k0
7520 ; X64-NEXT: kmovw %ecx, %k1
7521 ; X64-NEXT: kunpckbw %k0, %k1, %k1
7522 ; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
7525 ; X86-LABEL: bad_mask_transition:
7526 ; X86: # %bb.0: # %entry
7527 ; X86-NEXT: pushl %ebp
7528 ; X86-NEXT: .cfi_def_cfa_offset 8
7529 ; X86-NEXT: .cfi_offset %ebp, -8
7530 ; X86-NEXT: movl %esp, %ebp
7531 ; X86-NEXT: .cfi_def_cfa_register %ebp
7532 ; X86-NEXT: andl $-64, %esp
7533 ; X86-NEXT: subl $64, %esp
7534 ; X86-NEXT: vmovaps 72(%ebp), %zmm3
7535 ; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
7536 ; X86-NEXT: kmovw %k0, %eax
7537 ; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k0
7538 ; X86-NEXT: kmovw %k0, %ecx
7539 ; X86-NEXT: movzbl %al, %eax
7540 ; X86-NEXT: movzbl %cl, %ecx
7541 ; X86-NEXT: kmovw %eax, %k0
7542 ; X86-NEXT: kmovw %ecx, %k1
7543 ; X86-NEXT: kunpckbw %k0, %k1, %k1
7544 ; X86-NEXT: vmovaps 136(%ebp), %zmm3 {%k1}
7545 ; X86-NEXT: vmovaps %zmm3, %zmm0
7546 ; X86-NEXT: movl %ebp, %esp
7547 ; X86-NEXT: popl %ebp
7548 ; X86-NEXT: .cfi_def_cfa %esp, 4
7551 %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
7552 %1 = bitcast <8 x i1> %0 to i8
7553 %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
7554 %3 = bitcast <8 x i1> %2 to i8
7555 %conv = zext i8 %1 to i16
7556 %conv2 = zext i8 %3 to i16
7557 %4 = bitcast i16 %conv to <16 x i1>
7558 %5 = bitcast i16 %conv2 to <16 x i1>
7559 %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7560 %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
7561 %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
7562 %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
7566 define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
7567 ; X64-LABEL: bad_mask_transition_2:
7568 ; X64: # %bb.0: # %entry
7569 ; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
7570 ; X64-NEXT: kmovw %k0, %eax
7571 ; X64-NEXT: movzbl %al, %eax
7572 ; X64-NEXT: kmovw %eax, %k1
7573 ; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
7576 ; X86-LABEL: bad_mask_transition_2:
7577 ; X86: # %bb.0: # %entry
7578 ; X86-NEXT: pushl %ebp
7579 ; X86-NEXT: .cfi_def_cfa_offset 8
7580 ; X86-NEXT: .cfi_offset %ebp, -8
7581 ; X86-NEXT: movl %esp, %ebp
7582 ; X86-NEXT: .cfi_def_cfa_register %ebp
7583 ; X86-NEXT: andl $-64, %esp
7584 ; X86-NEXT: subl $64, %esp
7585 ; X86-NEXT: vmovaps 72(%ebp), %zmm2
7586 ; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
7587 ; X86-NEXT: kmovw %k0, %eax
7588 ; X86-NEXT: movzbl %al, %eax
7589 ; X86-NEXT: kmovw %eax, %k1
7590 ; X86-NEXT: vmovaps 136(%ebp), %zmm2 {%k1}
7591 ; X86-NEXT: vmovaps %zmm2, %zmm0
7592 ; X86-NEXT: movl %ebp, %esp
7593 ; X86-NEXT: popl %ebp
7594 ; X86-NEXT: .cfi_def_cfa %esp, 4
7597 %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
7598 %1 = bitcast <8 x i1> %0 to i8
7599 %conv = zext i8 %1 to i16
7600 %2 = bitcast i16 %conv to <16 x i1>
7601 %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
7605 declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>)
7606 declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>)
7607 declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
7608 declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
7609 declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>)
7610 declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>)
7611 declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
7612 declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)