; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
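; Codegen checks for core avx512f intrinsics: compress/expand, rcp14/rsqrt14, rndscale,
; sqrt, getexp, scalar conversions, compares, and statically-rounded arithmetic, many of
; them in unmasked, merge-masked ({%k1}) and zero-masked ({%k1} {z}) forms.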
define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_compress_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcompresspd %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_compress_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) {
; X64-LABEL: test_maskz_compress_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_compress_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
define <8 x double> @test_compress_pd_512(<8 x double> %data) {
; CHECK-LABEL: test_compress_pd_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_compress_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcompressps %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_compress_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcompressps %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) {
; X64-LABEL: test_maskz_compress_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcompressps %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_compress_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcompressps %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
define <16 x float> @test_compress_ps_512(<16 x float> %data) {
; CHECK-LABEL: test_compress_ps_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_mask_compress_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_compress_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) {
; X64-LABEL: test_maskz_compress_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_compress_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
define <8 x i64> @test_compress_q_512(<8 x i64> %data) {
; CHECK-LABEL: test_compress_q_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_mask_compress_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_compress_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) {
; X64-LABEL: test_maskz_compress_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_compress_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
define <16 x i32> @test_compress_d_512(<16 x i32> %data) {
; CHECK-LABEL: test_compress_d_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
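; Expand tests (vexpandpd/vexpandps/vpexpandq/vpexpandd), mirroring the compress tests above.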
define <8 x double> @test_expand_pd_512(<8 x double> %data) {
; CHECK-LABEL: test_expand_pd_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_expand_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_expand_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vexpandpd %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) {
; X64-LABEL: test_maskz_expand_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_expand_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
define <16 x float> @test_expand_ps_512(<16 x float> %data) {
; CHECK-LABEL: test_expand_ps_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_expand_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_expand_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vexpandps %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) {
; X64-LABEL: test_maskz_expand_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_expand_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
define <8 x i64> @test_expand_q_512(<8 x i64> %data) {
; CHECK-LABEL: test_expand_q_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
; X64-LABEL: test_mask_expand_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_expand_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpexpandq %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) {
; X64-LABEL: test_maskz_expand_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_expand_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i8 %mask to <8 x i1>
%2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
define <16 x i32> @test_expand_d_512(<16 x i32> %data) {
; CHECK-LABEL: test_expand_d_512:
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
; X64-LABEL: test_mask_expand_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-LABEL: test_mask_expand_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpexpandd %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
; X64-LABEL: test_maskz_expand_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_expand_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
; CHECK-NEXT: vrcp14ps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_512:
; CHECK-NEXT: vrcp14pd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
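; Round-scale tests. Immediate 11 requests round-toward-zero with the precision (inexact)
; exception suppressed; the trailing i32 4 selects the current (MXCSR) rounding environment.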
declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test_rndscale_sd:
; CHECK-NEXT: vroundsd $11, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; X64-LABEL: test_rndscale_sd_mask:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovapd %xmm2, %xmm0
; X86-LABEL: test_rndscale_sd_mask:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovapd %xmm2, %xmm0
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, <2 x double>* %bptr, <2 x double> %c, i8 %mask) {
; X64-LABEL: test_rndscale_sd_mask_load:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vrndscalesd $11, (%rdi), %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovapd %xmm1, %xmm0
; X86-LABEL: test_rndscale_sd_mask_load:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vrndscalesd $11, (%eax), %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovapd %xmm1, %xmm0
%b = load <2 x double>, <2 x double>* %bptr
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) {
; X64-LABEL: test_rndscale_sd_maskz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-LABEL: test_rndscale_sd_maskz:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_rndscale_ss:
; CHECK-NEXT: vroundss $11, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
define <4 x float> @test_rndscale_ss_load(<4 x float> %a, <4 x float>* %bptr) {
; X64-LABEL: test_rndscale_ss_load:
; X64-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0
; X86-LABEL: test_rndscale_ss_load:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vroundss $11, (%eax), %xmm0, %xmm0
%b = load <4 x float>, <4 x float>* %bptr
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; X64-LABEL: test_rndscale_ss_mask:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X86-LABEL: test_rndscale_ss_mask:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) {
; X64-LABEL: test_rndscale_ss_maskz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-LABEL: test_rndscale_ss_maskz:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
define <8 x double> @test7(<8 x double> %a) {
; CHECK-LABEL: test7:
; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
define <16 x float> @test8(<16 x float> %a) {
; CHECK-LABEL: test8:
; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_512:
; CHECK-NEXT: vrsqrt14ps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_sqrt_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovapd %zmm1, %zmm0
; X86-LABEL: test_mask_sqrt_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovapd %zmm1, %zmm0
%1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
; X64-LABEL: test_maskz_sqrt_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_sqrt_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_round_pd_512:
; CHECK-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
; X64-LABEL: test_mask_sqrt_round_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovapd %zmm1, %zmm0
; X86-LABEL: test_mask_sqrt_round_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovapd %zmm1, %zmm0
%1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
; X64-LABEL: test_maskz_sqrt_round_pd_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_sqrt_round_pd_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone
define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_ps_512:
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_sqrt_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X86-LABEL: test_mask_sqrt_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vsqrtps %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovaps %zmm1, %zmm0
%1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
; X64-LABEL: test_maskz_sqrt_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_sqrt_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_round_ps_512:
; CHECK-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_mask_sqrt_round_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X86-LABEL: test_mask_sqrt_round_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovaps %zmm1, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
; X64-LABEL: test_maskz_sqrt_round_ps_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_maskz_sqrt_round_ps_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone
define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_512:
; CHECK-NEXT: vgetexppd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
ret <8 x double> %res
define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_round_pd_512:
; CHECK-NEXT: vgetexppd {sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12)
ret <8 x double> %res
declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_512:
; CHECK-NEXT: vgetexpps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
ret <16 x float> %res
define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_round_ps_512:
; CHECK-NEXT: vgetexpps {sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
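; Scalar sqrt: llvm.x86.avx512.mask.sqrt.ss/sd take source operands, a merge/zero passthru,
; a mask, and a rounding operand (4 = current MXCSR mode, 9/10/11 = rd/ru/rz embedded rounding).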
declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; X64-LABEL: test_sqrt_ss:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %xmm2, %xmm3
; X64-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddps %xmm2, %xmm3, %xmm2
; X64-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X64-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X86-LABEL: test_sqrt_ss:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %xmm2, %xmm3
; X86-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddps %xmm2, %xmm3, %xmm2
; X86-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X86-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
%res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
%res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10)
%res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11)
%res.1 = fadd <4 x float> %res0, %res1
%res.2 = fadd <4 x float> %res2, %res3
%res = fadd <4 x float> %res.1, %res.2
declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; X64-LABEL: test_sqrt_sd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %xmm2, %xmm3
; X64-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; X64-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vaddpd %xmm2, %xmm3, %xmm2
; X64-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X64-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; X86-LABEL: test_sqrt_sd:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %xmm2, %xmm3
; X86-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; X86-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vaddpd %xmm2, %xmm3, %xmm2
; X86-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; X86-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
%res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
%res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10)
%res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11)
%res.1 = fadd <2 x double> %res0, %res1
%res.2 = fadd <2 x double> %res2, %res3
%res = fadd <2 x double> %res.1, %res.2
ret <2 x double> %res
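; Scalar float-to-integer conversions. Rounding argument 4 uses the current MXCSR mode,
; 8 adds {sae} on the truncating forms, and 9/11 select {rd-sae}/{rz-sae} embedded rounding.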
define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx
; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
%res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
%res2 = add i32 %res0, %res1
declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si:
; CHECK-NEXT: vcvttsd2si %xmm0, %ecx
; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
%res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
%res2 = add i32 %res0, %res1
declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si:
; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %ecx
; CHECK-NEXT: vcvttss2si %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
%res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
%res2 = add i32 %res0, %res1
declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttss2si_load(<4 x float>* %a0) {
; X64-LABEL: test_x86_avx512_cvttss2si_load:
; X64-NEXT: vcvttss2si (%rdi), %eax
; X86-LABEL: test_x86_avx512_cvttss2si_load:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vcvttss2si (%eax), %eax
%a1 = load <4 x float>, <4 x float>* %a0
%res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ;
define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi:
; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %ecx
; CHECK-NEXT: vcvttss2usi %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
%res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
%res2 = add i32 %res0, %res1
declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
%res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11)
%res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9)
%res3 = add i32 %res, %res1
%res4 = add i32 %res3, %res2
declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK-NEXT: vcvtsd2si %xmm0, %eax
; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
%res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11)
%res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9)
%res3 = add i32 %res, %res1
%res4 = add i32 %res3, %res2
declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK-NEXT: vcvtss2usi %xmm0, %eax
; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
%res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11)
%res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9)
%res3 = add i32 %res, %res1
%res4 = add i32 %res3, %res2
declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK-NEXT: vcvtss2si %xmm0, %eax
; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
%res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11)
%res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9)
%res3 = add i32 %res, %res1
%res4 = add i32 %res3, %res2
declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
ret <16 x float> %res
define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
ret <16 x float> %res
define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
; X64-LABEL: test_x86_vcvtph2ps_512_rrk:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X86-LABEL: test_x86_vcvtph2ps_512_rrk:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
; X86-NEXT: vmovaps %zmm1, %zmm0
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
ret <16 x float> %res
define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
; X64-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
; X86-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
ret <16 x float> %res
define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
; X64-LABEL: test_x86_vcvtph2ps_512_rrkz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; X86-LABEL: test_x86_vcvtph2ps_512_rrkz:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
ret <16 x float> %res
declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X86-LABEL: test_x86_vcvtps2ph_256:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax)
; X86-NEXT: vmovdqa %ymm1, %ymm0
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
%res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
store <16 x i16> %res1, <16 x i16> * %dst
%res = add <16 x i16> %res2, %res3
declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
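; Full-width vector compares that produce a mask register; the result is moved to a GPR with kmovw.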
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
%1 = bitcast <16 x i1> %res to i16
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: test_cmppd:
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
%1 = bitcast <8 x i1> %res to i8
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
; Function Attrs: nounwind readnone
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vmaxpd:
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vminpd:
; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4)
declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
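; Masked scalar store: only bit 0 of the i8 mask can be set, so the llvm.masked.store lowers to a masked vmovss.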
define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
; X64-LABEL: test_mask_store_ss:
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vmovss %xmm0, (%rdi) {%k1}
; X86-LABEL: test_mask_store_ss:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vmovss %xmm0, (%eax) {%k1}
%1 = and i8 %mask, 1
%2 = bitcast i8* %ptr to <4 x float>*
%3 = bitcast i8 %1 to <8 x i1>
%extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %data, <4 x float>* %2, i32 1, <4 x i1> %extract)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) #1
declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
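; Statically-rounded arithmetic. The i32 rounding argument maps 8/9/10/11 to {rn,rd,ru,rz}-sae
; and 4 to the current rounding mode; masked variants fold the select into {%k1} / {%k1} {z}.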
define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rn:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulps_mask_rn:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulps_mask_rd:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_ru:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulps_mask_ru:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_vmulps_mask_rz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulps_mask_rz:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
;; With Passthru value
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rn:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_vmulps_mask_passthru_rn:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_vmulps_mask_passthru_rd:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_ru:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_vmulps_mask_passthru_ru:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; X64-LABEL: test_vmulps_mask_passthru_rz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_vmulps_mask_passthru_rz:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rn:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulpd_mask_rn:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rd:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulpd_mask_rd:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_ru:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulpd_mask_ru:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; X64-LABEL: test_vmulpd_mask_rz:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_vmulpd_mask_rz:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; X64-LABEL: test_mm512_maskz_add_round_ps_current:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-LABEL: test_mm512_maskz_add_round_ps_current:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; X64-LABEL: test_mm512_mask_add_round_ps_current:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X86-LABEL: test_mm512_mask_add_round_ps_current:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1719 define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1720 ; CHECK-LABEL: test_mm512_add_round_ps_current:
1722 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
1723 ; CHECK-NEXT: ret{{[l|q]}}
1724 %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1727 declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
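; The trailing i32 argument of these rounding intrinsics selects the embedded rounding
; mode checked in the assembly: 8 = {rn-sae}, 9 = {rd-sae}, 10 = {ru-sae}, 11 = {rz-sae},
; and 4 = the current (dynamic) rounding mode, which emits the plain instruction form.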
1729 define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1730 ; X64-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
1732 ; X64-NEXT: kmovw %edi, %k1
1733 ; X64-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1734 ; X64-NEXT: vmovaps %zmm2, %zmm0
1737 ; X86-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
1739 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1740 ; X86-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1741 ; X86-NEXT: vmovaps %zmm2, %zmm0
1743 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1744 %2 = bitcast i16 %mask to <16 x i1>
1745 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1749 define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1750 ; X64-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
1752 ; X64-NEXT: kmovw %edi, %k1
1753 ; X64-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1754 ; X64-NEXT: vmovaps %zmm2, %zmm0
1757 ; X86-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
1759 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1760 ; X86-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1761 ; X86-NEXT: vmovaps %zmm2, %zmm0
1763 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1764 %2 = bitcast i16 %mask to <16 x i1>
1765 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1769 define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1770 ; X64-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
1772 ; X64-NEXT: kmovw %edi, %k1
1773 ; X64-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1774 ; X64-NEXT: vmovaps %zmm2, %zmm0
1777 ; X86-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
1779 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1780 ; X86-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1781 ; X86-NEXT: vmovaps %zmm2, %zmm0
1783 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1784 %2 = bitcast i16 %mask to <16 x i1>
1785 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1789 define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1790 ; X64-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
1792 ; X64-NEXT: kmovw %edi, %k1
1793 ; X64-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1794 ; X64-NEXT: vmovaps %zmm2, %zmm0
1797 ; X86-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
1799 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1800 ; X86-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1801 ; X86-NEXT: vmovaps %zmm2, %zmm0
1803 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1804 %2 = bitcast i16 %mask to <16 x i1>
1805 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1809 define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1810 ; X64-LABEL: test_mm512_mask_sub_round_ps_current:
1812 ; X64-NEXT: kmovw %edi, %k1
1813 ; X64-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
1814 ; X64-NEXT: vmovaps %zmm2, %zmm0
1817 ; X86-LABEL: test_mm512_mask_sub_round_ps_current:
1819 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1820 ; X86-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
1821 ; X86-NEXT: vmovaps %zmm2, %zmm0
1823 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1824 %2 = bitcast i16 %mask to <16 x i1>
1825 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1829 define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1830 ; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
1832 ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
1833 ; CHECK-NEXT: ret{{[l|q]}}
1834 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1838 define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1839 ; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
1841 ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
1842 ; CHECK-NEXT: ret{{[l|q]}}
1843 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1847 define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1848 ; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
1850 ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
1851 ; CHECK-NEXT: ret{{[l|q]}}
1852 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1856 define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1857 ; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
1859 ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
1860 ; CHECK-NEXT: ret{{[l|q]}}
1861 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1865 define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1866 ; CHECK-LABEL: test_mm512_sub_round_ps_current:
1868 ; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0
1869 ; CHECK-NEXT: ret{{[l|q]}}
1870 %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1874 define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1875 ; X64-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
1877 ; X64-NEXT: kmovw %edi, %k1
1878 ; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1881 ; X86-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
1883 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1884 ; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1886 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1887 %2 = bitcast i16 %mask to <16 x i1>
1888 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1892 define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1893 ; X64-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
1895 ; X64-NEXT: kmovw %edi, %k1
1896 ; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1899 ; X86-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
1901 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1902 ; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1904 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1905 %2 = bitcast i16 %mask to <16 x i1>
1906 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1910 define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1911 ; X64-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
1913 ; X64-NEXT: kmovw %edi, %k1
1914 ; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1917 ; X86-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
1919 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1920 ; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1922 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
1923 %2 = bitcast i16 %mask to <16 x i1>
1924 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1928 define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1929 ; X64-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
1931 ; X64-NEXT: kmovw %edi, %k1
1932 ; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1935 ; X86-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
1937 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1938 ; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
1940 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
1941 %2 = bitcast i16 %mask to <16 x i1>
1942 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1946 define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
1947 ; X64-LABEL: test_mm512_maskz_div_round_ps_current:
1949 ; X64-NEXT: kmovw %edi, %k1
1950 ; X64-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
1953 ; X86-LABEL: test_mm512_maskz_div_round_ps_current:
1955 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1956 ; X86-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
1958 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
1959 %2 = bitcast i16 %mask to <16 x i1>
1960 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
1964 define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1965 ; X64-LABEL: test_mm512_mask_div_round_ps_rn_sae:
1967 ; X64-NEXT: kmovw %edi, %k1
1968 ; X64-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1969 ; X64-NEXT: vmovaps %zmm2, %zmm0
1972 ; X86-LABEL: test_mm512_mask_div_round_ps_rn_sae:
1974 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1975 ; X86-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1976 ; X86-NEXT: vmovaps %zmm2, %zmm0
1978 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
1979 %2 = bitcast i16 %mask to <16 x i1>
1980 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
1984 define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
1985 ; X64-LABEL: test_mm512_mask_div_round_ps_rd_sae:
1987 ; X64-NEXT: kmovw %edi, %k1
1988 ; X64-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1989 ; X64-NEXT: vmovaps %zmm2, %zmm0
1992 ; X86-LABEL: test_mm512_mask_div_round_ps_rd_sae:
1994 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
1995 ; X86-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
1996 ; X86-NEXT: vmovaps %zmm2, %zmm0
1998 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
1999 %2 = bitcast i16 %mask to <16 x i1>
2000 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2004 define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2005 ; X64-LABEL: test_mm512_mask_div_round_ps_ru_sae:
2007 ; X64-NEXT: kmovw %edi, %k1
2008 ; X64-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2009 ; X64-NEXT: vmovaps %zmm2, %zmm0
2012 ; X86-LABEL: test_mm512_mask_div_round_ps_ru_sae:
2014 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2015 ; X86-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2016 ; X86-NEXT: vmovaps %zmm2, %zmm0
2018 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
2019 %2 = bitcast i16 %mask to <16 x i1>
2020 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2024 define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2025 ; X64-LABEL: test_mm512_mask_div_round_ps_rz_sae:
2027 ; X64-NEXT: kmovw %edi, %k1
2028 ; X64-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2029 ; X64-NEXT: vmovaps %zmm2, %zmm0
2032 ; X86-LABEL: test_mm512_mask_div_round_ps_rz_sae:
2034 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2035 ; X86-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2036 ; X86-NEXT: vmovaps %zmm2, %zmm0
2038 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
2039 %2 = bitcast i16 %mask to <16 x i1>
2040 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2044 define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2045 ; X64-LABEL: test_mm512_mask_div_round_ps_current:
2047 ; X64-NEXT: kmovw %edi, %k1
2048 ; X64-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1}
2049 ; X64-NEXT: vmovaps %zmm2, %zmm0
2052 ; X86-LABEL: test_mm512_mask_div_round_ps_current:
2054 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2055 ; X86-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1}
2056 ; X86-NEXT: vmovaps %zmm2, %zmm0
2058 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2059 %2 = bitcast i16 %mask to <16 x i1>
2060 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2064 define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2065 ; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
2067 ; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
2068 ; CHECK-NEXT: ret{{[l|q]}}
2069 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2073 define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2074 ; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
2076 ; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
2077 ; CHECK-NEXT: ret{{[l|q]}}
2078 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
2082 define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2083 ; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
2085 ; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
2086 ; CHECK-NEXT: ret{{[l|q]}}
2087 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
2091 define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2092 ; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
2094 ; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
2095 ; CHECK-NEXT: ret{{[l|q]}}
2096 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
2100 define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2101 ; CHECK-LABEL: test_mm512_div_round_ps_current:
2103 ; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0
2104 ; CHECK-NEXT: ret{{[l|q]}}
2105 %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2108 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
2110 define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2111 ; X64-LABEL: test_mm512_maskz_min_round_ps_sae:
2113 ; X64-NEXT: kmovw %edi, %k1
2114 ; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2117 ; X86-LABEL: test_mm512_maskz_min_round_ps_sae:
2119 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2120 ; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2122 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2123 %2 = bitcast i16 %mask to <16 x i1>
2124 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2128 define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2129 ; X64-LABEL: test_mm512_maskz_min_round_ps_current:
2131 ; X64-NEXT: kmovw %edi, %k1
2132 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
2135 ; X86-LABEL: test_mm512_maskz_min_round_ps_current:
2137 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2138 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
2140 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2141 %2 = bitcast i16 %mask to <16 x i1>
2142 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2146 define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2147 ; X64-LABEL: test_mm512_mask_min_round_ps_sae:
2149 ; X64-NEXT: kmovw %edi, %k1
2150 ; X64-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2151 ; X64-NEXT: vmovaps %zmm2, %zmm0
2154 ; X86-LABEL: test_mm512_mask_min_round_ps_sae:
2156 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2157 ; X86-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2158 ; X86-NEXT: vmovaps %zmm2, %zmm0
2160 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2161 %2 = bitcast i16 %mask to <16 x i1>
2162 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2166 define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2167 ; X64-LABEL: test_mm512_mask_min_round_ps_current:
2169 ; X64-NEXT: kmovw %edi, %k1
2170 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1}
2171 ; X64-NEXT: vmovaps %zmm2, %zmm0
2174 ; X86-LABEL: test_mm512_mask_min_round_ps_current:
2176 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2177 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1}
2178 ; X86-NEXT: vmovaps %zmm2, %zmm0
2180 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2181 %2 = bitcast i16 %mask to <16 x i1>
2182 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2186 define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2187 ; CHECK-LABEL: test_mm512_min_round_ps_sae:
2189 ; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0
2190 ; CHECK-NEXT: ret{{[l|q]}}
2191 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2195 define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2196 ; CHECK-LABEL: test_mm512_min_round_ps_current:
2198 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
2199 ; CHECK-NEXT: ret{{[l|q]}}
2200 %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2203 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
2205 define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2206 ; X64-LABEL: test_mm512_maskz_max_round_ps_sae:
2208 ; X64-NEXT: kmovw %edi, %k1
2209 ; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2212 ; X86-LABEL: test_mm512_maskz_max_round_ps_sae:
2214 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2215 ; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2217 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2218 %2 = bitcast i16 %mask to <16 x i1>
2219 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2223 define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2224 ; X64-LABEL: test_mm512_maskz_max_round_ps_current:
2226 ; X64-NEXT: kmovw %edi, %k1
2227 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
2230 ; X86-LABEL: test_mm512_maskz_max_round_ps_current:
2232 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2233 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
2235 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2236 %2 = bitcast i16 %mask to <16 x i1>
2237 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2241 define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2242 ; X64-LABEL: test_mm512_mask_max_round_ps_sae:
2244 ; X64-NEXT: kmovw %edi, %k1
2245 ; X64-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2246 ; X64-NEXT: vmovaps %zmm2, %zmm0
2249 ; X86-LABEL: test_mm512_mask_max_round_ps_sae:
2251 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2252 ; X86-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
2253 ; X86-NEXT: vmovaps %zmm2, %zmm0
2255 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2256 %2 = bitcast i16 %mask to <16 x i1>
2257 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2261 define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
2262 ; X64-LABEL: test_mm512_mask_max_round_ps_current:
2264 ; X64-NEXT: kmovw %edi, %k1
2265 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
2266 ; X64-NEXT: vmovaps %zmm2, %zmm0
2269 ; X86-LABEL: test_mm512_mask_max_round_ps_current:
2271 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
2272 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
2273 ; X86-NEXT: vmovaps %zmm2, %zmm0
2275 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2276 %2 = bitcast i16 %mask to <16 x i1>
2277 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
2281 define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2282 ; CHECK-LABEL: test_mm512_max_round_ps_sae:
2284 ; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0
2285 ; CHECK-NEXT: ret{{[l|q]}}
2286 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
2290 define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2291 ; CHECK-LABEL: test_mm512_max_round_ps_current:
2293 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
2294 ; CHECK-NEXT: ret{{[l|q]}}
2295 %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
2298 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
2300 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
2302 define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2303 ; X64-LABEL: test_mask_add_ss_rn:
2305 ; X64-NEXT: kmovw %edi, %k1
2306 ; X64-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2307 ; X64-NEXT: vmovaps %xmm2, %xmm0
2310 ; X86-LABEL: test_mask_add_ss_rn:
2312 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2313 ; X86-NEXT: kmovw %eax, %k1
2314 ; X86-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2315 ; X86-NEXT: vmovaps %xmm2, %xmm0
2317 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
2318 ret <4 x float> %res
2321 define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2322 ; X64-LABEL: test_mask_add_ss_rd:
2324 ; X64-NEXT: kmovw %edi, %k1
2325 ; X64-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2326 ; X64-NEXT: vmovaps %xmm2, %xmm0
2329 ; X86-LABEL: test_mask_add_ss_rd:
2331 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2332 ; X86-NEXT: kmovw %eax, %k1
2333 ; X86-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2334 ; X86-NEXT: vmovaps %xmm2, %xmm0
2336 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
2337 ret <4 x float> %res
2340 define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2341 ; X64-LABEL: test_mask_add_ss_ru:
2343 ; X64-NEXT: kmovw %edi, %k1
2344 ; X64-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2345 ; X64-NEXT: vmovaps %xmm2, %xmm0
2348 ; X86-LABEL: test_mask_add_ss_ru:
2350 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2351 ; X86-NEXT: kmovw %eax, %k1
2352 ; X86-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2353 ; X86-NEXT: vmovaps %xmm2, %xmm0
2355 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
2356 ret <4 x float> %res
2359 define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2360 ; X64-LABEL: test_mask_add_ss_rz:
2362 ; X64-NEXT: kmovw %edi, %k1
2363 ; X64-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2364 ; X64-NEXT: vmovaps %xmm2, %xmm0
2367 ; X86-LABEL: test_mask_add_ss_rz:
2369 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2370 ; X86-NEXT: kmovw %eax, %k1
2371 ; X86-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2372 ; X86-NEXT: vmovaps %xmm2, %xmm0
2374 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
2375 ret <4 x float> %res
2378 define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2379 ; X64-LABEL: test_mask_add_ss_current:
2381 ; X64-NEXT: kmovw %edi, %k1
2382 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
2383 ; X64-NEXT: vmovaps %xmm2, %xmm0
2386 ; X86-LABEL: test_mask_add_ss_current:
2388 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2389 ; X86-NEXT: kmovw %eax, %k1
2390 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
2391 ; X86-NEXT: vmovaps %xmm2, %xmm0
2393 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
2394 ret <4 x float> %res
2397 define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2398 ; X64-LABEL: test_maskz_add_ss_rn:
2400 ; X64-NEXT: kmovw %edi, %k1
2401 ; X64-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2404 ; X86-LABEL: test_maskz_add_ss_rn:
2406 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2407 ; X86-NEXT: kmovw %eax, %k1
2408 ; X86-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2410 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
2411 ret <4 x float> %res
2414 define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
2415 ; CHECK-LABEL: test_add_ss_rn:
2417 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
2418 ; CHECK-NEXT: ret{{[l|q]}}
2419 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
2420 ret <4 x float> %res
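; The *_memfold tests below insert a loaded scalar into lane 0 of an otherwise zero
; vector so the backend folds the load into the memory form of the masked scalar op,
; e.g. "vaddss (%rdi), %xmm0, %xmm1 {%k1}" in the X64 CHECK lines.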
2423 define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
2424 ; X64-LABEL: test_mask_add_ss_current_memfold:
2426 ; X64-NEXT: kmovw %esi, %k1
2427 ; X64-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1}
2428 ; X64-NEXT: vmovaps %xmm1, %xmm0
2431 ; X86-LABEL: test_mask_add_ss_current_memfold:
2433 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2434 ; X86-NEXT: kmovw %eax, %k1
2435 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2436 ; X86-NEXT: vaddss (%eax), %xmm0, %xmm1 {%k1}
2437 ; X86-NEXT: vmovaps %xmm1, %xmm0
2439 %a1.val = load float, float* %a1
2440 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2441 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2442 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2443 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2444 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
2445 ret <4 x float> %res
2448 define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
2449 ; X64-LABEL: test_maskz_add_ss_current_memfold:
2451 ; X64-NEXT: kmovw %esi, %k1
2452 ; X64-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
2455 ; X86-LABEL: test_maskz_add_ss_current_memfold:
2457 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2458 ; X86-NEXT: kmovw %eax, %k1
2459 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2460 ; X86-NEXT: vaddss (%eax), %xmm0, %xmm0 {%k1} {z}
2462 %a1.val = load float, float* %a1
2463 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2464 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2465 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2466 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2467 %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
2468 ret <4 x float> %res
2471 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
2473 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2474 ; X64-LABEL: test_mask_add_sd_rn:
2476 ; X64-NEXT: kmovw %edi, %k1
2477 ; X64-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2478 ; X64-NEXT: vmovapd %xmm2, %xmm0
2481 ; X86-LABEL: test_mask_add_sd_rn:
2483 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2484 ; X86-NEXT: kmovw %eax, %k1
2485 ; X86-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2486 ; X86-NEXT: vmovapd %xmm2, %xmm0
2488 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
2489 ret <2 x double> %res
2492 define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2493 ; X64-LABEL: test_mask_add_sd_rd:
2495 ; X64-NEXT: kmovw %edi, %k1
2496 ; X64-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2497 ; X64-NEXT: vmovapd %xmm2, %xmm0
2500 ; X86-LABEL: test_mask_add_sd_rd:
2502 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2503 ; X86-NEXT: kmovw %eax, %k1
2504 ; X86-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2505 ; X86-NEXT: vmovapd %xmm2, %xmm0
2507 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
2508 ret <2 x double> %res
2511 define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2512 ; X64-LABEL: test_mask_add_sd_ru:
2514 ; X64-NEXT: kmovw %edi, %k1
2515 ; X64-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2516 ; X64-NEXT: vmovapd %xmm2, %xmm0
2519 ; X86-LABEL: test_mask_add_sd_ru:
2521 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2522 ; X86-NEXT: kmovw %eax, %k1
2523 ; X86-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2524 ; X86-NEXT: vmovapd %xmm2, %xmm0
2526 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
2527 ret <2 x double> %res
2530 define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2531 ; X64-LABEL: test_mask_add_sd_rz:
2533 ; X64-NEXT: kmovw %edi, %k1
2534 ; X64-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2535 ; X64-NEXT: vmovapd %xmm2, %xmm0
2538 ; X86-LABEL: test_mask_add_sd_rz:
2540 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2541 ; X86-NEXT: kmovw %eax, %k1
2542 ; X86-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
2543 ; X86-NEXT: vmovapd %xmm2, %xmm0
2545 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
2546 ret <2 x double> %res
2549 define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2550 ; X64-LABEL: test_mask_add_sd_current:
2552 ; X64-NEXT: kmovw %edi, %k1
2553 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
2554 ; X64-NEXT: vmovapd %xmm2, %xmm0
2557 ; X86-LABEL: test_mask_add_sd_current:
2559 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2560 ; X86-NEXT: kmovw %eax, %k1
2561 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
2562 ; X86-NEXT: vmovapd %xmm2, %xmm0
2564 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
2565 ret <2 x double> %res
2568 define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2569 ; X64-LABEL: test_maskz_add_sd_rn:
2571 ; X64-NEXT: kmovw %edi, %k1
2572 ; X64-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2575 ; X86-LABEL: test_maskz_add_sd_rn:
2577 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2578 ; X86-NEXT: kmovw %eax, %k1
2579 ; X86-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2581 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
2582 ret <2 x double> %res
2585 define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
2586 ; CHECK-LABEL: test_add_sd_rn:
2588 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
2589 ; CHECK-NEXT: ret{{[l|q]}}
2590 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
2591 ret <2 x double> %res
2594 define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
2595 ; X64-LABEL: test_mask_add_sd_current_memfold:
2597 ; X64-NEXT: kmovw %esi, %k1
2598 ; X64-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1}
2599 ; X64-NEXT: vmovapd %xmm1, %xmm0
2602 ; X86-LABEL: test_mask_add_sd_current_memfold:
2604 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2605 ; X86-NEXT: kmovw %eax, %k1
2606 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2607 ; X86-NEXT: vaddsd (%eax), %xmm0, %xmm1 {%k1}
2608 ; X86-NEXT: vmovapd %xmm1, %xmm0
2610 %a1.val = load double, double* %a1
2611 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2612 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2613 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
2614 ret <2 x double> %res
2617 define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
2618 ; X64-LABEL: test_maskz_add_sd_current_memfold:
2620 ; X64-NEXT: kmovw %esi, %k1
2621 ; X64-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
2624 ; X86-LABEL: test_maskz_add_sd_current_memfold:
2626 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2627 ; X86-NEXT: kmovw %eax, %k1
2628 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2629 ; X86-NEXT: vaddsd (%eax), %xmm0, %xmm0 {%k1} {z}
2631 %a1.val = load double, double* %a1
2632 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2633 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2634 %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
2635 ret <2 x double> %res
2638 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
2640 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2641 ; X64-LABEL: test_mask_max_ss_sae:
2643 ; X64-NEXT: kmovw %edi, %k1
2644 ; X64-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2645 ; X64-NEXT: vmovaps %xmm2, %xmm0
2648 ; X86-LABEL: test_mask_max_ss_sae:
2650 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2651 ; X86-NEXT: kmovw %eax, %k1
2652 ; X86-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2653 ; X86-NEXT: vmovaps %xmm2, %xmm0
2655 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
2656 ret <4 x float> %res
2659 define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2660 ; X64-LABEL: test_maskz_max_ss_sae:
2662 ; X64-NEXT: kmovw %edi, %k1
2663 ; X64-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2666 ; X86-LABEL: test_maskz_max_ss_sae:
2668 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2669 ; X86-NEXT: kmovw %eax, %k1
2670 ; X86-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2672 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
2673 ret <4 x float> %res
2676 define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
2677 ; CHECK-LABEL: test_max_ss_sae:
2679 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0
2680 ; CHECK-NEXT: ret{{[l|q]}}
2681 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
2682 ret <4 x float> %res
2685 define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
2686 ; X64-LABEL: test_mask_max_ss:
2688 ; X64-NEXT: kmovw %edi, %k1
2689 ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
2690 ; X64-NEXT: vmovaps %xmm2, %xmm0
2693 ; X86-LABEL: test_mask_max_ss:
2695 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2696 ; X86-NEXT: kmovw %eax, %k1
2697 ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
2698 ; X86-NEXT: vmovaps %xmm2, %xmm0
2700 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
2701 ret <4 x float> %res
2704 define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
2705 ; X64-LABEL: test_maskz_max_ss:
2707 ; X64-NEXT: kmovw %edi, %k1
2708 ; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
2711 ; X86-LABEL: test_maskz_max_ss:
2713 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2714 ; X86-NEXT: kmovw %eax, %k1
2715 ; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
2717 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
2718 ret <4 x float> %res
2721 define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
2722 ; CHECK-LABEL: test_max_ss:
2724 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0
2725 ; CHECK-NEXT: ret{{[l|q]}}
2726 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
2727 ret <4 x float> %res
2730 define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
2731 ; X64-LABEL: test_mask_max_ss_memfold:
2733 ; X64-NEXT: kmovw %esi, %k1
2734 ; X64-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1}
2735 ; X64-NEXT: vmovaps %xmm1, %xmm0
2738 ; X86-LABEL: test_mask_max_ss_memfold:
2740 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2741 ; X86-NEXT: kmovw %eax, %k1
2742 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2743 ; X86-NEXT: vmaxss (%eax), %xmm0, %xmm1 {%k1}
2744 ; X86-NEXT: vmovaps %xmm1, %xmm0
2746 %a1.val = load float, float* %a1
2747 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2748 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2749 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2750 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2751 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
2752 ret <4 x float> %res
2755 define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
2756 ; X64-LABEL: test_maskz_max_ss_memfold:
2758 ; X64-NEXT: kmovw %esi, %k1
2759 ; X64-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
2762 ; X86-LABEL: test_maskz_max_ss_memfold:
2764 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2765 ; X86-NEXT: kmovw %eax, %k1
2766 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2767 ; X86-NEXT: vmaxss (%eax), %xmm0, %xmm0 {%k1} {z}
2769 %a1.val = load float, float* %a1
2770 %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
2771 %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
2772 %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
2773 %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
2774 %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
2775 ret <4 x float> %res
2777 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
2779 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2780 ; X64-LABEL: test_mask_max_sd_sae:
2782 ; X64-NEXT: kmovw %edi, %k1
2783 ; X64-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2784 ; X64-NEXT: vmovapd %xmm2, %xmm0
2787 ; X86-LABEL: test_mask_max_sd_sae:
2789 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2790 ; X86-NEXT: kmovw %eax, %k1
2791 ; X86-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
2792 ; X86-NEXT: vmovapd %xmm2, %xmm0
2794 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
2795 ret <2 x double> %res
2798 define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2799 ; X64-LABEL: test_maskz_max_sd_sae:
2801 ; X64-NEXT: kmovw %edi, %k1
2802 ; X64-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2805 ; X86-LABEL: test_maskz_max_sd_sae:
2807 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2808 ; X86-NEXT: kmovw %eax, %k1
2809 ; X86-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
2811 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
2812 ret <2 x double> %res
2815 define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
2816 ; CHECK-LABEL: test_max_sd_sae:
2818 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0
2819 ; CHECK-NEXT: ret{{[l|q]}}
2820 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
2821 ret <2 x double> %res
2824 define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
2825 ; X64-LABEL: test_mask_max_sd:
2827 ; X64-NEXT: kmovw %edi, %k1
2828 ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
2829 ; X64-NEXT: vmovapd %xmm2, %xmm0
2832 ; X86-LABEL: test_mask_max_sd:
2834 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2835 ; X86-NEXT: kmovw %eax, %k1
2836 ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
2837 ; X86-NEXT: vmovapd %xmm2, %xmm0
2839 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
2840 ret <2 x double> %res
2843 define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
2844 ; X64-LABEL: test_maskz_max_sd:
2846 ; X64-NEXT: kmovw %edi, %k1
2847 ; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2850 ; X86-LABEL: test_maskz_max_sd:
2852 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2853 ; X86-NEXT: kmovw %eax, %k1
2854 ; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2856 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
2857 ret <2 x double> %res
2860 define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
2861 ; CHECK-LABEL: test_max_sd:
2863 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
2864 ; CHECK-NEXT: ret{{[l|q]}}
2865 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
2866 ret <2 x double> %res
2869 define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
2870 ; X64-LABEL: test_mask_max_sd_memfold:
2872 ; X64-NEXT: kmovw %esi, %k1
2873 ; X64-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
2874 ; X64-NEXT: vmovapd %xmm1, %xmm0
2877 ; X86-LABEL: test_mask_max_sd_memfold:
2879 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2880 ; X86-NEXT: kmovw %eax, %k1
2881 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2882 ; X86-NEXT: vmaxsd (%eax), %xmm0, %xmm1 {%k1}
2883 ; X86-NEXT: vmovapd %xmm1, %xmm0
2885 %a1.val = load double, double* %a1
2886 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2887 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2888 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
2889 ret <2 x double> %res
2892 define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
2893 ; X64-LABEL: test_maskz_max_sd_memfold:
2895 ; X64-NEXT: kmovw %esi, %k1
2896 ; X64-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
2899 ; X86-LABEL: test_maskz_max_sd_memfold:
2901 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2902 ; X86-NEXT: kmovw %eax, %k1
2903 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2904 ; X86-NEXT: vmaxsd (%eax), %xmm0, %xmm0 {%k1} {z}
2906 %a1.val = load double, double* %a1
2907 %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
2908 %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
2909 %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
2910 ret <2 x double> %res
2913 define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
2914 ; X64-LABEL: test_x86_avx512_cvtsi2ss32:
2916 ; X64-NEXT: vcvtsi2ss %edi, {rz-sae}, %xmm0, %xmm0
2919 ; X86-LABEL: test_x86_avx512_cvtsi2ss32:
2921 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2922 ; X86-NEXT: vcvtsi2ss %eax, {rz-sae}, %xmm0, %xmm0
2924 %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<4 x float>> [#uses=1]
2925 ret <4 x float> %res
2927 declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
2929 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) {
2930 ; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
2932 ; X64-NEXT: vcvtusi2ss %edi, {rd-sae}, %xmm0, %xmm0
2935 ; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
2937 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2938 ; X86-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2940 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<4 x float>> [#uses=1]
2941 ret <4 x float> %res
2944 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr) {
2945 ; X64-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
2947 ; X64-NEXT: movl (%rdi), %eax
2948 ; X64-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2951 ; X86-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
2953 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2954 ; X86-NEXT: movl (%eax), %eax
2955 ; X86-NEXT: vcvtusi2ss %eax, {rd-sae}, %xmm0, %xmm0
2957 %b = load i32, i32* %ptr
2958 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<4 x float>> [#uses=1]
2959 ret <4 x float> %res
2962 define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) {
2963 ; X64-LABEL: test_x86_avx512__mm_cvtu32_ss:
2965 ; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
2968 ; X86-LABEL: test_x86_avx512__mm_cvtu32_ss:
2970 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
2972 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<4 x float>> [#uses=1]
2973 ret <4 x float> %res
2976 define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr) {
2977 ; X64-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
2979 ; X64-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0
2982 ; X86-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
2984 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
2985 ; X86-NEXT: vcvtusi2ssl (%eax), %xmm0, %xmm0
2987 %b = load i32, i32* %ptr
2988 %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<4 x float>> [#uses=1]
2989 ret <4 x float> %res
2991 declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
2993 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2995 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
2996 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
2998 ; X64-NEXT: kmovw %esi, %k1
2999 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3000 ; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
3001 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
3002 ; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0
3005 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
3007 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3008 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3009 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3010 ; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1}
3011 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
3012 ; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0
3014 %x2 = load <16 x i32>, <16 x i32>* %x2p
3015 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
3016 %2 = bitcast i16 %x3 to <16 x i1>
3017 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
3018 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
3019 %res2 = add <16 x i32> %3, %4
3020 ret <16 x i32> %res2
3023 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
3025 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
3026 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
3028 ; X64-NEXT: vmovapd %zmm0, %zmm3
3029 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
3030 ; X64-NEXT: kmovw %edi, %k1
3031 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
3032 ; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0
3035 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
3037 ; X86-NEXT: vmovapd %zmm0, %zmm3
3038 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
3039 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3040 ; X86-NEXT: kmovw %eax, %k1
3041 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
3042 ; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0
3044 %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
3045 %2 = bitcast <8 x i64> %x1 to <8 x double>
3046 %3 = bitcast i8 %x3 to <8 x i1>
3047 %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
3048 %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
3049 %6 = bitcast <8 x i64> %x1 to <8 x double>
3050 %res2 = fadd <8 x double> %4, %5
3051 ret <8 x double> %res2
3054 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
3056 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
3057 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
3059 ; X64-NEXT: vmovaps %zmm0, %zmm3
3060 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
3061 ; X64-NEXT: kmovw %edi, %k1
3062 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
3063 ; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0
3066 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
3068 ; X86-NEXT: vmovaps %zmm0, %zmm3
3069 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
3070 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3071 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
3072 ; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0
3074 %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
3075 %2 = bitcast <16 x i32> %x1 to <16 x float>
3076 %3 = bitcast i16 %x3 to <16 x i1>
3077 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
3078 %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
3079 %6 = bitcast <16 x i32> %x1 to <16 x float>
3080 %res2 = fadd <16 x float> %4, %5
3081 ret <16 x float> %res2
3084 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
3086 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
3087 ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3089 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
3090 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
3091 ; X64-NEXT: kmovw %edi, %k1
3092 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3093 ; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3096 ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
3098 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
3099 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
3100 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3101 ; X86-NEXT: kmovw %eax, %k1
3102 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
3103 ; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3105 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3106 %2 = bitcast i8 %x3 to <8 x i1>
3107 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
3108 %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
3109 %res2 = add <8 x i64> %3, %4
3113 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
3114 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3116 ; X64-NEXT: kmovw %esi, %k1
3117 ; X64-NEXT: vmovdqa64 %zmm1, %zmm2
3118 ; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
3119 ; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
3120 ; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0
3123 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
3125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3126 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3127 ; X86-NEXT: vmovdqa64 %zmm1, %zmm2
3128 ; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z}
3129 ; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
3130 ; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0
3132 %x2 = load <16 x i32>, <16 x i32>* %x2p
3133 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3134 %2 = bitcast i16 %x3 to <16 x i1>
3135 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
3136 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1)
3137 %res2 = add <16 x i32> %3, %4
3138 ret <16 x i32> %res2
3141 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
3142 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3144 ; X64-NEXT: kmovw %esi, %k1
3145 ; X64-NEXT: vmovapd %zmm1, %zmm2
3146 ; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
3147 ; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
3148 ; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0
3151 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
3153 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3154 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
3155 ; X86-NEXT: kmovw %ecx, %k1
3156 ; X86-NEXT: vmovapd %zmm1, %zmm2
3157 ; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z}
3158 ; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
3159 ; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0
3161 %x2s = load double, double* %x2ptr
3162 %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
3163 %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
3164 %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
3165 %2 = bitcast i8 %x3 to <8 x i1>
3166 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
3167 %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1)
3168 %res2 = fadd <8 x double> %3, %4
3169 ret <8 x double> %res2
3172 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
3173 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
3175 ; X64-NEXT: vmovaps %zmm1, %zmm3
3176 ; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
3177 ; X64-NEXT: kmovw %edi, %k1
3178 ; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
3179 ; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0
3182 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
3184 ; X86-NEXT: vmovaps %zmm1, %zmm3
3185 ; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
3186 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3187 ; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
3188 ; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0
3190 %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
3191 %2 = bitcast i16 %x3 to <16 x i1>
3192 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
3193 %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
3194 %res2 = fadd <16 x float> %3, %4
3195 ret <16 x float> %res2
3198 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
3199 ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
3201 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3202 ; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
3203 ; X64-NEXT: kmovw %edi, %k1
3204 ; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
3205 ; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3208 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
3210 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3211 ; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
3212 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3213 ; X86-NEXT: kmovw %eax, %k1
3214 ; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
3215 ; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0
3217 %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
3218 %2 = bitcast i8 %x3 to <8 x i1>
3219 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
3220 %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
3221 %res2 = add <8 x i64> %3, %4
3222 ret <8 x i64> %res2
3225 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
3226 ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
3228 ; X64-NEXT: vmovdqa64 %zmm1, %zmm3
3229 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
3230 ; X64-NEXT: kmovw %edi, %k1
3231 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
3232 ; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0
3235 ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
3237 ; X86-NEXT: vmovdqa64 %zmm1, %zmm3
3238 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
3239 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3240 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
3241 ; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0
3243 %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3244 %2 = bitcast i16 %x3 to <16 x i1>
3245 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
3246 %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
3247 %res2 = add <16 x i32> %3, %4
3248 ret <16 x i32> %res2
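; vscalefpd/vscalefps tests: the rounding argument of the intrinsic shows up as
; the embedded-rounding operand of the instruction (11 -> {rz-sae},
; 10 -> {ru-sae}, 8 -> {rn-sae} in the calls below).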
3251 declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
3252 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
3253 ; X64-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
3255 ; X64-NEXT: kmovw %edi, %k1
3256 ; X64-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3257 ; X64-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
3258 ; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0
3261 ; X86-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
3263 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3264 ; X86-NEXT: kmovw %eax, %k1
3265 ; X86-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3266 ; X86-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
3267 ; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0
3269 %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11)
3270 %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8)
3271 %res2 = fadd <8 x double> %res, %res1
3272 ret <8 x double> %res2
3275 declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3276 define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
3277 ; X64-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
3279 ; X64-NEXT: kmovw %edi, %k1
3280 ; X64-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3281 ; X64-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
3282 ; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0
3285 ; X86-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
3287 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3288 ; X86-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3289 ; X86-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
3290 ; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0
3292 %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10)
3293 %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8)
3294 %res2 = fadd <16 x float> %res, %res1
3295 ret <16 x float> %res2
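; Truncating-move (vpmov*, vpmovs*, vpmovus*) tests. Each register-form test
; produces an unmasked, a merge-masked and a zero-masked result and sums them so
; all three variants stay live; the *_mem_512 tests check the plain and masked
; truncating stores.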
3298 declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
3300 define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3301 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
3303 ; X64-NEXT: kmovw %edi, %k1
3304 ; X64-NEXT: vpmovqb %zmm0, %xmm2
3305 ; X64-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
3306 ; X64-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
3307 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3308 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3309 ; X64-NEXT: vzeroupper
3312 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
3314 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3315 ; X86-NEXT: kmovw %eax, %k1
3316 ; X86-NEXT: vpmovqb %zmm0, %xmm2
3317 ; X86-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
3318 ; X86-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z}
3319 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3320 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3321 ; X86-NEXT: vzeroupper
3323 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3324 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3325 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3326 %res3 = add <16 x i8> %res0, %res1
3327 %res4 = add <16 x i8> %res3, %res2
3328 ret <16 x i8> %res4
3331 declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3333 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3334 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
3336 ; X64-NEXT: kmovw %esi, %k1
3337 ; X64-NEXT: vpmovqb %zmm0, (%rdi)
3338 ; X64-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
3339 ; X64-NEXT: vzeroupper
3342 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
3344 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3345 ; X86-NEXT: kmovw %eax, %k1
3346 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3347 ; X86-NEXT: vpmovqb %zmm0, (%eax)
3348 ; X86-NEXT: vpmovqb %zmm0, (%eax) {%k1}
3349 ; X86-NEXT: vzeroupper
3351 call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3352 call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3353 ret void
3356 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
3358 define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3359 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
3361 ; X64-NEXT: kmovw %edi, %k1
3362 ; X64-NEXT: vpmovsqb %zmm0, %xmm2
3363 ; X64-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
3364 ; X64-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
3365 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3366 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3367 ; X64-NEXT: vzeroupper
3370 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
3372 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3373 ; X86-NEXT: kmovw %eax, %k1
3374 ; X86-NEXT: vpmovsqb %zmm0, %xmm2
3375 ; X86-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
3376 ; X86-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z}
3377 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3378 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3379 ; X86-NEXT: vzeroupper
3381 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3382 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3383 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3384 %res3 = add <16 x i8> %res0, %res1
3385 %res4 = add <16 x i8> %res3, %res2
3386 ret <16 x i8> %res4
3389 declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3391 define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3392 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
3394 ; X64-NEXT: kmovw %esi, %k1
3395 ; X64-NEXT: vpmovsqb %zmm0, (%rdi)
3396 ; X64-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
3397 ; X64-NEXT: vzeroupper
3400 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
3402 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3403 ; X86-NEXT: kmovw %eax, %k1
3404 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3405 ; X86-NEXT: vpmovsqb %zmm0, (%eax)
3406 ; X86-NEXT: vpmovsqb %zmm0, (%eax) {%k1}
3407 ; X86-NEXT: vzeroupper
3409 call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3410 call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3411 ret void
3414 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
3416 define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
3417 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
3419 ; X64-NEXT: kmovw %edi, %k1
3420 ; X64-NEXT: vpmovusqb %zmm0, %xmm2
3421 ; X64-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
3422 ; X64-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
3423 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3424 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3425 ; X64-NEXT: vzeroupper
3428 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
3430 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3431 ; X86-NEXT: kmovw %eax, %k1
3432 ; X86-NEXT: vpmovusqb %zmm0, %xmm2
3433 ; X86-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
3434 ; X86-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z}
3435 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3436 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3437 ; X86-NEXT: vzeroupper
3439 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
3440 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
3441 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
3442 %res3 = add <16 x i8> %res0, %res1
3443 %res4 = add <16 x i8> %res3, %res2
3444 ret <16 x i8> %res4
3447 declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
3449 define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3450 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
3452 ; X64-NEXT: kmovw %esi, %k1
3453 ; X64-NEXT: vpmovusqb %zmm0, (%rdi)
3454 ; X64-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
3455 ; X64-NEXT: vzeroupper
3458 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
3460 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3461 ; X86-NEXT: kmovw %eax, %k1
3462 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3463 ; X86-NEXT: vpmovusqb %zmm0, (%eax)
3464 ; X86-NEXT: vpmovusqb %zmm0, (%eax) {%k1}
3465 ; X86-NEXT: vzeroupper
3467 call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3468 call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3469 ret void
3472 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
3474 define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3475 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
3477 ; X64-NEXT: kmovw %edi, %k1
3478 ; X64-NEXT: vpmovqw %zmm0, %xmm2
3479 ; X64-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
3480 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
3481 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3482 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3483 ; X64-NEXT: vzeroupper
3486 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
3488 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3489 ; X86-NEXT: kmovw %eax, %k1
3490 ; X86-NEXT: vpmovqw %zmm0, %xmm2
3491 ; X86-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
3492 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
3493 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3494 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3495 ; X86-NEXT: vzeroupper
3497 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3498 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3499 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3500 %res3 = add <8 x i16> %res0, %res1
3501 %res4 = add <8 x i16> %res3, %res2
3502 ret <8 x i16> %res4
3505 declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3507 define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3508 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
3510 ; X64-NEXT: kmovw %esi, %k1
3511 ; X64-NEXT: vpmovqw %zmm0, (%rdi)
3512 ; X64-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
3513 ; X64-NEXT: vzeroupper
3516 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
3518 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3519 ; X86-NEXT: kmovw %eax, %k1
3520 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3521 ; X86-NEXT: vpmovqw %zmm0, (%eax)
3522 ; X86-NEXT: vpmovqw %zmm0, (%eax) {%k1}
3523 ; X86-NEXT: vzeroupper
3525 call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3526 call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3527 ret void
3530 declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
3532 define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3533 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
3535 ; X64-NEXT: kmovw %edi, %k1
3536 ; X64-NEXT: vpmovsqw %zmm0, %xmm2
3537 ; X64-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
3538 ; X64-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
3539 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3540 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3541 ; X64-NEXT: vzeroupper
3544 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
3546 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3547 ; X86-NEXT: kmovw %eax, %k1
3548 ; X86-NEXT: vpmovsqw %zmm0, %xmm2
3549 ; X86-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
3550 ; X86-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z}
3551 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3552 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3553 ; X86-NEXT: vzeroupper
3555 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3556 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3557 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3558 %res3 = add <8 x i16> %res0, %res1
3559 %res4 = add <8 x i16> %res3, %res2
3560 ret <8 x i16> %res4
3563 declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3565 define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3566 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
3568 ; X64-NEXT: kmovw %esi, %k1
3569 ; X64-NEXT: vpmovsqw %zmm0, (%rdi)
3570 ; X64-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
3571 ; X64-NEXT: vzeroupper
3574 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
3576 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3577 ; X86-NEXT: kmovw %eax, %k1
3578 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3579 ; X86-NEXT: vpmovsqw %zmm0, (%eax)
3580 ; X86-NEXT: vpmovsqw %zmm0, (%eax) {%k1}
3581 ; X86-NEXT: vzeroupper
3583 call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3584 call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3585 ret void
3588 declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
3590 define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
3591 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
3593 ; X64-NEXT: kmovw %edi, %k1
3594 ; X64-NEXT: vpmovusqw %zmm0, %xmm2
3595 ; X64-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
3596 ; X64-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
3597 ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3598 ; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3599 ; X64-NEXT: vzeroupper
3602 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
3604 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3605 ; X86-NEXT: kmovw %eax, %k1
3606 ; X86-NEXT: vpmovusqw %zmm0, %xmm2
3607 ; X86-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
3608 ; X86-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z}
3609 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0
3610 ; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0
3611 ; X86-NEXT: vzeroupper
3613 %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
3614 %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
3615 %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
3616 %res3 = add <8 x i16> %res0, %res1
3617 %res4 = add <8 x i16> %res3, %res2
3618 ret <8 x i16> %res4
3621 declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
3623 define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3624 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
3626 ; X64-NEXT: kmovw %esi, %k1
3627 ; X64-NEXT: vpmovusqw %zmm0, (%rdi)
3628 ; X64-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
3629 ; X64-NEXT: vzeroupper
3632 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
3634 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3635 ; X86-NEXT: kmovw %eax, %k1
3636 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3637 ; X86-NEXT: vpmovusqw %zmm0, (%eax)
3638 ; X86-NEXT: vpmovusqw %zmm0, (%eax) {%k1}
3639 ; X86-NEXT: vzeroupper
3641 call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3642 call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3643 ret void
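; The qword-to-dword truncations below are expressed with plain IR trunc/select
; rather than an @llvm.x86.avx512.mask.pmov.qd.512 call, so no declaration of
; that intrinsic is needed here.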
3646 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3647 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
3649 ; X64-NEXT: vpmovqd %zmm0, %ymm2
3650 ; X64-NEXT: kmovw %edi, %k1
3651 ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
3652 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
3653 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
3654 ; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0
3657 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
3659 ; X86-NEXT: vpmovqd %zmm0, %ymm2
3660 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3661 ; X86-NEXT: kmovw %eax, %k1
3662 ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
3663 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
3664 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
3665 ; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0
3667 %1 = trunc <8 x i64> %x0 to <8 x i32>
3668 %2 = trunc <8 x i64> %x0 to <8 x i32>
3669 %3 = bitcast i8 %x2 to <8 x i1>
3670 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %x1
3671 %5 = trunc <8 x i64> %x0 to <8 x i32>
3672 %6 = bitcast i8 %x2 to <8 x i1>
3673 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> zeroinitializer
3674 %res3 = add <8 x i32> %1, %4
3675 %res4 = add <8 x i32> %res3, %7
3676 ret <8 x i32> %res4
3679 declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3681 define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3682 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
3684 ; X64-NEXT: kmovw %esi, %k1
3685 ; X64-NEXT: vpmovqd %zmm0, (%rdi)
3686 ; X64-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
3687 ; X64-NEXT: vzeroupper
3690 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
3692 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3693 ; X86-NEXT: kmovw %eax, %k1
3694 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3695 ; X86-NEXT: vpmovqd %zmm0, (%eax)
3696 ; X86-NEXT: vpmovqd %zmm0, (%eax) {%k1}
3697 ; X86-NEXT: vzeroupper
3699 call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3700 call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3701 ret void
3704 declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
3706 define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3707 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
3709 ; X64-NEXT: kmovw %edi, %k1
3710 ; X64-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
3711 ; X64-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
3712 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3713 ; X64-NEXT: vpmovsqd %zmm0, %ymm0
3714 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3717 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
3719 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3720 ; X86-NEXT: kmovw %eax, %k1
3721 ; X86-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
3722 ; X86-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
3723 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3724 ; X86-NEXT: vpmovsqd %zmm0, %ymm0
3725 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3727 %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
3728 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
3729 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
3730 %res3 = add <8 x i32> %res0, %res1
3731 %res4 = add <8 x i32> %res3, %res2
3732 ret <8 x i32> %res4
3735 declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3737 define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3738 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
3740 ; X64-NEXT: kmovw %esi, %k1
3741 ; X64-NEXT: vpmovsqd %zmm0, (%rdi)
3742 ; X64-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
3743 ; X64-NEXT: vzeroupper
3746 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
3748 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3749 ; X86-NEXT: kmovw %eax, %k1
3750 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3751 ; X86-NEXT: vpmovsqd %zmm0, (%eax)
3752 ; X86-NEXT: vpmovsqd %zmm0, (%eax) {%k1}
3753 ; X86-NEXT: vzeroupper
3755 call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3756 call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3757 ret void
3760 declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
3762 define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
3763 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
3765 ; X64-NEXT: kmovw %edi, %k1
3766 ; X64-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
3767 ; X64-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
3768 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3769 ; X64-NEXT: vpmovusqd %zmm0, %ymm0
3770 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3773 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
3775 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3776 ; X86-NEXT: kmovw %eax, %k1
3777 ; X86-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
3778 ; X86-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
3779 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1
3780 ; X86-NEXT: vpmovusqd %zmm0, %ymm0
3781 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
3783 %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
3784 %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
3785 %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
3786 %res3 = add <8 x i32> %res0, %res1
3787 %res4 = add <8 x i32> %res3, %res2
3788 ret <8 x i32> %res4
3791 declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
3793 define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
3794 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
3796 ; X64-NEXT: kmovw %esi, %k1
3797 ; X64-NEXT: vpmovusqd %zmm0, (%rdi)
3798 ; X64-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
3799 ; X64-NEXT: vzeroupper
3802 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
3804 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3805 ; X86-NEXT: kmovw %eax, %k1
3806 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3807 ; X86-NEXT: vpmovusqd %zmm0, (%eax)
3808 ; X86-NEXT: vpmovusqd %zmm0, (%eax) {%k1}
3809 ; X86-NEXT: vzeroupper
3811 call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
3812 call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
3813 ret void
3816 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
3818 define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3819 ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_512:
3821 ; X64-NEXT: kmovw %edi, %k1
3822 ; X64-NEXT: vpmovdb %zmm0, %xmm2
3823 ; X64-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
3824 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
3825 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3826 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3827 ; X64-NEXT: vzeroupper
3830 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_512:
3832 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3833 ; X86-NEXT: vpmovdb %zmm0, %xmm2
3834 ; X86-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
3835 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
3836 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3837 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3838 ; X86-NEXT: vzeroupper
3840 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3841 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3842 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3843 %res3 = add <16 x i8> %res0, %res1
3844 %res4 = add <16 x i8> %res3, %res2
3845 ret <16 x i8> %res4
3848 declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
3850 define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3851 ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
3853 ; X64-NEXT: kmovw %esi, %k1
3854 ; X64-NEXT: vpmovdb %zmm0, (%rdi)
3855 ; X64-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
3856 ; X64-NEXT: vzeroupper
3859 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
3861 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3862 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3863 ; X86-NEXT: vpmovdb %zmm0, (%eax)
3864 ; X86-NEXT: vpmovdb %zmm0, (%eax) {%k1}
3865 ; X86-NEXT: vzeroupper
3867 call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3868 call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3869 ret void
3872 declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
3874 define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3875 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
3877 ; X64-NEXT: kmovw %edi, %k1
3878 ; X64-NEXT: vpmovsdb %zmm0, %xmm2
3879 ; X64-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
3880 ; X64-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
3881 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3882 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3883 ; X64-NEXT: vzeroupper
3886 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
3888 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3889 ; X86-NEXT: vpmovsdb %zmm0, %xmm2
3890 ; X86-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
3891 ; X86-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z}
3892 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3893 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3894 ; X86-NEXT: vzeroupper
3896 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3897 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3898 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3899 %res3 = add <16 x i8> %res0, %res1
3900 %res4 = add <16 x i8> %res3, %res2
3901 ret <16 x i8> %res4
3904 declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
3906 define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3907 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
3909 ; X64-NEXT: kmovw %esi, %k1
3910 ; X64-NEXT: vpmovsdb %zmm0, (%rdi)
3911 ; X64-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
3912 ; X64-NEXT: vzeroupper
3915 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
3917 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3918 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3919 ; X86-NEXT: vpmovsdb %zmm0, (%eax)
3920 ; X86-NEXT: vpmovsdb %zmm0, (%eax) {%k1}
3921 ; X86-NEXT: vzeroupper
3923 call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3924 call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3925 ret void
3928 declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
3930 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
3931 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
3933 ; X64-NEXT: kmovw %edi, %k1
3934 ; X64-NEXT: vpmovusdb %zmm0, %xmm2
3935 ; X64-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
3936 ; X64-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
3937 ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3938 ; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3939 ; X64-NEXT: vzeroupper
3942 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
3944 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3945 ; X86-NEXT: vpmovusdb %zmm0, %xmm2
3946 ; X86-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
3947 ; X86-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z}
3948 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
3949 ; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0
3950 ; X86-NEXT: vzeroupper
3952 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
3953 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
3954 %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
3955 %res3 = add <16 x i8> %res0, %res1
3956 %res4 = add <16 x i8> %res3, %res2
3957 ret <16 x i8> %res4
3960 declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
3962 define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
3963 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
3965 ; X64-NEXT: kmovw %esi, %k1
3966 ; X64-NEXT: vpmovusdb %zmm0, (%rdi)
3967 ; X64-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
3968 ; X64-NEXT: vzeroupper
3971 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
3973 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
3974 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3975 ; X86-NEXT: vpmovusdb %zmm0, (%eax)
3976 ; X86-NEXT: vpmovusdb %zmm0, (%eax) {%k1}
3977 ; X86-NEXT: vzeroupper
3979 call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
3980 call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
3981 ret void
3984 declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
3986 define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
3987 ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
3989 ; X64-NEXT: kmovw %edi, %k1
3990 ; X64-NEXT: vpmovdw %zmm0, %ymm2
3991 ; X64-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
3992 ; X64-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
3993 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
3994 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
3997 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
3999 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4000 ; X86-NEXT: vpmovdw %zmm0, %ymm2
4001 ; X86-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
4002 ; X86-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z}
4003 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4004 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4006 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4007 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4008 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4009 %res3 = add <16 x i16> %res0, %res1
4010 %res4 = add <16 x i16> %res3, %res2
4011 ret <16 x i16> %res4
4014 declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4016 define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4017 ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
4019 ; X64-NEXT: kmovw %esi, %k1
4020 ; X64-NEXT: vpmovdw %zmm0, (%rdi)
4021 ; X64-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
4022 ; X64-NEXT: vzeroupper
4025 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
4027 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4028 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4029 ; X86-NEXT: vpmovdw %zmm0, (%eax)
4030 ; X86-NEXT: vpmovdw %zmm0, (%eax) {%k1}
4031 ; X86-NEXT: vzeroupper
4033 call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4034 call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4035 ret void
4038 declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4040 define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4041 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
4043 ; X64-NEXT: kmovw %edi, %k1
4044 ; X64-NEXT: vpmovsdw %zmm0, %ymm2
4045 ; X64-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
4046 ; X64-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
4047 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4048 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4051 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
4053 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4054 ; X86-NEXT: vpmovsdw %zmm0, %ymm2
4055 ; X86-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
4056 ; X86-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z}
4057 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4058 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4060 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4061 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4062 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4063 %res3 = add <16 x i16> %res0, %res1
4064 %res4 = add <16 x i16> %res3, %res2
4065 ret <16 x i16> %res4
4068 declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4070 define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4071 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
4073 ; X64-NEXT: kmovw %esi, %k1
4074 ; X64-NEXT: vpmovsdw %zmm0, (%rdi)
4075 ; X64-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
4076 ; X64-NEXT: vzeroupper
4079 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
4081 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4082 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4083 ; X86-NEXT: vpmovsdw %zmm0, (%eax)
4084 ; X86-NEXT: vpmovsdw %zmm0, (%eax) {%k1}
4085 ; X86-NEXT: vzeroupper
4087 call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4088 call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4089 ret void
4092 declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
4094 define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
4095 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
4097 ; X64-NEXT: kmovw %edi, %k1
4098 ; X64-NEXT: vpmovusdw %zmm0, %ymm2
4099 ; X64-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
4100 ; X64-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
4101 ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4102 ; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4105 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
4107 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4108 ; X86-NEXT: vpmovusdw %zmm0, %ymm2
4109 ; X86-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
4110 ; X86-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z}
4111 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0
4112 ; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0
4114 %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
4115 %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
4116 %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
4117 %res3 = add <16 x i16> %res0, %res1
4118 %res4 = add <16 x i16> %res3, %res2
4119 ret <16 x i16> %res4
4122 declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
4124 define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
4125 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
4127 ; X64-NEXT: kmovw %esi, %k1
4128 ; X64-NEXT: vpmovusdw %zmm0, (%rdi)
4129 ; X64-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
4130 ; X64-NEXT: vzeroupper
4133 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
4135 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4136 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4137 ; X86-NEXT: vpmovusdw %zmm0, (%eax)
4138 ; X86-NEXT: vpmovusdw %zmm0, (%eax) {%k1}
4139 ; X86-NEXT: vzeroupper
4141 call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
4142 call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
4143 ret void
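; Conversion tests (cvtdq2ps, cvtpd2dq, cvtpd2ps, cvt*udq, cvtt*): a rounding
; argument of 4 selects the default rounding mode (no embedded-rounding operand
; in the CHECK lines), 8 and 10 select {rn-sae} and {ru-sae}, and the
; value-truncating cvtt* forms use a plain {sae} operand.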
4146 declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)
4148 define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4149 ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
4151 ; X64-NEXT: kmovw %edi, %k1
4152 ; X64-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
4153 ; X64-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
4154 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4157 ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
4159 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4160 ; X86-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
4161 ; X86-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
4162 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4164 %cvt = sitofp <16 x i32> %x0 to <16 x float>
4165 %1 = bitcast i16 %x2 to <16 x i1>
4166 %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
4167 %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
4168 %res2 = fadd <16 x float> %2, %3
4169 ret <16 x float> %res2
4172 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4174 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4175 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
4177 ; X64-NEXT: kmovw %edi, %k1
4178 ; X64-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
4179 ; X64-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
4180 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4183 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
4185 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4186 ; X86-NEXT: kmovw %eax, %k1
4187 ; X86-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
4188 ; X86-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
4189 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4191 %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4192 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4193 %res2 = add <8 x i32> %res, %res1
4194 ret <8 x i32> %res2
4197 declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
4199 define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
4200 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
4202 ; X64-NEXT: kmovw %edi, %k1
4203 ; X64-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
4204 ; X64-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
4205 ; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0
4208 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
4210 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4211 ; X86-NEXT: kmovw %eax, %k1
4212 ; X86-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
4213 ; X86-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
4214 ; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0
4216 %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
4217 %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
4218 %res2 = fadd <8 x float> %res, %res1
4219 ret <8 x float> %res2
4222 declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4224 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4225 ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
4227 ; X64-NEXT: kmovw %edi, %k1
4228 ; X64-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
4229 ; X64-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
4230 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4233 ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
4235 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4236 ; X86-NEXT: kmovw %eax, %k1
4237 ; X86-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
4238 ; X86-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
4239 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4241 %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
4242 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4243 %res2 = add <8 x i32> %res, %res1
4244 ret <8 x i32> %res2
4247 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4249 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4250 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
4252 ; X64-NEXT: kmovw %edi, %k1
4253 ; X64-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
4254 ; X64-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
4255 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4258 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
4260 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4261 ; X86-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
4262 ; X86-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
4263 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4265 %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
4266 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4267 %res2 = add <16 x i32> %res, %res1
4268 ret <16 x i32> %res2
4271 declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
4273 define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
4274 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
4276 ; X64-NEXT: kmovw %edi, %k1
4277 ; X64-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
4278 ; X64-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
4279 ; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4282 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
4284 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4285 ; X86-NEXT: kmovw %eax, %k1
4286 ; X86-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
4287 ; X86-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
4288 ; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4290 %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
4291 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
4292 %res2 = fadd <8 x double> %res, %res1
4293 ret <8 x double> %res2
4296 declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
4298 define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4299 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
4301 ; X64-NEXT: kmovw %edi, %k1
4302 ; X64-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
4303 ; X64-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
4304 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4307 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
4309 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4310 ; X86-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
4311 ; X86-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
4312 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4314 %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
4315 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4316 %res2 = add <16 x i32> %res, %res1
4317 ret <16 x i32> %res2
4320 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
4322 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4323 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
4325 ; X64-NEXT: kmovw %edi, %k1
4326 ; X64-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
4327 ; X64-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
4328 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4331 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
4333 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4334 ; X86-NEXT: kmovw %eax, %k1
4335 ; X86-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
4336 ; X86-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
4337 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4339 %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4340 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4341 %res2 = add <8 x i32> %res, %res1
4342 ret <8 x i32> %res2
4345 declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)
4347 define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
4348 ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
4350 ; X64-NEXT: kmovw %edi, %k1
4351 ; X64-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
4352 ; X64-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
4353 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4356 ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
4358 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4359 ; X86-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
4360 ; X86-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
4361 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4363 %cvt = uitofp <16 x i32> %x0 to <16 x float>
4364 %1 = bitcast i16 %x2 to <16 x i1>
4365 %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
4366 %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
4367 %res2 = fadd <16 x float> %2, %3
4368 ret <16 x float> %res2
4371 declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
4373 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
4374 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
4376 ; X64-NEXT: kmovw %edi, %k1
4377 ; X64-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
4378 ; X64-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
4379 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4382 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
4384 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4385 ; X86-NEXT: kmovw %eax, %k1
4386 ; X86-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
4387 ; X86-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
4388 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0
4390 %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
4391 %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
4392 %res2 = add <8 x i32> %res, %res1
4393 ret <8 x i32> %res2
4396 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
4398 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4399 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
4401 ; X64-NEXT: kmovw %edi, %k1
4402 ; X64-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
4403 ; X64-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
4404 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4407 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
4409 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4410 ; X86-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
4411 ; X86-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
4412 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4414 %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
4415 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4416 %res2 = add <16 x i32> %res, %res1
4417 ret <16 x i32> %res2
4420 declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
4422 define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
4423 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
4425 ; X64-NEXT: kmovw %edi, %k1
4426 ; X64-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
4427 ; X64-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
4428 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4431 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
4433 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4434 ; X86-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
4435 ; X86-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
4436 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0
4438 %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
4439 %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
4440 %res2 = add <16 x i32> %res, %res1
4441 ret <16 x i32> %res2
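; Scalar vgetexpss/vgetexpsd tests: merge-masked, zero-masked and unmasked
; forms, with and without the {sae} operand.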
4444 declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
4446 define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
4447 ; X64-LABEL: test_getexp_ss:
4449 ; X64-NEXT: kmovw %edi, %k1
4450 ; X64-NEXT: vmovaps %xmm2, %xmm3
4451 ; X64-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
4452 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
4453 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5
4454 ; X64-NEXT: vaddps %xmm5, %xmm4, %xmm4
4455 ; X64-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4456 ; X64-NEXT: vaddps %xmm2, %xmm3, %xmm0
4457 ; X64-NEXT: vaddps %xmm4, %xmm0, %xmm0
4460 ; X86-LABEL: test_getexp_ss:
4462 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4463 ; X86-NEXT: kmovw %eax, %k1
4464 ; X86-NEXT: vmovaps %xmm2, %xmm3
4465 ; X86-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
4466 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4467 ; X86-NEXT: vaddps %xmm2, %xmm3, %xmm2
4468 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
4469 ; X86-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
4470 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
4471 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
4473 %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
4474 %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
4475 %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
4476 %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
4478 %res.1 = fadd <4 x float> %res0, %res1
4479 %res.2 = fadd <4 x float> %res2, %res3
4480 %res = fadd <4 x float> %res.1, %res.2
4481 ret <4 x float> %res
4484 declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
4486 define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
4487 ; X64-LABEL: test_getexp_sd:
4489 ; X64-NEXT: kmovw %edi, %k1
4490 ; X64-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3
4491 ; X64-NEXT: vmovapd %xmm2, %xmm4
4492 ; X64-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
4493 ; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
4494 ; X64-NEXT: vaddpd %xmm3, %xmm5, %xmm3
4495 ; X64-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4496 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
4497 ; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0
4500 ; X86-LABEL: test_getexp_sd:
4502 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4503 ; X86-NEXT: kmovw %eax, %k1
4504 ; X86-NEXT: vmovapd %xmm2, %xmm3
4505 ; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm3 {%k1}
4506 ; X86-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
4507 ; X86-NEXT: vgetexpsd %xmm1, %xmm0, %xmm2 {%k1}
4508 ; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm2
4509 ; X86-NEXT: vgetexpsd %xmm1, %xmm0, %xmm0
4510 ; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0
4511 ; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4513 %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
4514 %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
4515 %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
4516 %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
4518 %res.1 = fadd <2 x double> %res0, %res1
4519 %res.2 = fadd <2 x double> %res2, %res3
4520 %res = fadd <2 x double> %res.1, %res.2
4521 ret <2 x double> %res
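; Scalar compare tests: llvm.x86.avx512.mask.cmp.sd/ss return the comparison
; result as an i8 mask in %al. The predicate immediates used below map to the
; CHECK mnemonics (2 -> le, 3 -> unord, 4 -> neq, 5 -> nlt), and an sae
; argument of 8 adds {sae}.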
4524 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
4526 define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
4527 ; X64-LABEL: test_int_x86_avx512_mask_cmp_sd:
4529 ; X64-NEXT: kmovw %edi, %k1
4530 ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4531 ; X64-NEXT: kmovw %k0, %eax
4532 ; X64-NEXT: # kill: def $al killed $al killed $eax
4535 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd:
4537 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4538 ; X86-NEXT: kmovw %eax, %k1
4539 ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4540 ; X86-NEXT: kmovw %k0, %eax
4541 ; X86-NEXT: # kill: def $al killed $al killed $eax
4544 %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
4545 ret i8 %res4
4548 define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
4549 ; X64-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
4551 ; X64-NEXT: kmovw %edi, %k1
4552 ; X64-NEXT: vcmplesd %xmm1, %xmm0, %k0
4553 ; X64-NEXT: kmovw %k0, %ecx
4554 ; X64-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
4555 ; X64-NEXT: kmovw %k0, %edx
4556 ; X64-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
4557 ; X64-NEXT: kmovw %k0, %esi
4558 ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4559 ; X64-NEXT: kmovw %k0, %eax
4560 ; X64-NEXT: orl %esi, %eax
4561 ; X64-NEXT: orl %edx, %eax
4562 ; X64-NEXT: orl %ecx, %eax
4563 ; X64-NEXT: # kill: def $al killed $al killed $eax
4566 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
4568 ; X86-NEXT: pushl %esi
4569 ; X86-NEXT: .cfi_def_cfa_offset 8
4570 ; X86-NEXT: .cfi_offset %esi, -8
4571 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4572 ; X86-NEXT: kmovw %eax, %k1
4573 ; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0
4574 ; X86-NEXT: kmovw %k0, %ecx
4575 ; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
4576 ; X86-NEXT: kmovw %k0, %edx
4577 ; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
4578 ; X86-NEXT: kmovw %k0, %esi
4579 ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
4580 ; X86-NEXT: kmovw %k0, %eax
4581 ; X86-NEXT: orl %esi, %eax
4582 ; X86-NEXT: orl %edx, %eax
4583 ; X86-NEXT: orl %ecx, %eax
4584 ; X86-NEXT: # kill: def $al killed $al killed $eax
4585 ; X86-NEXT: popl %esi
4586 ; X86-NEXT: .cfi_def_cfa_offset 4
4589 %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
4590 %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
4591 %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
4592 %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
4594 %res11 = or i8 %res1, %res2
4595 %res12 = or i8 %res3, %res4
4596 %res13 = or i8 %res11, %res12
4597 ret i8 %res13
4600 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
4602 define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
4603 ; X64-LABEL: test_int_x86_avx512_mask_cmp_ss:
4605 ; X64-NEXT: kmovw %edi, %k1
4606 ; X64-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
4607 ; X64-NEXT: kmovw %k0, %eax
4608 ; X64-NEXT: # kill: def $al killed $al killed $eax
4611 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss:
4613 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4614 ; X86-NEXT: kmovw %eax, %k1
4615 ; X86-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
4616 ; X86-NEXT: kmovw %k0, %eax
4617 ; X86-NEXT: # kill: def $al killed $al killed $eax
4620 %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
4621 ret i8 %res2
4625 define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
4626 ; X64-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
4628 ; X64-NEXT: kmovw %edi, %k1
4629 ; X64-NEXT: vcmpless %xmm1, %xmm0, %k0
4630 ; X64-NEXT: kmovw %k0, %ecx
4631 ; X64-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
4632 ; X64-NEXT: kmovw %k0, %edx
4633 ; X64-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
4634 ; X64-NEXT: kmovw %k0, %esi
4635 ; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
4636 ; X64-NEXT: kmovw %k0, %eax
4637 ; X64-NEXT: andl %esi, %eax
4638 ; X64-NEXT: andl %edx, %eax
4639 ; X64-NEXT: andl %ecx, %eax
4640 ; X64-NEXT: # kill: def $al killed $al killed $eax
4643 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
4645 ; X86-NEXT: pushl %esi
4646 ; X86-NEXT: .cfi_def_cfa_offset 8
4647 ; X86-NEXT: .cfi_offset %esi, -8
4648 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4649 ; X86-NEXT: kmovw %eax, %k1
4650 ; X86-NEXT: vcmpless %xmm1, %xmm0, %k0
4651 ; X86-NEXT: kmovw %k0, %ecx
4652 ; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
4653 ; X86-NEXT: kmovw %k0, %edx
4654 ; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
4655 ; X86-NEXT: kmovw %k0, %esi
4656 ; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
4657 ; X86-NEXT: kmovw %k0, %eax
4658 ; X86-NEXT: andl %esi, %eax
4659 ; X86-NEXT: andl %edx, %eax
4660 ; X86-NEXT: andl %ecx, %eax
4661 ; X86-NEXT: # kill: def $al killed $al killed $eax
4662 ; X86-NEXT: popl %esi
4663 ; X86-NEXT: .cfi_def_cfa_offset 4
4665 %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
4666 %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
4667 %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
4668 %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
4670 %res11 = and i8 %res1, %res2
4671 %res12 = and i8 %res3, %res4
4672 %res13 = and i8 %res11, %res12
4673 ret i8 %res13
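; getmant tests: packed, scalar, zero-masked and load forms, with and without
; {sae}; the $11 immediate encodes the normalization interval and sign control.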
4676 declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
4678 define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
4679 ; X64-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
4681 ; X64-NEXT: kmovw %edi, %k1
4682 ; X64-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
4683 ; X64-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
4684 ; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4687 ; X86-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
4689 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4690 ; X86-NEXT: kmovw %eax, %k1
4691 ; X86-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
4692 ; X86-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
4693 ; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4695 %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
4696 %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
4697 %res2 = fadd <8 x double> %res, %res1
4698 ret <8 x double> %res2
4701 declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
4703 define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
4704 ; X64-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
4706 ; X64-NEXT: kmovw %edi, %k1
4707 ; X64-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
4708 ; X64-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
4709 ; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0
4712 ; X86-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
4714 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4715 ; X86-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
4716 ; X86-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
4717 ; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0
4719 %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
4720 %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
4721 %res2 = fadd <16 x float> %res, %res1
4722 ret <16 x float> %res2
4725 declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
4727 define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
4728 ; X64-LABEL: test_int_x86_avx512_mask_getmant_sd:
4730 ; X64-NEXT: kmovw %edi, %k1
4731 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3
4732 ; X64-NEXT: vmovapd %xmm2, %xmm4
4733 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
4734 ; X64-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
4735 ; X64-NEXT: vaddpd %xmm5, %xmm4, %xmm4
4736 ; X64-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4737 ; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0
4738 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
4741 ; X86-LABEL: test_int_x86_avx512_mask_getmant_sd:
4743 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4744 ; X86-NEXT: kmovw %eax, %k1
4745 ; X86-NEXT: vmovapd %xmm2, %xmm3
4746 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
4747 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
4748 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
4749 ; X86-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
4750 ; X86-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
4751 ; X86-NEXT: vaddpd %xmm4, %xmm2, %xmm0
4752 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
4754 %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
4755 %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
4756 %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
4757 %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
4758 %res11 = fadd <2 x double> %res, %res1
4759 %res12 = fadd <2 x double> %res2, %res3
4760 %res13 = fadd <2 x double> %res11, %res12
4761 ret <2 x double> %res13
4764 declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
4766 define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
4767 ; X64-LABEL: test_int_x86_avx512_mask_getmant_ss:
4769 ; X64-NEXT: kmovw %edi, %k1
4770 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
4771 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
4772 ; X64-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
4773 ; X64-NEXT: vaddps %xmm4, %xmm2, %xmm2
4774 ; X64-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
4775 ; X64-NEXT: vaddps %xmm3, %xmm0, %xmm0
4776 ; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
4779 ; X86-LABEL: test_int_x86_avx512_mask_getmant_ss:
4781 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4782 ; X86-NEXT: kmovw %eax, %k1
4783 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
4784 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
4785 ; X86-NEXT: vaddps %xmm3, %xmm2, %xmm2
4786 ; X86-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
4787 ; X86-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
4788 ; X86-NEXT: vaddps %xmm3, %xmm0, %xmm0
4789 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
4791 %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
4792 %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
4793 %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
4794 %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
4795 %res11 = fadd <4 x float> %res, %res1
4796 %res12 = fadd <4 x float> %res2, %res3
4797 %res13 = fadd <4 x float> %res11, %res12
4798 ret <4 x float> %res13
4801 define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, <4 x float>* %x1p) {
4802 ; X64-LABEL: test_int_x86_avx512_mask_getmant_ss_load:
4804 ; X64-NEXT: vgetmantss $11, (%rdi), %xmm0, %xmm0
4807 ; X86-LABEL: test_int_x86_avx512_mask_getmant_ss_load:
4809 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
4810 ; X86-NEXT: vgetmantss $11, (%eax), %xmm0, %xmm0
4812 %x1 = load <4 x float>, <4 x float>* %x1p
4813 %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 4)
4814 ret <4 x float> %res
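; vpermilvar tests: masked and zero-masked forms are modelled as the unmasked
; intrinsic followed by a select on the bitcast mask, including constant-pool
; index vectors that should get shuffle decode comments.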
4817 declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
4819 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) {
4820 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512:
4822 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
4823 ; CHECK-NEXT: ret{{[l|q]}}
4824 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4825 ret <8 x double> %res
4828 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) {
4829 ; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
4831 ; X64-NEXT: kmovw %edi, %k1
4832 ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
4833 ; X64-NEXT: vmovapd %zmm2, %zmm0
4836 ; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
4838 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4839 ; X86-NEXT: kmovw %eax, %k1
4840 ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
4841 ; X86-NEXT: vmovapd %zmm2, %zmm0
4843 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4844 %mask.cast = bitcast i8 %mask to <8 x i1>
4845 %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
4846 ret <8 x double> %res2
4849 define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) {
4850 ; X64-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
4852 ; X64-NEXT: kmovw %edi, %k1
4853 ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
4856 ; X86-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
4858 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4859 ; X86-NEXT: kmovw %eax, %k1
4860 ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
4862 %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
4863 %mask.cast = bitcast i8 %mask to <8 x i1>
4864 %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
4865 ret <8 x double> %res2
4868 declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
4870 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) {
4871 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512:
4873 ; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0
4874 ; CHECK-NEXT: ret{{[l|q]}}
4875 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4876 ret <16 x float> %res
4879 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
4880 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
4882 ; X64-NEXT: kmovw %edi, %k1
4883 ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
4884 ; X64-NEXT: vmovaps %zmm2, %zmm0
4887 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
4889 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4890 ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
4891 ; X86-NEXT: vmovaps %zmm2, %zmm0
4893 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4894 %mask.cast = bitcast i16 %mask to <16 x i1>
4895 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
4896 ret <16 x float> %res2
4899 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
4900 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
4902 ; X64-NEXT: kmovw %edi, %k1
4903 ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
4906 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
4908 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4909 ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
4911 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
4912 %mask.cast = bitcast i16 %mask to <16 x i1>
4913 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
4914 ret <16 x float> %res2
4917 ; Test case to make sure we can print shuffle decode comments for constant pool loads.
4918 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) {
4919 ; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool:
4921 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4922 ; CHECK-NEXT: ret{{[l|q]}}
4923 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4924 ret <16 x float> %res
4927 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
4928 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
4930 ; X64-NEXT: kmovw %edi, %k1
4931 ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4932 ; X64-NEXT: vmovaps %zmm2, %zmm0
4935 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
4937 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4938 ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4939 ; X86-NEXT: vmovaps %zmm2, %zmm0
4941 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4942 %mask.cast = bitcast i16 %mask to <16 x i1>
4943 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
4944 ret <16 x float> %res2
4947 define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
4948 ; X64-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
4950 ; X64-NEXT: kmovw %edi, %k1
4951 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4954 ; X86-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
4956 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
4957 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
4959 %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
4960 %mask.cast = bitcast i16 %mask to <16 x i1>
4961 %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
4962 ret <16 x float> %res2
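; Scalar cvtss2sd/cvtsd2ss tests with explicit rounding arguments; the checks
; below show 8 lowering to {sae}/{rn-sae} and 11 to {rz-sae}.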
4965 declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
4967 define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
4968 ; X64-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
4970 ; X64-NEXT: kmovw %edi, %k1
4971 ; X64-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
4972 ; X64-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
4973 ; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4976 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
4978 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4979 ; X86-NEXT: kmovw %eax, %k1
4980 ; X86-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
4981 ; X86-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
4982 ; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0
4984 %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
4985 %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
4986 %res2 = fadd <2 x double> %res, %res1
4987 ret <2 x double> %res2
4990 declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
4992 define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
4993 ; X64-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
4995 ; X64-NEXT: kmovw %edi, %k1
4996 ; X64-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
4997 ; X64-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
4998 ; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0
5001 ; X86-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
5003 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5004 ; X86-NEXT: kmovw %eax, %k1
5005 ; X86-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5006 ; X86-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
5007 ; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0
5009 %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
5010 %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
5011 %res2 = fadd <4 x float> %res, %res1
5012 ret <4 x float> %res2
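; pternlog tests: $33 is the ternary truth-table immediate; masked and
; zero-masked forms are built from the unmasked intrinsic plus a select.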
5015 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
5017 define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
5018 ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
5020 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5021 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5022 ; X64-NEXT: kmovw %edi, %k1
5023 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
5024 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5027 ; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
5029 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5030 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5031 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5032 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
5033 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5035 %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5036 %2 = bitcast i16 %x4 to <16 x i1>
5037 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
5038 %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5039 %res2 = add <16 x i32> %3, %4
5040 ret <16 x i32> %res2
5043 define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
5044 ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
5046 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5047 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5048 ; X64-NEXT: kmovw %edi, %k1
5049 ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5050 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5053 ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
5055 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5056 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
5057 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5058 ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5059 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5061 %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5062 %2 = bitcast i16 %x4 to <16 x i1>
5063 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
5064 %4 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
5065 %res2 = add <16 x i32> %3, %4
5066 ret <16 x i32> %res2
5069 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
5071 define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
5072 ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
5074 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5075 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5076 ; X64-NEXT: kmovw %edi, %k1
5077 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
5078 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5081 ; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
5083 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5084 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5085 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5086 ; X86-NEXT: kmovw %eax, %k1
5087 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
5088 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5090 %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5091 %2 = bitcast i8 %x4 to <8 x i1>
5092 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
5093 %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5094 %res2 = add <8 x i64> %3, %4
5095 ret <8 x i64> %res2
5098 define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
5099 ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
5101 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3
5102 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5103 ; X64-NEXT: kmovw %edi, %k1
5104 ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5105 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5108 ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
5110 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3
5111 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
5112 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5113 ; X86-NEXT: kmovw %eax, %k1
5114 ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
5115 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5117 %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5118 %2 = bitcast i8 %x4 to <8 x i1>
5119 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
5120 %4 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
5121 %res2 = add <8 x i64> %3, %4
5122 ret <8 x i64> %res2
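; comi/ucomi scalar compare tests: predicates 0/1/8/9 (eq, lt, eq_uq, nge),
; each with and without {sae}, lowered to vcmp*sd/ss plus kmovw.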
5125 define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
5126 ; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
5128 ; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
5129 ; CHECK-NEXT: kmovw %k0, %eax
5130 ; CHECK-NEXT: ret{{[l|q]}}
5131 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
5132 ret i32 %res
5135 define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
5136 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
5138 ; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
5139 ; CHECK-NEXT: kmovw %k0, %eax
5140 ; CHECK-NEXT: ret{{[l|q]}}
5141 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
5142 ret i32 %res
5145 define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
5146 ; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
5148 ; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
5149 ; CHECK-NEXT: kmovw %k0, %eax
5150 ; CHECK-NEXT: ret{{[l|q]}}
5151 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
5152 ret i32 %res
5155 define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
5156 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
5158 ; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
5159 ; CHECK-NEXT: kmovw %k0, %eax
5160 ; CHECK-NEXT: ret{{[l|q]}}
5161 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
5162 ret i32 %res
5165 define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
5166 ; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
5168 ; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
5169 ; CHECK-NEXT: kmovw %k0, %eax
5170 ; CHECK-NEXT: ret{{[l|q]}}
5171 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
5172 ret i32 %res
5175 define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
5176 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
5178 ; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
5179 ; CHECK-NEXT: kmovw %k0, %eax
5180 ; CHECK-NEXT: ret{{[l|q]}}
5181 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
5182 ret i32 %res
5185 define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
5186 ; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
5188 ; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
5189 ; CHECK-NEXT: kmovw %k0, %eax
5190 ; CHECK-NEXT: ret{{[l|q]}}
5191 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
5192 ret i32 %res
5195 define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
5196 ; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
5198 ; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
5199 ; CHECK-NEXT: kmovw %k0, %eax
5200 ; CHECK-NEXT: ret{{[l|q]}}
5201 %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
5202 ret i32 %res
5205 declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
5207 define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
5208 ; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
5210 ; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
5211 ; CHECK-NEXT: kmovw %k0, %eax
5212 ; CHECK-NEXT: ret{{[l|q]}}
5213 %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
5214 ret i32 %res
5217 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
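; Variable cross-lane permute (permvar) tests: the unmasked intrinsic is used
; merge-masked, zero-masked and unmasked, and the three results are summed.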
5219 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
5221 define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
5222 ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512:
5224 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3
5225 ; X64-NEXT: kmovw %edi, %k1
5226 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
5227 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
5228 ; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0
5229 ; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0
5232 ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512:
5234 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3
5235 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5236 ; X86-NEXT: kmovw %eax, %k1
5237 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
5238 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
5239 ; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0
5240 ; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0
5242 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5243 %2 = bitcast i8 %x3 to <8 x i1>
5244 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
5245 %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5246 %5 = bitcast i8 %x3 to <8 x i1>
5247 %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer
5248 %7 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
5249 %res3 = fadd <8 x double> %3, %6
5250 %res4 = fadd <8 x double> %res3, %7
5251 ret <8 x double> %res4
5254 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
5256 define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
5257 ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512:
5259 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3
5260 ; X64-NEXT: kmovw %edi, %k1
5261 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
5262 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
5263 ; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5264 ; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
5267 ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512:
5269 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3
5270 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5271 ; X86-NEXT: kmovw %eax, %k1
5272 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
5273 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
5274 ; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0
5275 ; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0
5277 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5278 %2 = bitcast i8 %x3 to <8 x i1>
5279 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
5280 %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5281 %5 = bitcast i8 %x3 to <8 x i1>
5282 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5283 %7 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
5284 %res3 = add <8 x i64> %3, %6
5285 %res4 = add <8 x i64> %res3, %7
5286 ret <8 x i64> %res4
5289 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
5291 define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
5292 ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
5294 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3
5295 ; X64-NEXT: kmovw %edi, %k1
5296 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
5297 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
5298 ; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0
5299 ; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0
5302 ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
5304 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3
5305 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5306 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
5307 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
5308 ; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0
5309 ; X86-NEXT: vaddps %zmm3, %zmm0, %zmm0
5311 %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5312 %2 = bitcast i16 %x3 to <16 x i1>
5313 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
5314 %4 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5315 %5 = bitcast i16 %x3 to <16 x i1>
5316 %6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer
5317 %7 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
5318 %res3 = fadd <16 x float> %3, %6
5319 %res4 = fadd <16 x float> %res3, %7
5320 ret <16 x float> %res4
5323 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
5325 define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
5326 ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512:
5328 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3
5329 ; X64-NEXT: kmovw %edi, %k1
5330 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
5331 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
5332 ; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5333 ; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0
5336 ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512:
5338 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3
5339 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5340 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
5341 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
5342 ; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0
5343 ; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0
5345 %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5346 %2 = bitcast i16 %x3 to <16 x i1>
5347 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
5348 %4 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5349 %5 = bitcast i16 %x3 to <16 x i1>
5350 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
5351 %7 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
5352 %res3 = add <16 x i32> %3, %6
5353 %res4 = add <16 x i32> %res3, %7
5354 ret <16 x i32> %res4
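; fixupimm tests covering packed and scalar forms, merge and zero masking,
; a memory operand and the {sae} variants.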
5357 declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
5359 define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
5360 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
5362 ; X64-NEXT: kmovw %edi, %k1
5363 ; X64-NEXT: vmovapd %zmm0, %zmm3
5364 ; X64-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
5365 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5366 ; X64-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5367 ; X64-NEXT: vaddpd %zmm4, %zmm3, %zmm3
5368 ; X64-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
5369 ; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5372 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
5374 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5375 ; X86-NEXT: kmovw %eax, %k1
5376 ; X86-NEXT: vmovapd %zmm0, %zmm3
5377 ; X86-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
5378 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5379 ; X86-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5380 ; X86-NEXT: vaddpd %zmm4, %zmm3, %zmm3
5381 ; X86-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
5382 ; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5384 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
5385 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
5386 %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
5387 %res3 = fadd <8 x double> %res, %res1
5388 %res4 = fadd <8 x double> %res3, %res2
5389 ret <8 x double> %res4
5392 define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, <8 x i64>* %x2ptr) {
5393 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
5395 ; X64-NEXT: vfixupimmpd $3, (%rdi), %zmm1, %zmm0
5398 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512_load:
5400 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5401 ; X86-NEXT: vfixupimmpd $3, (%eax), %zmm1, %zmm0
5403 %x2 = load <8 x i64>, <8 x i64>* %x2ptr
5404 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
5405 ret <8 x double> %res
5408 declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
5410 define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
5411 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
5413 ; X64-NEXT: kmovw %edi, %k1
5414 ; X64-NEXT: vmovapd %zmm0, %zmm3
5415 ; X64-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
5416 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5417 ; X64-NEXT: vmovapd %zmm0, %zmm5
5418 ; X64-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
5419 ; X64-NEXT: vaddpd %zmm5, %zmm3, %zmm3
5420 ; X64-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
5421 ; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5424 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
5426 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5427 ; X86-NEXT: kmovw %eax, %k1
5428 ; X86-NEXT: vmovapd %zmm0, %zmm3
5429 ; X86-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
5430 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5431 ; X86-NEXT: vmovapd %zmm0, %zmm5
5432 ; X86-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
5433 ; X86-NEXT: vaddpd %zmm5, %zmm3, %zmm3
5434 ; X86-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
5435 ; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0
5437 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
5438 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
5439 %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
5440 %res3 = fadd <8 x double> %res, %res1
5441 %res4 = fadd <8 x double> %res3, %res2
5442 ret <8 x double> %res4
5445 declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
5447 define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
5448 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
5450 ; X64-NEXT: kmovw %edi, %k1
5451 ; X64-NEXT: vmovaps %xmm0, %xmm3
5452 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
5453 ; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
5454 ; X64-NEXT: vmovaps %xmm0, %xmm5
5455 ; X64-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
5456 ; X64-NEXT: vaddps %xmm5, %xmm3, %xmm3
5457 ; X64-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
5458 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5461 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
5463 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5464 ; X86-NEXT: kmovw %eax, %k1
5465 ; X86-NEXT: vmovaps %xmm0, %xmm3
5466 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
5467 ; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
5468 ; X86-NEXT: vmovaps %xmm0, %xmm5
5469 ; X86-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
5470 ; X86-NEXT: vaddps %xmm5, %xmm3, %xmm3
5471 ; X86-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
5472 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5474 %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
5475 %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
5476 %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
5477 %res3 = fadd <4 x float> %res, %res1
5478 %res4 = fadd <4 x float> %res3, %res2
5479 ret <4 x float> %res4
5482 declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
5484 define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
5485 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
5487 ; X64-NEXT: kmovw %edi, %k1
5488 ; X64-NEXT: vmovaps %xmm0, %xmm3
5489 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3
5490 ; X64-NEXT: vmovaps %xmm0, %xmm4
5491 ; X64-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z}
5492 ; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
5493 ; X64-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5494 ; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0
5495 ; X64-NEXT: vaddps %xmm3, %xmm0, %xmm0
5498 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
5500 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5501 ; X86-NEXT: kmovw %eax, %k1
5502 ; X86-NEXT: vmovaps %xmm0, %xmm3
5503 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5504 ; X86-NEXT: vmovaps %xmm0, %xmm4
5505 ; X86-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
5506 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
5507 ; X86-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5508 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5509 ; X86-NEXT: vaddps %xmm4, %xmm0, %xmm0
5511 %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
5512 %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
5513 %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
5514 %res3 = fadd <4 x float> %res, %res1
5515 %res4 = fadd <4 x float> %res3, %res2
5516 ret <4 x float> %res4
5519 declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
5521 define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
5522 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
5524 ; X64-NEXT: kmovw %edi, %k1
5525 ; X64-NEXT: vmovaps %zmm0, %zmm3
5526 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
5527 ; X64-NEXT: vxorps %xmm4, %xmm4, %xmm4
5528 ; X64-NEXT: vmovaps %zmm0, %zmm5
5529 ; X64-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
5530 ; X64-NEXT: vaddps %zmm5, %zmm3, %zmm3
5531 ; X64-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
5532 ; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0
5535 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
5537 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5538 ; X86-NEXT: vmovaps %zmm0, %zmm3
5539 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
5540 ; X86-NEXT: vxorps %xmm4, %xmm4, %xmm4
5541 ; X86-NEXT: vmovaps %zmm0, %zmm5
5542 ; X86-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
5543 ; X86-NEXT: vaddps %zmm5, %zmm3, %zmm3
5544 ; X86-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
5545 ; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
5547 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
5548 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
5549 %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
5550 %res3 = fadd <16 x float> %res, %res1
5551 %res4 = fadd <16 x float> %res3, %res2
5552 ret <16 x float> %res4
5555 define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, <16 x i32>* %x2ptr) {
5556 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
5558 ; X64-NEXT: vfixupimmps $5, (%rdi), %zmm1, %zmm0
5561 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512_load:
5563 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5564 ; X86-NEXT: vfixupimmps $5, (%eax), %zmm1, %zmm0
5566 %x2 = load <16 x i32>, <16 x i32>* %x2ptr
5567 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
5568 ret <16 x float> %res
5571 declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
5573 define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
5574 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
5576 ; X64-NEXT: kmovw %edi, %k1
5577 ; X64-NEXT: vmovaps %zmm0, %zmm3
5578 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3
5579 ; X64-NEXT: vmovaps %zmm0, %zmm4
5580 ; X64-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
5581 ; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
5582 ; X64-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
5583 ; X64-NEXT: vaddps %zmm0, %zmm4, %zmm0
5584 ; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0
5587 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
5589 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5590 ; X86-NEXT: vmovaps %zmm0, %zmm3
5591 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
5592 ; X86-NEXT: vmovaps %zmm0, %zmm4
5593 ; X86-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4
5594 ; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
5595 ; X86-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
5596 ; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0
5597 ; X86-NEXT: vaddps %zmm4, %zmm0, %zmm0
5599 %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
5600 %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
5601 %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
5602 %res3 = fadd <16 x float> %res, %res1
5603 %res4 = fadd <16 x float> %res3, %res2
5604 ret <16 x float> %res4
5607 declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
5609 define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
5610 ; X64-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
5612 ; X64-NEXT: kmovw %edi, %k1
5613 ; X64-NEXT: vmovapd %xmm0, %xmm3
5614 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3
5615 ; X64-NEXT: vmovapd %xmm0, %xmm4
5616 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1}
5617 ; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
5618 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
5619 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5620 ; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0
5623 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
5625 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5626 ; X86-NEXT: kmovw %eax, %k1
5627 ; X86-NEXT: vmovapd %xmm0, %xmm3
5628 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
5629 ; X86-NEXT: vmovapd %xmm0, %xmm4
5630 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
5631 ; X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2
5632 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
5633 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5634 ; X86-NEXT: vaddpd %xmm4, %xmm0, %xmm0
5636 %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
5637 %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
5638 %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
5639 %res3 = fadd <2 x double> %res, %res1
5640 %res4 = fadd <2 x double> %res3, %res2
5641 ret <2 x double> %res4
5644 declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
5646 define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
5647 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
5649 ; X64-NEXT: kmovw %edi, %k1
5650 ; X64-NEXT: vmovapd %xmm0, %xmm3
5651 ; X64-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5652 ; X64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5653 ; X64-NEXT: vmovapd %xmm0, %xmm5
5654 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
5655 ; X64-NEXT: vaddpd %xmm5, %xmm3, %xmm3
5656 ; X64-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5657 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5660 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
5662 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5663 ; X86-NEXT: kmovw %eax, %k1
5664 ; X86-NEXT: vmovapd %xmm0, %xmm3
5665 ; X86-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
5666 ; X86-NEXT: vxorpd %xmm4, %xmm4, %xmm4
5667 ; X86-NEXT: vmovapd %xmm0, %xmm5
5668 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
5669 ; X86-NEXT: vaddpd %xmm5, %xmm3, %xmm3
5670 ; X86-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5671 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5673 %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
5674 %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
5675 %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
5676 %res3 = fadd <2 x double> %res, %res1
5677 %res4 = fadd <2 x double> %res3, %res2
5678 ret <2 x double> %res4
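; Scalar FMA tests: element 0 is extracted, fused via llvm.fma.f64/f32 or the
; llvm.x86.avx512.vfmadd.f64/f32 rounding variant (i32 11 = {rz-sae}) and
; reinserted to model the masked sd/ss forms.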
5681 declare double @llvm.fma.f64(double, double, double) #1
5682 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0
5684 define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5685 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
5687 ; X64-NEXT: vmovapd %xmm0, %xmm3
5688 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5689 ; X64-NEXT: kmovw %edi, %k1
5690 ; X64-NEXT: vmovapd %xmm0, %xmm4
5691 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5692 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5693 ; X64-NEXT: vmovapd %xmm0, %xmm4
5694 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
5695 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5696 ; X64-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5697 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5700 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
5702 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5703 ; X86-NEXT: vmovapd %xmm0, %xmm3
5704 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5705 ; X86-NEXT: kmovw %eax, %k1
5706 ; X86-NEXT: vmovapd %xmm0, %xmm4
5707 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5708 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5709 ; X86-NEXT: vmovapd %xmm0, %xmm4
5710 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
5711 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5712 ; X86-NEXT: vaddpd %xmm0, %xmm4, %xmm0
5713 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5715 %1 = extractelement <2 x double> %x0, i64 0
5716 %2 = extractelement <2 x double> %x1, i64 0
5717 %3 = extractelement <2 x double> %x2, i64 0
5718 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5719 %5 = insertelement <2 x double> %x0, double %4, i64 0
5720 %6 = extractelement <2 x double> %x0, i64 0
5721 %7 = extractelement <2 x double> %x1, i64 0
5722 %8 = extractelement <2 x double> %x2, i64 0
5723 %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
5724 %10 = bitcast i8 %x3 to <8 x i1>
5725 %11 = extractelement <8 x i1> %10, i64 0
5726 %12 = select i1 %11, double %9, double %6
5727 %13 = insertelement <2 x double> %x0, double %12, i64 0
5728 %14 = extractelement <2 x double> %x0, i64 0
5729 %15 = extractelement <2 x double> %x1, i64 0
5730 %16 = extractelement <2 x double> %x2, i64 0
5731 %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
5732 %18 = insertelement <2 x double> %x0, double %17, i64 0
5733 %19 = extractelement <2 x double> %x0, i64 0
5734 %20 = extractelement <2 x double> %x1, i64 0
5735 %21 = extractelement <2 x double> %x2, i64 0
5736 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
5737 %23 = bitcast i8 %x3 to <8 x i1>
5738 %24 = extractelement <8 x i1> %23, i64 0
5739 %25 = select i1 %24, double %22, double %19
5740 %26 = insertelement <2 x double> %x0, double %25, i64 0
5741 %res4 = fadd <2 x double> %5, %13
5742 %res5 = fadd <2 x double> %18, %26
5743 %res6 = fadd <2 x double> %res4, %res5
5744 ret <2 x double> %res6
5747 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5748 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
5750 ; X64-NEXT: vmovaps %xmm0, %xmm3
5751 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5752 ; X64-NEXT: kmovw %edi, %k1
5753 ; X64-NEXT: vmovaps %xmm0, %xmm4
5754 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5755 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
5756 ; X64-NEXT: vmovaps %xmm0, %xmm4
5757 ; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
5758 ; X64-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5759 ; X64-NEXT: vaddps %xmm0, %xmm4, %xmm0
5760 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5763 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
5765 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5766 ; X86-NEXT: vmovaps %xmm0, %xmm3
5767 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5768 ; X86-NEXT: kmovw %eax, %k1
5769 ; X86-NEXT: vmovaps %xmm0, %xmm4
5770 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm4 = (xmm1 * xmm4) + xmm2
5771 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
5772 ; X86-NEXT: vmovaps %xmm0, %xmm4
5773 ; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
5774 ; X86-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5775 ; X86-NEXT: vaddps %xmm0, %xmm4, %xmm0
5776 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5778 %1 = extractelement <4 x float> %x0, i64 0
5779 %2 = extractelement <4 x float> %x1, i64 0
5780 %3 = extractelement <4 x float> %x2, i64 0
5781 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5782 %5 = insertelement <4 x float> %x0, float %4, i64 0
5783 %6 = extractelement <4 x float> %x0, i64 0
5784 %7 = extractelement <4 x float> %x1, i64 0
5785 %8 = extractelement <4 x float> %x2, i64 0
5786 %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
5787 %10 = bitcast i8 %x3 to <8 x i1>
5788 %11 = extractelement <8 x i1> %10, i64 0
5789 %12 = select i1 %11, float %9, float %6
5790 %13 = insertelement <4 x float> %x0, float %12, i64 0
5791 %14 = extractelement <4 x float> %x0, i64 0
5792 %15 = extractelement <4 x float> %x1, i64 0
5793 %16 = extractelement <4 x float> %x2, i64 0
5794 %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
5795 %18 = insertelement <4 x float> %x0, float %17, i64 0
5796 %19 = extractelement <4 x float> %x0, i64 0
5797 %20 = extractelement <4 x float> %x1, i64 0
5798 %21 = extractelement <4 x float> %x2, i64 0
5799 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
5800 %23 = bitcast i8 %x3 to <8 x i1>
5801 %24 = extractelement <8 x i1> %23, i64 0
5802 %25 = select i1 %24, float %22, float %19
5803 %26 = insertelement <4 x float> %x0, float %25, i64 0
5804 %res4 = fadd <4 x float> %5, %13
5805 %res5 = fadd <4 x float> %18, %26
5806 %res6 = fadd <4 x float> %res4, %res5
5807 ret <4 x float> %res6
5810 define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5811 ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
5813 ; X64-NEXT: kmovw %edi, %k1
5814 ; X64-NEXT: vmovapd %xmm0, %xmm3
5815 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5816 ; X64-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5817 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5820 ; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
5822 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5823 ; X86-NEXT: kmovw %eax, %k1
5824 ; X86-NEXT: vmovapd %xmm0, %xmm3
5825 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm3 = (xmm1 * xmm3) + xmm2
5826 ; X86-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5827 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5829 %1 = extractelement <2 x double> %x0, i64 0
5830 %2 = extractelement <2 x double> %x1, i64 0
5831 %3 = extractelement <2 x double> %x2, i64 0
5832 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5833 %5 = bitcast i8 %x3 to <8 x i1>
5834 %6 = extractelement <8 x i1> %5, i64 0
5835 %7 = select i1 %6, double %4, double 0.000000e+00
5836 %8 = insertelement <2 x double> %x0, double %7, i64 0
5837 %9 = extractelement <2 x double> %x0, i64 0
5838 %10 = extractelement <2 x double> %x1, i64 0
5839 %11 = extractelement <2 x double> %x2, i64 0
5840 %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11)
5841 %13 = bitcast i8 %x3 to <8 x i1>
5842 %14 = extractelement <8 x i1> %13, i64 0
5843 %15 = select i1 %14, double %12, double 0.000000e+00
5844 %16 = insertelement <2 x double> %x0, double %15, i64 0
5845 %res2 = fadd <2 x double> %8, %16
5846 ret <2 x double> %res2
5849 declare float @llvm.fma.f32(float, float, float) #1
5850 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0
5852 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5853 ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
5855 ; X64-NEXT: kmovw %edi, %k1
5856 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5859 ; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
5861 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5862 ; X86-NEXT: kmovw %eax, %k1
5863 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5865 %1 = extractelement <4 x float> %x0, i64 0
5866 %2 = extractelement <4 x float> %x1, i64 0
5867 %3 = extractelement <4 x float> %x2, i64 0
5868 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5869 %5 = bitcast i8 %x3 to <8 x i1>
5870 %6 = extractelement <8 x i1> %5, i64 0
5871 %7 = select i1 %6, float %4, float 0.000000e+00
5872 %8 = insertelement <4 x float> %x0, float %7, i64 0
5873 %9 = extractelement <4 x float> %x0, i64 0
5874 %10 = extractelement <4 x float> %x1, i64 0
5875 %11 = extractelement <4 x float> %x2, i64 0
5876 %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11)
5877 %13 = bitcast i8 %x3 to <8 x i1>
5878 %14 = extractelement <8 x i1> %13, i64 0
5879 %15 = select i1 %14, float %12, float 0.000000e+00
5880 %16 = insertelement <4 x float> %x0, float %15, i64 0
5881 %res2 = fadd <4 x float> %8, %16
5882 ret <4 x float> %res2
5885 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
5886 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
5888 ; X64-NEXT: vmovapd %xmm2, %xmm3
5889 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5890 ; X64-NEXT: kmovw %edi, %k1
5891 ; X64-NEXT: vmovapd %xmm2, %xmm4
5892 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5893 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5894 ; X64-NEXT: vmovapd %xmm2, %xmm4
5895 ; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
5896 ; X64-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5897 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
5898 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5901 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
5903 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5904 ; X86-NEXT: vmovapd %xmm2, %xmm3
5905 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5906 ; X86-NEXT: kmovw %eax, %k1
5907 ; X86-NEXT: vmovapd %xmm2, %xmm4
5908 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5909 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
5910 ; X86-NEXT: vmovapd %xmm2, %xmm4
5911 ; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
5912 ; X86-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5913 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
5914 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
5916 %1 = extractelement <2 x double> %x0, i64 0
5917 %2 = extractelement <2 x double> %x1, i64 0
5918 %3 = extractelement <2 x double> %x2, i64 0
5919 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
5920 %5 = insertelement <2 x double> %x2, double %4, i64 0
5921 %6 = extractelement <2 x double> %x0, i64 0
5922 %7 = extractelement <2 x double> %x1, i64 0
5923 %8 = extractelement <2 x double> %x2, i64 0
5924 %9 = call double @llvm.fma.f64(double %6, double %7, double %8)
5925 %10 = bitcast i8 %x3 to <8 x i1>
5926 %11 = extractelement <8 x i1> %10, i64 0
5927 %12 = select i1 %11, double %9, double %8
5928 %13 = insertelement <2 x double> %x2, double %12, i64 0
5929 %14 = extractelement <2 x double> %x0, i64 0
5930 %15 = extractelement <2 x double> %x1, i64 0
5931 %16 = extractelement <2 x double> %x2, i64 0
5932 %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11)
5933 %18 = insertelement <2 x double> %x2, double %17, i64 0
5934 %19 = extractelement <2 x double> %x0, i64 0
5935 %20 = extractelement <2 x double> %x1, i64 0
5936 %21 = extractelement <2 x double> %x2, i64 0
5937 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
5938 %23 = bitcast i8 %x3 to <8 x i1>
5939 %24 = extractelement <8 x i1> %23, i64 0
5940 %25 = select i1 %24, double %22, double %21
5941 %26 = insertelement <2 x double> %x2, double %25, i64 0
5942 %res4 = fadd <2 x double> %5, %13
5943 %res5 = fadd <2 x double> %18, %26
5944 %res6 = fadd <2 x double> %res4, %res5
5945 ret <2 x double> %res6
5948 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
5949 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
5951 ; X64-NEXT: vmovaps %xmm2, %xmm3
5952 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5953 ; X64-NEXT: kmovw %edi, %k1
5954 ; X64-NEXT: vmovaps %xmm2, %xmm4
5955 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5956 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
5957 ; X64-NEXT: vmovaps %xmm2, %xmm4
5958 ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
5959 ; X64-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5960 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
5961 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
5964 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
5966 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5967 ; X86-NEXT: vmovaps %xmm2, %xmm3
5968 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm3 = (xmm0 * xmm1) + xmm3
5969 ; X86-NEXT: kmovw %eax, %k1
5970 ; X86-NEXT: vmovaps %xmm2, %xmm4
5971 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm0 * xmm1) + xmm4
5972 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
5973 ; X86-NEXT: vmovaps %xmm2, %xmm4
5974 ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
5975 ; X86-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5976 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
5977 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
5979 %1 = extractelement <4 x float> %x0, i64 0
5980 %2 = extractelement <4 x float> %x1, i64 0
5981 %3 = extractelement <4 x float> %x2, i64 0
5982 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
5983 %5 = insertelement <4 x float> %x2, float %4, i64 0
5984 %6 = extractelement <4 x float> %x0, i64 0
5985 %7 = extractelement <4 x float> %x1, i64 0
5986 %8 = extractelement <4 x float> %x2, i64 0
5987 %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
5988 %10 = bitcast i8 %x3 to <8 x i1>
5989 %11 = extractelement <8 x i1> %10, i64 0
5990 %12 = select i1 %11, float %9, float %8
5991 %13 = insertelement <4 x float> %x2, float %12, i64 0
5992 %14 = extractelement <4 x float> %x0, i64 0
5993 %15 = extractelement <4 x float> %x1, i64 0
5994 %16 = extractelement <4 x float> %x2, i64 0
5995 %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11)
5996 %18 = insertelement <4 x float> %x2, float %17, i64 0
5997 %19 = extractelement <4 x float> %x0, i64 0
5998 %20 = extractelement <4 x float> %x1, i64 0
5999 %21 = extractelement <4 x float> %x2, i64 0
6000 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
6001 %23 = bitcast i8 %x3 to <8 x i1>
6002 %24 = extractelement <8 x i1> %23, i64 0
6003 %25 = select i1 %24, float %22, float %21
6004 %26 = insertelement <4 x float> %x2, float %25, i64 0
6005 %res4 = fadd <4 x float> %5, %13
6006 %res5 = fadd <4 x float> %18, %26
6007 %res6 = fadd <4 x float> %res4, %res5
6008 ret <4 x float> %res6
6011 define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
6012 ; X64-LABEL: fmadd_ss_mask_memfold:
6014 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6015 ; X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6016 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6017 ; X64-NEXT: kmovw %edx, %k1
6018 ; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
6019 ; X64-NEXT: vmovss %xmm0, (%rdi)
6022 ; X86-LABEL: fmadd_ss_mask_memfold:
6024 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6025 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6026 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6027 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6028 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6029 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6030 ; X86-NEXT: kmovw %eax, %k1
6031 ; X86-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
6032 ; X86-NEXT: vmovss %xmm0, (%edx)
6034 %a.val = load float, float* %a
6035 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
6036 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
6037 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
6038 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
6040 %b.val = load float, float* %b
6041 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
6042 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
6043 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
6044 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
6045 %1 = extractelement <4 x float> %av, i64 0
6046 %2 = extractelement <4 x float> %bv, i64 0
6047 %3 = extractelement <4 x float> %av, i64 0
6048 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6049 %5 = bitcast i8 %c to <8 x i1>
6050 %6 = extractelement <8 x i1> %5, i64 0
6051 %7 = select i1 %6, float %4, float %1
6052 %8 = insertelement <4 x float> %av, float %7, i64 0
6053 %sr = extractelement <4 x float> %8, i32 0
6054 store float %sr, float* %a
6055 ret void
6058 define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
6059 ; X64-LABEL: fmadd_ss_maskz_memfold:
6061 ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6062 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6063 ; X64-NEXT: kmovw %edx, %k1
6064 ; X64-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
6065 ; X64-NEXT: vmovss %xmm0, (%rdi)
6068 ; X86-LABEL: fmadd_ss_maskz_memfold:
6070 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6071 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6072 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6073 ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
6074 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6075 ; X86-NEXT: kmovw %eax, %k1
6076 ; X86-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
6077 ; X86-NEXT: vmovss %xmm0, (%edx)
6079 %a.val = load float, float* %a
6080 %av0 = insertelement <4 x float> undef, float %a.val, i32 0
6081 %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
6082 %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
6083 %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
6085 %b.val = load float, float* %b
6086 %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
6087 %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
6088 %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
6089 %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
6090 %1 = extractelement <4 x float> %av, i64 0
6091 %2 = extractelement <4 x float> %bv, i64 0
6092 %3 = extractelement <4 x float> %av, i64 0
6093 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6094 %5 = bitcast i8 %c to <8 x i1>
6095 %6 = extractelement <8 x i1> %5, i64 0
6096 %7 = select i1 %6, float %4, float 0.000000e+00
6097 %8 = insertelement <4 x float> %av, float %7, i64 0
6098 %sr = extractelement <4 x float> %8, i32 0
6099 store float %sr, float* %a
6100 ret void
6103 define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
6104 ; X64-LABEL: fmadd_sd_mask_memfold:
6106 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6107 ; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
6108 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6109 ; X64-NEXT: kmovw %edx, %k1
6110 ; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
6111 ; X64-NEXT: vmovsd %xmm0, (%rdi)
6114 ; X86-LABEL: fmadd_sd_mask_memfold:
6116 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6117 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6118 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6119 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6120 ; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
6121 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
6122 ; X86-NEXT: kmovw %eax, %k1
6123 ; X86-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
6124 ; X86-NEXT: vmovsd %xmm0, (%edx)
6126 %a.val = load double, double* %a
6127 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
6128 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
6130 %b.val = load double, double* %b
6131 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
6132 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
6133 %1 = extractelement <2 x double> %av, i64 0
6134 %2 = extractelement <2 x double> %bv, i64 0
6135 %3 = extractelement <2 x double> %av, i64 0
6136 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
6137 %5 = bitcast i8 %c to <8 x i1>
6138 %6 = extractelement <8 x i1> %5, i64 0
6139 %7 = select i1 %6, double %4, double %1
6140 %8 = insertelement <2 x double> %av, double %7, i64 0
6141 %sr = extractelement <2 x double> %8, i32 0
6142 store double %sr, double* %a
6143 ret void
6146 define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
6147 ; X64-LABEL: fmadd_sd_maskz_memfold:
6149 ; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6150 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6151 ; X64-NEXT: kmovw %edx, %k1
6152 ; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
6153 ; X64-NEXT: vmovsd %xmm0, (%rdi)
6156 ; X86-LABEL: fmadd_sd_maskz_memfold:
6158 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6159 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6160 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
6161 ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
6162 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
6163 ; X86-NEXT: kmovw %eax, %k1
6164 ; X86-NEXT: vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
6165 ; X86-NEXT: vmovsd %xmm0, (%edx)
6167 %a.val = load double, double* %a
6168 %av0 = insertelement <2 x double> undef, double %a.val, i32 0
6169 %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
6171 %b.val = load double, double* %b
6172 %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
6173 %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
6174 %1 = extractelement <2 x double> %av, i64 0
6175 %2 = extractelement <2 x double> %bv, i64 0
6176 %3 = extractelement <2 x double> %av, i64 0
6177 %4 = call double @llvm.fma.f64(double %1, double %2, double %3)
6178 %5 = bitcast i8 %c to <8 x i1>
6179 %6 = extractelement <8 x i1> %5, i64 0
6180 %7 = select i1 %6, double %4, double 0.000000e+00
6181 %8 = insertelement <2 x double> %av, double %7, i64 0
6182 %sr = extractelement <2 x double> %8, i32 0
6183 store double %sr, double* %a
6184 ret void
6187 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6188 ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
6190 ; X64-NEXT: vmovapd %xmm2, %xmm3
6191 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6192 ; X64-NEXT: kmovw %edi, %k1
6193 ; X64-NEXT: vmovapd %xmm2, %xmm4
6194 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6195 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6196 ; X64-NEXT: vmovapd %xmm2, %xmm4
6197 ; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6198 ; X64-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6199 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6200 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6203 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
6205 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6206 ; X86-NEXT: vmovapd %xmm2, %xmm3
6207 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6208 ; X86-NEXT: kmovw %eax, %k1
6209 ; X86-NEXT: vmovapd %xmm2, %xmm4
6210 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6211 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6212 ; X86-NEXT: vmovapd %xmm2, %xmm4
6213 ; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6214 ; X86-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6215 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6216 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6218 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6219 %2 = extractelement <2 x double> %x0, i64 0
6220 %3 = extractelement <2 x double> %x1, i64 0
6221 %4 = extractelement <2 x double> %1, i64 0
6222 %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
6223 %6 = extractelement <2 x double> %x2, i64 0
6224 %7 = insertelement <2 x double> %x2, double %5, i64 0
6225 %8 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6226 %9 = extractelement <2 x double> %x0, i64 0
6227 %10 = extractelement <2 x double> %x1, i64 0
6228 %11 = extractelement <2 x double> %8, i64 0
6229 %12 = call double @llvm.fma.f64(double %9, double %10, double %11)
6230 %13 = extractelement <2 x double> %x2, i64 0
6231 %14 = bitcast i8 %x3 to <8 x i1>
6232 %15 = extractelement <8 x i1> %14, i64 0
6233 %16 = select i1 %15, double %12, double %13
6234 %17 = insertelement <2 x double> %x2, double %16, i64 0
6235 %18 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6236 %19 = extractelement <2 x double> %x0, i64 0
6237 %20 = extractelement <2 x double> %x1, i64 0
6238 %21 = extractelement <2 x double> %18, i64 0
6239 %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 11)
6240 %23 = extractelement <2 x double> %x2, i64 0
6241 %24 = insertelement <2 x double> %x2, double %22, i64 0
6242 %25 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6243 %26 = extractelement <2 x double> %x0, i64 0
6244 %27 = extractelement <2 x double> %x1, i64 0
6245 %28 = extractelement <2 x double> %25, i64 0
6246 %29 = call double @llvm.x86.avx512.vfmadd.f64(double %26, double %27, double %28, i32 11)
6247 %30 = extractelement <2 x double> %x2, i64 0
6248 %31 = bitcast i8 %x3 to <8 x i1>
6249 %32 = extractelement <8 x i1> %31, i64 0
6250 %33 = select i1 %32, double %29, double %30
6251 %34 = insertelement <2 x double> %x2, double %33, i64 0
6252 %res4 = fadd <2 x double> %7, %17
6253 %res5 = fadd <2 x double> %24, %34
6254 %res6 = fadd <2 x double> %res4, %res5
6255 ret <2 x double> %res6
6258 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6259 ; X64-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
6261 ; X64-NEXT: vmovaps %xmm2, %xmm3
6262 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6263 ; X64-NEXT: kmovw %edi, %k1
6264 ; X64-NEXT: vmovaps %xmm2, %xmm4
6265 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6266 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
6267 ; X64-NEXT: vmovaps %xmm2, %xmm4
6268 ; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6269 ; X64-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6270 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
6271 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
6274 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
6276 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6277 ; X86-NEXT: vmovaps %xmm2, %xmm3
6278 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm3
6279 ; X86-NEXT: kmovw %eax, %k1
6280 ; X86-NEXT: vmovaps %xmm2, %xmm4
6281 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
6282 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
6283 ; X86-NEXT: vmovaps %xmm2, %xmm4
6284 ; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6285 ; X86-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6286 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
6287 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
6289 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6290 %2 = extractelement <4 x float> %x0, i64 0
6291 %3 = extractelement <4 x float> %x1, i64 0
6292 %4 = extractelement <4 x float> %1, i64 0
6293 %5 = call float @llvm.fma.f32(float %2, float %3, float %4)
6294 %6 = extractelement <4 x float> %x2, i64 0
6295 %7 = insertelement <4 x float> %x2, float %5, i64 0
6296 %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6297 %9 = extractelement <4 x float> %x0, i64 0
6298 %10 = extractelement <4 x float> %x1, i64 0
6299 %11 = extractelement <4 x float> %8, i64 0
6300 %12 = call float @llvm.fma.f32(float %9, float %10, float %11)
6301 %13 = extractelement <4 x float> %x2, i64 0
6302 %14 = bitcast i8 %x3 to <8 x i1>
6303 %15 = extractelement <8 x i1> %14, i64 0
6304 %16 = select i1 %15, float %12, float %13
6305 %17 = insertelement <4 x float> %x2, float %16, i64 0
6306 %18 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6307 %19 = extractelement <4 x float> %x0, i64 0
6308 %20 = extractelement <4 x float> %x1, i64 0
6309 %21 = extractelement <4 x float> %18, i64 0
6310 %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 11)
6311 %23 = extractelement <4 x float> %x2, i64 0
6312 %24 = insertelement <4 x float> %x2, float %22, i64 0
6313 %25 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6314 %26 = extractelement <4 x float> %x0, i64 0
6315 %27 = extractelement <4 x float> %x1, i64 0
6316 %28 = extractelement <4 x float> %25, i64 0
6317 %29 = call float @llvm.x86.avx512.vfmadd.f32(float %26, float %27, float %28, i32 11)
6318 %30 = extractelement <4 x float> %x2, i64 0
6319 %31 = bitcast i8 %x3 to <8 x i1>
6320 %32 = extractelement <8 x i1> %31, i64 0
6321 %33 = select i1 %32, float %29, float %30
6322 %34 = insertelement <4 x float> %x2, float %33, i64 0
6323 %res4 = fadd <4 x float> %7, %17
6324 %res5 = fadd <4 x float> %24, %34
6325 %res6 = fadd <4 x float> %res4, %res5
6326 ret <4 x float> %res6
6329 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
6330 ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
6332 ; X64-NEXT: vmovapd %xmm2, %xmm3
6333 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6334 ; X64-NEXT: kmovw %edi, %k1
6335 ; X64-NEXT: vmovapd %xmm2, %xmm4
6336 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6337 ; X64-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6338 ; X64-NEXT: vmovapd %xmm2, %xmm4
6339 ; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6340 ; X64-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6341 ; X64-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6342 ; X64-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6345 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
6347 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6348 ; X86-NEXT: vmovapd %xmm2, %xmm3
6349 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6350 ; X86-NEXT: kmovw %eax, %k1
6351 ; X86-NEXT: vmovapd %xmm2, %xmm4
6352 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6353 ; X86-NEXT: vaddpd %xmm4, %xmm3, %xmm3
6354 ; X86-NEXT: vmovapd %xmm2, %xmm4
6355 ; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
6356 ; X86-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6357 ; X86-NEXT: vaddpd %xmm2, %xmm4, %xmm0
6358 ; X86-NEXT: vaddpd %xmm0, %xmm3, %xmm0
6360 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6361 %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6362 %3 = extractelement <2 x double> %1, i64 0
6363 %4 = extractelement <2 x double> %x1, i64 0
6364 %5 = extractelement <2 x double> %2, i64 0
6365 %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
6366 %7 = extractelement <2 x double> %x2, i64 0
6367 %8 = insertelement <2 x double> %x2, double %6, i64 0
6368 %9 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6369 %10 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6370 %11 = extractelement <2 x double> %9, i64 0
6371 %12 = extractelement <2 x double> %x1, i64 0
6372 %13 = extractelement <2 x double> %10, i64 0
6373 %14 = call double @llvm.fma.f64(double %11, double %12, double %13)
6374 %15 = extractelement <2 x double> %x2, i64 0
6375 %16 = bitcast i8 %x3 to <8 x i1>
6376 %17 = extractelement <8 x i1> %16, i64 0
6377 %18 = select i1 %17, double %14, double %15
6378 %19 = insertelement <2 x double> %x2, double %18, i64 0
6379 %20 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6380 %21 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6381 %22 = extractelement <2 x double> %20, i64 0
6382 %23 = extractelement <2 x double> %x1, i64 0
6383 %24 = extractelement <2 x double> %21, i64 0
6384 %25 = call double @llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 11)
6385 %26 = extractelement <2 x double> %x2, i64 0
6386 %27 = insertelement <2 x double> %x2, double %25, i64 0
6387 %28 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
6388 %29 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
6389 %30 = extractelement <2 x double> %28, i64 0
6390 %31 = extractelement <2 x double> %x1, i64 0
6391 %32 = extractelement <2 x double> %29, i64 0
6392 %33 = call double @llvm.x86.avx512.vfmadd.f64(double %30, double %31, double %32, i32 11)
6393 %34 = extractelement <2 x double> %x2, i64 0
6394 %35 = bitcast i8 %x3 to <8 x i1>
6395 %36 = extractelement <8 x i1> %35, i64 0
6396 %37 = select i1 %36, double %33, double %34
6397 %38 = insertelement <2 x double> %x2, double %37, i64 0
6398 %res4 = fadd <2 x double> %8, %19
6399 %res5 = fadd <2 x double> %27, %38
6400 %res6 = fadd <2 x double> %res4, %res5
6401 ret <2 x double> %res6
6404 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
6405 ; X64-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
6407 ; X64-NEXT: vmovaps %xmm2, %xmm3
6408 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6409 ; X64-NEXT: kmovw %edi, %k1
6410 ; X64-NEXT: vmovaps %xmm2, %xmm4
6411 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6412 ; X64-NEXT: vaddps %xmm4, %xmm3, %xmm3
6413 ; X64-NEXT: vmovaps %xmm2, %xmm4
6414 ; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6415 ; X64-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6416 ; X64-NEXT: vaddps %xmm2, %xmm4, %xmm0
6417 ; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0
6420 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
6422 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6423 ; X86-NEXT: vmovaps %xmm2, %xmm3
6424 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm3 = -(xmm0 * xmm1) - xmm3
6425 ; X86-NEXT: kmovw %eax, %k1
6426 ; X86-NEXT: vmovaps %xmm2, %xmm4
6427 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm4 = -(xmm0 * xmm1) - xmm4
6428 ; X86-NEXT: vaddps %xmm4, %xmm3, %xmm3
6429 ; X86-NEXT: vmovaps %xmm2, %xmm4
6430 ; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
6431 ; X86-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6432 ; X86-NEXT: vaddps %xmm2, %xmm4, %xmm0
6433 ; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0
6435 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6436 %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6437 %3 = extractelement <4 x float> %1, i64 0
6438 %4 = extractelement <4 x float> %x1, i64 0
6439 %5 = extractelement <4 x float> %2, i64 0
6440 %6 = call float @llvm.fma.f32(float %3, float %4, float %5)
6441 %7 = extractelement <4 x float> %x2, i64 0
6442 %8 = insertelement <4 x float> %x2, float %6, i64 0
6443 %9 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6444 %10 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6445 %11 = extractelement <4 x float> %9, i64 0
6446 %12 = extractelement <4 x float> %x1, i64 0
6447 %13 = extractelement <4 x float> %10, i64 0
6448 %14 = call float @llvm.fma.f32(float %11, float %12, float %13)
6449 %15 = extractelement <4 x float> %x2, i64 0
6450 %16 = bitcast i8 %x3 to <8 x i1>
6451 %17 = extractelement <8 x i1> %16, i64 0
6452 %18 = select i1 %17, float %14, float %15
6453 %19 = insertelement <4 x float> %x2, float %18, i64 0
6454 %20 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6455 %21 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6456 %22 = extractelement <4 x float> %20, i64 0
6457 %23 = extractelement <4 x float> %x1, i64 0
6458 %24 = extractelement <4 x float> %21, i64 0
6459 %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 11)
6460 %26 = extractelement <4 x float> %x2, i64 0
6461 %27 = insertelement <4 x float> %x2, float %25, i64 0
6462 %28 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
6463 %29 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
6464 %30 = extractelement <4 x float> %28, i64 0
6465 %31 = extractelement <4 x float> %x1, i64 0
6466 %32 = extractelement <4 x float> %29, i64 0
6467 %33 = call float @llvm.x86.avx512.vfmadd.f32(float %30, float %31, float %32, i32 11)
6468 %34 = extractelement <4 x float> %x2, i64 0
6469 %35 = bitcast i8 %x3 to <8 x i1>
6470 %36 = extractelement <8 x i1> %35, i64 0
6471 %37 = select i1 %36, float %33, float %34
6472 %38 = insertelement <4 x float> %x2, float %37, i64 0
6473 %res4 = fadd <4 x float> %8, %19
6474 %res5 = fadd <4 x float> %27, %38
6475 %res6 = fadd <4 x float> %res4, %res5
6476 ret <4 x float> %res6
6479 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
6480 ; X64-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
6482 ; X64-NEXT: kmovw %esi, %k1
6483 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
6484 ; X64-NEXT: vmovaps %xmm1, %xmm0
6487 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
6489 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6490 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6491 ; X86-NEXT: kmovw %ecx, %k1
6492 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm1 = (xmm0 * mem) + xmm1
6493 ; X86-NEXT: vmovaps %xmm1, %xmm0
6495 %q = load float, float* %ptr_b
6496 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6497 %1 = extractelement <4 x float> %x0, i64 0
6498 %2 = extractelement <4 x float> %vecinit.i, i64 0
6499 %3 = extractelement <4 x float> %x1, i64 0
6500 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6501 %5 = bitcast i8 %x3 to <8 x i1>
6502 %6 = extractelement <8 x i1> %5, i64 0
6503 %7 = select i1 %6, float %4, float %3
6504 %8 = insertelement <4 x float> %x1, float %7, i64 0
6505 ret <4 x float> %8
6508 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6509 ; X64-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
6511 ; X64-NEXT: kmovw %esi, %k1
6512 ; X64-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
6515 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
6517 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6518 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6519 ; X86-NEXT: kmovw %ecx, %k1
6520 ; X86-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
6522 %q = load float, float* %ptr_b
6523 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6524 %1 = extractelement <4 x float> %x0, i64 0
6525 %2 = extractelement <4 x float> %vecinit.i, i64 0
6526 %3 = extractelement <4 x float> %x1, i64 0
6527 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6528 %5 = bitcast i8 %x3 to <8 x i1>
6529 %6 = extractelement <8 x i1> %5, i64 0
6530 %7 = select i1 %6, float %4, float %1
6531 %8 = insertelement <4 x float> %x0, float %7, i64 0
6532 ret <4 x float> %8
6536 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
6537 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
6539 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
6540 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
6541 ; CHECK-NEXT: ret{{[l|q]}}
6542 %q = load float, float* %ptr_b
6543 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
6544 %1 = extractelement <4 x float> %x0, i64 0
6545 %2 = extractelement <4 x float> %x1, i64 0
6546 %3 = extractelement <4 x float> %vecinit.i, i64 0
6547 %4 = call float @llvm.fma.f32(float %1, float %2, float %3)
6548 %5 = select i1 false, float %4, float 0.000000e+00
6549 %6 = insertelement <4 x float> %x0, float %5, i64 0
6550 ret <4 x float> %6
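; The remaining tests exercise the 512-bit vector shift intrinsics: shifts by an
; XMM-held count (psll/psra/psrl), by an immediate count (pslli/psrai/psrli), and
; per-element variable shifts (psllv/psrav), each in unmasked, merge-masked
; ({%k1}) and zero-masked ({%k1} {z}) forms.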
6553 define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6554 ; CHECK-LABEL: test_x86_avx512_psll_d_512:
6556 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
6557 ; CHECK-NEXT: ret{{[l|q]}}
6558 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6559 ret <16 x i32> %res
6561 define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6562 ; X64-LABEL: test_x86_avx512_mask_psll_d_512:
6564 ; X64-NEXT: kmovw %edi, %k1
6565 ; X64-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
6566 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6569 ; X86-LABEL: test_x86_avx512_mask_psll_d_512:
6571 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6572 ; X86-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
6573 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6575 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6576 %mask.cast = bitcast i16 %mask to <16 x i1>
6577 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6578 ret <16 x i32> %res2
6580 define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6581 ; X64-LABEL: test_x86_avx512_maskz_psll_d_512:
6583 ; X64-NEXT: kmovw %edi, %k1
6584 ; X64-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
6587 ; X86-LABEL: test_x86_avx512_maskz_psll_d_512:
6589 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6590 ; X86-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
6592 %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6593 %mask.cast = bitcast i16 %mask to <16 x i1>
6594 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6595 ret <16 x i32> %res2
6597 declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6600 define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6601 ; CHECK-LABEL: test_x86_avx512_psll_q_512:
6603 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
6604 ; CHECK-NEXT: ret{{[l|q]}}
6605 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6606 ret <8 x i64> %res
6608 define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6609 ; X64-LABEL: test_x86_avx512_mask_psll_q_512:
6611 ; X64-NEXT: kmovw %edi, %k1
6612 ; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
6613 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6616 ; X86-LABEL: test_x86_avx512_mask_psll_q_512:
6618 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6619 ; X86-NEXT: kmovw %eax, %k1
6620 ; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
6621 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6623 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6624 %mask.cast = bitcast i8 %mask to <8 x i1>
6625 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6626 ret <8 x i64> %res2
6628 define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
6629 ; X64-LABEL: test_x86_avx512_maskz_psll_q_512:
6631 ; X64-NEXT: kmovw %edi, %k1
6632 ; X64-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
6635 ; X86-LABEL: test_x86_avx512_maskz_psll_q_512:
6637 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6638 ; X86-NEXT: kmovw %eax, %k1
6639 ; X86-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
6641 %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6642 %mask.cast = bitcast i8 %mask to <8 x i1>
6643 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6644 ret <8 x i64> %res2
6646 declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6649 define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
6650 ; CHECK-LABEL: test_x86_avx512_pslli_d_512:
6652 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
6653 ; CHECK-NEXT: ret{{[l|q]}}
6654 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6655 ret <16 x i32> %res
6657 define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
6658 ; X64-LABEL: test_x86_avx512_mask_pslli_d_512:
6660 ; X64-NEXT: kmovw %edi, %k1
6661 ; X64-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
6662 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6665 ; X86-LABEL: test_x86_avx512_mask_pslli_d_512:
6667 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6668 ; X86-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
6669 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6671 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6672 %mask.cast = bitcast i16 %mask to <16 x i1>
6673 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6674 ret <16 x i32> %res2
6676 define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) {
6677 ; X64-LABEL: test_x86_avx512_maskz_pslli_d_512:
6679 ; X64-NEXT: kmovw %edi, %k1
6680 ; X64-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
6683 ; X86-LABEL: test_x86_avx512_maskz_pslli_d_512:
6685 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6686 ; X86-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
6688 %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6689 %mask.cast = bitcast i16 %mask to <16 x i1>
6690 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6691 ret <16 x i32> %res2
6693 declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6696 define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
6697 ; CHECK-LABEL: test_x86_avx512_pslli_q_512:
6699 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
6700 ; CHECK-NEXT: ret{{[l|q]}}
6701 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6702 ret <8 x i64> %res
6704 define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6705 ; X64-LABEL: test_x86_avx512_mask_pslli_q_512:
6707 ; X64-NEXT: kmovw %edi, %k1
6708 ; X64-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
6709 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6712 ; X86-LABEL: test_x86_avx512_mask_pslli_q_512:
6714 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6715 ; X86-NEXT: kmovw %eax, %k1
6716 ; X86-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
6717 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6719 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6720 %mask.cast = bitcast i8 %mask to <8 x i1>
6721 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6722 ret <8 x i64> %res2
6724 define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6725 ; X64-LABEL: test_x86_avx512_maskz_pslli_q_512:
6727 ; X64-NEXT: kmovw %edi, %k1
6728 ; X64-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
6731 ; X86-LABEL: test_x86_avx512_maskz_pslli_q_512:
6733 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6734 ; X86-NEXT: kmovw %eax, %k1
6735 ; X86-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
6737 %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6738 %mask.cast = bitcast i8 %mask to <8 x i1>
6739 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6740 ret <8 x i64> %res2
6742 declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6745 define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6746 ; CHECK-LABEL: test_x86_avx512_psra_q_512:
6748 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
6749 ; CHECK-NEXT: ret{{[l|q]}}
6750 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6751 ret <8 x i64> %res
6753 define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6754 ; X64-LABEL: test_x86_avx512_mask_psra_q_512:
6756 ; X64-NEXT: kmovw %edi, %k1
6757 ; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
6758 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6761 ; X86-LABEL: test_x86_avx512_mask_psra_q_512:
6763 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6764 ; X86-NEXT: kmovw %eax, %k1
6765 ; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
6766 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6768 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6769 %mask.cast = bitcast i8 %mask to <8 x i1>
6770 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6771 ret <8 x i64> %res2
6773 define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
6774 ; X64-LABEL: test_x86_avx512_maskz_psra_q_512:
6776 ; X64-NEXT: kmovw %edi, %k1
6777 ; X64-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
6780 ; X86-LABEL: test_x86_avx512_maskz_psra_q_512:
6782 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6783 ; X86-NEXT: kmovw %eax, %k1
6784 ; X86-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
6786 %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6787 %mask.cast = bitcast i8 %mask to <8 x i1>
6788 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6789 ret <8 x i64> %res2
6791 declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6794 define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6795 ; CHECK-LABEL: test_x86_avx512_psra_d_512:
6797 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
6798 ; CHECK-NEXT: ret{{[l|q]}}
6799 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6800 ret <16 x i32> %res
6802 define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6803 ; X64-LABEL: test_x86_avx512_mask_psra_d_512:
6805 ; X64-NEXT: kmovw %edi, %k1
6806 ; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
6807 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6810 ; X86-LABEL: test_x86_avx512_mask_psra_d_512:
6812 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6813 ; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
6814 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6816 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6817 %mask.cast = bitcast i16 %mask to <16 x i1>
6818 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6819 ret <16 x i32> %res2
6821 define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6822 ; X64-LABEL: test_x86_avx512_maskz_psra_d_512:
6824 ; X64-NEXT: kmovw %edi, %k1
6825 ; X64-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
6828 ; X86-LABEL: test_x86_avx512_maskz_psra_d_512:
6830 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6831 ; X86-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
6833 %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6834 %mask.cast = bitcast i16 %mask to <16 x i1>
6835 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6836 ret <16 x i32> %res2
6838 declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6842 define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
6843 ; CHECK-LABEL: test_x86_avx512_psrai_q_512:
6845 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
6846 ; CHECK-NEXT: ret{{[l|q]}}
6847 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6848 ret <8 x i64> %res
6850 define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
6851 ; X64-LABEL: test_x86_avx512_mask_psrai_q_512:
6853 ; X64-NEXT: kmovw %edi, %k1
6854 ; X64-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
6855 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6858 ; X86-LABEL: test_x86_avx512_mask_psrai_q_512:
6860 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6861 ; X86-NEXT: kmovw %eax, %k1
6862 ; X86-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
6863 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6865 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6866 %mask.cast = bitcast i8 %mask to <8 x i1>
6867 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
6868 ret <8 x i64> %res2
6870 define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) {
6871 ; X64-LABEL: test_x86_avx512_maskz_psrai_q_512:
6873 ; X64-NEXT: kmovw %edi, %k1
6874 ; X64-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
6877 ; X86-LABEL: test_x86_avx512_maskz_psrai_q_512:
6879 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6880 ; X86-NEXT: kmovw %eax, %k1
6881 ; X86-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
6883 %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
6884 %mask.cast = bitcast i8 %mask to <8 x i1>
6885 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
6886 ret <8 x i64> %res2
6888 declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6891 define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
6892 ; CHECK-LABEL: test_x86_avx512_psrai_d_512:
6894 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
6895 ; CHECK-NEXT: ret{{[l|q]}}
6896 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6897 ret <16 x i32> %res
6899 define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
6900 ; X64-LABEL: test_x86_avx512_mask_psrai_d_512:
6902 ; X64-NEXT: kmovw %edi, %k1
6903 ; X64-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
6904 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
6907 ; X86-LABEL: test_x86_avx512_mask_psrai_d_512:
6909 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6910 ; X86-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
6911 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
6913 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6914 %mask.cast = bitcast i16 %mask to <16 x i1>
6915 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6916 ret <16 x i32> %res2
6918 define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) {
6919 ; X64-LABEL: test_x86_avx512_maskz_psrai_d_512:
6921 ; X64-NEXT: kmovw %edi, %k1
6922 ; X64-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
6925 ; X86-LABEL: test_x86_avx512_maskz_psrai_d_512:
6927 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6928 ; X86-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
6930 %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
6931 %mask.cast = bitcast i16 %mask to <16 x i1>
6932 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6933 ret <16 x i32> %res2
6935 declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6939 define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
6940 ; CHECK-LABEL: test_x86_avx512_psrl_d_512:
6942 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
6943 ; CHECK-NEXT: ret{{[l|q]}}
6944 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6945 ret <16 x i32> %res
6947 define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
6948 ; X64-LABEL: test_x86_avx512_mask_psrl_d_512:
6950 ; X64-NEXT: kmovw %edi, %k1
6951 ; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
6952 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
6955 ; X86-LABEL: test_x86_avx512_mask_psrl_d_512:
6957 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6958 ; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
6959 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
6961 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6962 %mask.cast = bitcast i16 %mask to <16 x i1>
6963 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
6964 ret <16 x i32> %res2
6966 define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6967 ; X64-LABEL: test_x86_avx512_maskz_psrl_d_512:
6969 ; X64-NEXT: kmovw %edi, %k1
6970 ; X64-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
6973 ; X86-LABEL: test_x86_avx512_maskz_psrl_d_512:
6975 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
6976 ; X86-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
6978 %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
6979 %mask.cast = bitcast i16 %mask to <16 x i1>
6980 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
6981 ret <16 x i32> %res2
6983 declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6986 define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
6987 ; CHECK-LABEL: test_x86_avx512_psrl_q_512:
6989 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
6990 ; CHECK-NEXT: ret{{[l|q]}}
6991 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
6992 ret <8 x i64> %res
6994 define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
6995 ; X64-LABEL: test_x86_avx512_mask_psrl_q_512:
6997 ; X64-NEXT: kmovw %edi, %k1
6998 ; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
6999 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7002 ; X86-LABEL: test_x86_avx512_mask_psrl_q_512:
7004 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7005 ; X86-NEXT: kmovw %eax, %k1
7006 ; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
7007 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7009 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
7010 %mask.cast = bitcast i8 %mask to <8 x i1>
7011 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
7012 ret <8 x i64> %res2
7014 define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
7015 ; X64-LABEL: test_x86_avx512_maskz_psrl_q_512:
7017 ; X64-NEXT: kmovw %edi, %k1
7018 ; X64-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
7021 ; X86-LABEL: test_x86_avx512_maskz_psrl_q_512:
7023 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7024 ; X86-NEXT: kmovw %eax, %k1
7025 ; X86-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
7027 %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
7028 %mask.cast = bitcast i8 %mask to <8 x i1>
7029 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7030 ret <8 x i64> %res2
7032 declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
7035 define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
7036 ; CHECK-LABEL: test_x86_avx512_psrli_d_512:
7038 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
7039 ; CHECK-NEXT: ret{{[l|q]}}
7040 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7041 ret <16 x i32> %res
7043 define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
7044 ; X64-LABEL: test_x86_avx512_mask_psrli_d_512:
7046 ; X64-NEXT: kmovw %edi, %k1
7047 ; X64-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
7048 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
7051 ; X86-LABEL: test_x86_avx512_mask_psrli_d_512:
7053 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7054 ; X86-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
7055 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
7057 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7058 %mask.cast = bitcast i16 %mask to <16 x i1>
7059 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru
7060 ret <16 x i32> %res2
7062 define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) {
7063 ; X64-LABEL: test_x86_avx512_maskz_psrli_d_512:
7065 ; X64-NEXT: kmovw %edi, %k1
7066 ; X64-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
7069 ; X86-LABEL: test_x86_avx512_maskz_psrli_d_512:
7071 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7072 ; X86-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
7074 %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
7075 %mask.cast = bitcast i16 %mask to <16 x i1>
7076 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7077 ret <16 x i32> %res2
7079 declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
7082 define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
7083 ; CHECK-LABEL: test_x86_avx512_psrli_q_512:
7085 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
7086 ; CHECK-NEXT: ret{{[l|q]}}
7087 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7088 ret <8 x i64> %res
7090 define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
7091 ; X64-LABEL: test_x86_avx512_mask_psrli_q_512:
7093 ; X64-NEXT: kmovw %edi, %k1
7094 ; X64-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
7095 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
7098 ; X86-LABEL: test_x86_avx512_mask_psrli_q_512:
7100 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7101 ; X86-NEXT: kmovw %eax, %k1
7102 ; X86-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
7103 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
7105 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7106 %mask.cast = bitcast i8 %mask to <8 x i1>
7107 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru
7108 ret <8 x i64> %res2
7110 define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) {
7111 ; X64-LABEL: test_x86_avx512_maskz_psrli_q_512:
7113 ; X64-NEXT: kmovw %edi, %k1
7114 ; X64-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
7117 ; X86-LABEL: test_x86_avx512_maskz_psrli_q_512:
7119 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7120 ; X86-NEXT: kmovw %eax, %k1
7121 ; X86-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
7123 %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
7124 %mask.cast = bitcast i8 %mask to <8 x i1>
7125 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7126 ret <8 x i64> %res2
7128 declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
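; Per-element variable shift intrinsics (psllv/psrav) follow, including
; constant-input tests that cover zero, negative, and out-of-range shift
; amounts, plus merge- and zero-masked forms.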
7130 define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
7131 ; CHECK-LABEL: test_x86_avx512_psllv_d_512:
7133 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
7134 ; CHECK-NEXT: ret{{[l|q]}}
7135 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7136 ret <16 x i32> %res
7139 define <16 x i32> @test_x86_avx512_psllv_d_512_const() {
7140 ; X64-LABEL: test_x86_avx512_psllv_d_512_const:
7142 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7143 ; X64-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
7144 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7145 ; X64-NEXT: vpsllvd {{.*}}(%rip), %zmm1, %zmm1
7146 ; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7149 ; X86-LABEL: test_x86_avx512_psllv_d_512_const:
7151 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
7152 ; X86-NEXT: vpsllvd {{\.LCPI.*}}, %zmm0, %zmm0
7153 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
7154 ; X86-NEXT: vpsllvd {{\.LCPI.*}}, %zmm1, %zmm1
7155 ; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
7157 %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
7158 %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
7159 %res2 = add <16 x i32> %res0, %res1
7160 ret <16 x i32> %res2
7163 define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
7164 ; X64-LABEL: test_x86_avx512_mask_psllv_d_512:
7166 ; X64-NEXT: kmovw %edi, %k1
7167 ; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
7168 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7171 ; X86-LABEL: test_x86_avx512_mask_psllv_d_512:
7173 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7174 ; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
7175 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7177 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7178 %mask.cast = bitcast i16 %mask to <16 x i1>
7179 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
7180 ret <16 x i32> %res2
7183 define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7184 ; X64-LABEL: test_x86_avx512_maskz_psllv_d_512:
7186 ; X64-NEXT: kmovw %edi, %k1
7187 ; X64-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7190 ; X86-LABEL: test_x86_avx512_maskz_psllv_d_512:
7192 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
7193 ; X86-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
7195 %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
7196 %mask.cast = bitcast i16 %mask to <16 x i1>
7197 %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
7198 ret <16 x i32> %res2
7201 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
7203 define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
7204 ; CHECK-LABEL: test_x86_avx512_psllv_q_512:
7206 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
7207 ; CHECK-NEXT: ret{{[l|q]}}
7208 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7209 ret <8 x i64> %res
7212 define <8 x i64> @test_x86_avx512_psllv_q_512_const() {
7213 ; X64-LABEL: test_x86_avx512_psllv_q_512_const:
7215 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
7216 ; X64-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
7217 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
7218 ; X64-NEXT: vpsllvq {{.*}}(%rip), %zmm1, %zmm1
7219 ; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7222 ; X86-LABEL: test_x86_avx512_psllv_q_512_const:
7224 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0]
7225 ; X86-NEXT: vpsllvq {{\.LCPI.*}}, %zmm0, %zmm0
7226 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295]
7227 ; X86-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
7228 ; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7230 %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1,i64 2, i64 0, i64 34, i64 -2>)
7231 %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
7232 %res2 = add <8 x i64> %res0, %res1
7233 ret <8 x i64> %res2
7236 define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
7237 ; X64-LABEL: test_x86_avx512_mask_psllv_q_512:
7239 ; X64-NEXT: kmovw %edi, %k1
7240 ; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
7241 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
7244 ; X86-LABEL: test_x86_avx512_mask_psllv_q_512:
7246 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7247 ; X86-NEXT: kmovw %eax, %k1
7248 ; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
7249 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
7251 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7252 %mask.cast = bitcast i8 %mask to <8 x i1>
7253 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
7257 define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7258 ; X64-LABEL: test_x86_avx512_maskz_psllv_q_512:
7260 ; X64-NEXT: kmovw %edi, %k1
7261 ; X64-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7264 ; X86-LABEL: test_x86_avx512_maskz_psllv_q_512:
7266 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7267 ; X86-NEXT: kmovw %eax, %k1
7268 ; X86-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
7270 %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
7271 %mask.cast = bitcast i8 %mask to <8 x i1>
7272 %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
7276 declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
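
; The tests below cover the plain, merge-masked and zero-masked forms of the 512-bit variable dword arithmetic right shift (vpsravd).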
define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d_512:
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrav_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
;
; X86-LABEL: test_x86_avx512_mask_psrav_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
  ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrav_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
;
; X86-LABEL: test_x86_avx512_maskz_psrav_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
  %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
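
; The tests below cover the plain, merge-masked and zero-masked forms of the 512-bit variable qword arithmetic right shift (vpsravq).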
define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_512:
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrav_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
;
; X86-LABEL: test_x86_avx512_mask_psrav_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
  ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrav_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
;
; X86-LABEL: test_x86_avx512_maskz_psrav_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
  %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
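
; The tests below cover the plain, constant-operand, merge-masked and zero-masked forms of the 512-bit variable dword logical right shift (vpsrlvd).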
define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d_512:
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_psrlv_d_512_const() {
; X64-LABEL: test_x86_avx512_psrlv_d_512_const:
; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X64-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X64-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
;
; X86-LABEL: test_x86_avx512_psrlv_d_512_const:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
; X86-NEXT: vpsrlvd {{\.LCPI.*}}, %zmm1, %zmm1
; X86-NEXT: vpaddd %zmm1, %zmm0, %zmm0
  %res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
  %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
  %res2 = add <16 x i32> %res0, %res1
  ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrlv_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
;
; X86-LABEL: test_x86_avx512_mask_psrlv_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2
  ret <16 x i32> %res2
}

define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrlv_d_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
;
; X86-LABEL: test_x86_avx512_maskz_psrlv_d_512:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
  %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
  %mask.cast = bitcast i16 %mask to <16 x i1>
  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
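
; The tests below cover the plain, constant-operand, merge-masked and zero-masked forms of the 512-bit variable qword logical right shift (vpsrlvq).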
define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_512:
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_psrlv_q_512_const() {
; X64-LABEL: test_x86_avx512_psrlv_q_512_const:
; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
; X64-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
; X64-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
; X64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
;
; X86-LABEL: test_x86_avx512_psrlv_q_512_const:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,0,9,0,0,0,4294967295,4294967295,3,0,7,0,4294967295,4294967295,0,0]
; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,0,4,0,4,0,4,0,4,0,4,0,4,0,4294967295,4294967295]
; X86-NEXT: vpsrlvq {{\.LCPI.*}}, %zmm1, %zmm1
; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0
  %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1, i64 2, i64 0, i64 34, i64 -2>)
  %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 -1>)
  %res2 = add <8 x i64> %res0, %res1
  ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; X64-LABEL: test_x86_avx512_mask_psrlv_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
;
; X86-LABEL: test_x86_avx512_mask_psrlv_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2
  ret <8 x i64> %res2
}

define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; X64-LABEL: test_x86_avx512_maskz_psrlv_q_512:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
;
; X86-LABEL: test_x86_avx512_maskz_psrlv_q_512:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
  %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
  %mask.cast = bitcast i8 %mask to <8 x i1>
  %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer
  ret <8 x i64> %res2
}

declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
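
; The two tests below build a <16 x i1> select mask from <8 x i1> compare results
; that are bitcast to i8 and zero-extended to i16. The CHECK lines show the mask
; currently round-tripping through a GPR (kmovw/movzbl) instead of staying in a
; mask register.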
define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; X64-LABEL: bad_mask_transition:
; X64: # %bb.0: # %entry
; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: vcmplt_oqpd %zmm3, %zmm2, %k0
; X64-NEXT: kmovw %k0, %ecx
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: kmovw %eax, %k0
; X64-NEXT: kmovw %ecx, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
;
; X86-LABEL: bad_mask_transition:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovaps 72(%ebp), %zmm3
; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: vcmplt_oqpd 8(%ebp), %zmm2, %k0
; X86-NEXT: kmovw %k0, %ecx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: kmovw %eax, %k0
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vmovaps 136(%ebp), %zmm3 {%k1}
; X86-NEXT: vmovaps %zmm3, %zmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
entry:
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
  %1 = bitcast <8 x i1> %0 to i8
  %2 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, i32 4)
  %3 = bitcast <8 x i1> %2 to i8
  %conv = zext i8 %1 to i16
  %conv2 = zext i8 %3 to i16
  %4 = bitcast i16 %conv to <16 x i1>
  %5 = bitcast i16 %conv2 to <16 x i1>
  %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e
  ret <16 x float> %9
}

define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) {
; X64-LABEL: bad_mask_transition_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: kmovw %eax, %k1
; X64-NEXT: vblendmps %zmm5, %zmm4, %zmm0 {%k1}
;
; X86-LABEL: bad_mask_transition_2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovaps 72(%ebp), %zmm2
; X86-NEXT: vcmplt_oqpd %zmm1, %zmm0, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps 136(%ebp), %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
entry:
  %0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, i32 4)
  %1 = bitcast <8 x i1> %0 to i8
  %conv = zext i8 %1 to i16
  %2 = bitcast i16 %conv to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e
  ret <16 x float> %3
}
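
; Declarations for the compress/expand intrinsics exercised earlier in this file.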
declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>)
declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>)
declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>)
declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>)
declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>)
declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)