; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = sitofp <4 x i32> %0 to <4 x float>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = sitofp <4 x i32> %0 to <4 x float>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i.i = sitofp <8 x i32> %0 to <8 x float>
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i.i = sitofp <8 x i32> %0 to <8 x float>
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
%1 = bitcast <2 x i64> %__W to <4 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
ret <4 x float> %0
}

define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
ret <4 x float> %2
}

define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
%1 = bitcast <2 x i64> %__W to <4 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
%1 = bitcast <4 x i64> %__W to <8 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
ret <2 x double> %1
}

define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%conv.i.i = fpext <4 x float> %__A to <4 x double>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%conv.i.i = fpext <4 x float> %__A to <4 x double>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <8 x i32> %0 to <4 x i64>
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__W to <8 x i32>
%1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <8 x i32> %0 to <4 x i64>
ret <4 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
%1 = bitcast <2 x i64> %__W to <4 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
%1 = bitcast <2 x i64> %__W to <4 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
%1 = bitcast <4 x i64> %__W to <8 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <4 x i32> %0 to <2 x i64>
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
%1 = bitcast <8 x i32> %0 to <4 x i64>
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__W to <8 x i32>
%1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
%1 = bitcast <8 x i32> %0 to <4 x i64>
ret <4 x i64> %1
}

define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
ret <2 x double> %conv.i
}

define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
ret <2 x double> %2
}

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = uitofp <4 x i32> %0 to <4 x double>
ret <4 x double> %conv.i
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = uitofp <4 x i32> %0 to <4 x double>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = uitofp <4 x i32> %0 to <4 x double>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
ret <4 x double> %2
}

define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = uitofp <4 x i32> %0 to <4 x float>
ret <4 x float> %conv.i
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = uitofp <4 x i32> %0 to <4 x float>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i.i = uitofp <4 x i32> %0 to <4 x float>
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i = uitofp <8 x i32> %0 to <8 x float>
ret <8 x float> %conv.i
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i.i = uitofp <8 x i32> %0 to <8 x float>
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i.i = uitofp <8 x i32> %0 to <8 x float>
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x float> %shuffle
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%0 = bitcast i8 %__U to <8 x i1>
%1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%0 = bitcast i8 %__U to <8 x i1>
%1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
ret <8 x float> %1
}

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast <4 x i64> %shuffle to <8 x i32>
%1 = bitcast <4 x i64> %__W to <8 x i32>
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast <4 x i64> %shuffle to <8 x i32>
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
ret <4 x i64> %1
}

define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
%and.i.i = and <2 x i64> %__B, %__A
%0 = bitcast <2 x i64> %and.i.i to <4 x i32>
%1 = icmp ne <4 x i32> %0, zeroinitializer
%2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}

define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
%and.i.i = and <2 x i64> %__B, %__A
%0 = bitcast <2 x i64> %and.i.i to <4 x i32>
%1 = icmp ne <4 x i32> %0, zeroinitializer
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = and <4 x i1> %1, %extract.i
%4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <8 x i1> %4 to i8
ret i8 %5
}

define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmd %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%and.i.i = and <4 x i64> %__B, %__A
%0 = bitcast <4 x i64> %and.i.i to <8 x i32>
%1 = icmp ne <8 x i32> %0, zeroinitializer
%2 = bitcast <8 x i1> %1 to i8
ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%and.i.i = and <4 x i64> %__B, %__A
%0 = bitcast <4 x i64> %and.i.i to <8 x i32>
%1 = icmp ne <8 x i32> %0, zeroinitializer
%2 = bitcast i8 %__U to <8 x i1>
%3 = and <8 x i1> %1, %2
%4 = bitcast <8 x i1> %3 to i8
ret i8 %4
}

define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmq %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
%and.i.i = and <2 x i64> %__B, %__A
%0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
%1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%2 = bitcast <8 x i1> %1 to i8
ret i8 %2
}

define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
%and.i.i = and <2 x i64> %__B, %__A
%0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = and <2 x i1> %0, %extract.i
%3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%4 = bitcast <8 x i1> %3 to i8
ret i8 %4
}

define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%and.i.i = and <4 x i64> %__B, %__A
%0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
%1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <8 x i1> %1 to i8
ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%and.i.i = and <4 x i64> %__B, %__A
%0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = and <4 x i1> %0, %extract.i
%3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <8 x i1> %3 to i8
ret i8 %4
}

define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
%and.i.i = and <2 x i64> %__B, %__A
%0 = bitcast <2 x i64> %and.i.i to <4 x i32>
%1 = icmp eq <4 x i32> %0, zeroinitializer
%2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast <8 x i1> %2 to i8
ret i8 %3
}

1648 define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1649 ; X86-LABEL: test_mm_mask_testn_epi32_mask:
1650 ; X86: # %bb.0: # %entry
1651 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1652 ; X86-NEXT: kmovw %eax, %k1
1653 ; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
1654 ; X86-NEXT: kmovw %k0, %eax
1655 ; X86-NEXT: movzbl %al, %eax
1658 ; X64-LABEL: test_mm_mask_testn_epi32_mask:
1659 ; X64: # %bb.0: # %entry
1660 ; X64-NEXT: kmovw %edi, %k1
1661 ; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
1662 ; X64-NEXT: kmovw %k0, %eax
1663 ; X64-NEXT: movzbl %al, %eax
1666 %and.i.i = and <2 x i64> %__B, %__A
1667 %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1668 %1 = icmp eq <4 x i32> %0, zeroinitializer
1669 %2 = bitcast i8 %__U to <8 x i1>
1670 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1671 %3 = and <4 x i1> %1, %extract.i
1672 %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1673 %5 = bitcast <8 x i1> %4 to i8
1677 define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
1678 ; CHECK-LABEL: test_mm256_testn_epi32_mask:
1679 ; CHECK: # %bb.0: # %entry
1680 ; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0
1681 ; CHECK-NEXT: kmovw %k0, %eax
1682 ; CHECK-NEXT: movzbl %al, %eax
1683 ; CHECK-NEXT: vzeroupper
1684 ; CHECK-NEXT: ret{{[l|q]}}
1686 %and.i.i = and <4 x i64> %__B, %__A
1687 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1688 %1 = icmp eq <8 x i32> %0, zeroinitializer
1689 %2 = bitcast <8 x i1> %1 to i8
1693 define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1694 ; X86-LABEL: test_mm256_mask_testn_epi32_mask:
1695 ; X86: # %bb.0: # %entry
1696 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1697 ; X86-NEXT: kmovw %eax, %k1
1698 ; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
1699 ; X86-NEXT: kmovw %k0, %eax
1700 ; X86-NEXT: movzbl %al, %eax
1701 ; X86-NEXT: vzeroupper
1704 ; X64-LABEL: test_mm256_mask_testn_epi32_mask:
1705 ; X64: # %bb.0: # %entry
1706 ; X64-NEXT: kmovw %edi, %k1
1707 ; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
1708 ; X64-NEXT: kmovw %k0, %eax
1709 ; X64-NEXT: movzbl %al, %eax
1710 ; X64-NEXT: vzeroupper
1713 %and.i.i = and <4 x i64> %__B, %__A
1714 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1715 %1 = icmp eq <8 x i32> %0, zeroinitializer
1716 %2 = bitcast i8 %__U to <8 x i1>
1717 %3 = and <8 x i1> %1, %2
1718 %4 = bitcast <8 x i1> %3 to i8
1722 define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1723 ; CHECK-LABEL: test_mm_testn_epi64_mask:
1724 ; CHECK: # %bb.0: # %entry
1725 ; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0
1726 ; CHECK-NEXT: kmovw %k0, %eax
1727 ; CHECK-NEXT: movzbl %al, %eax
1728 ; CHECK-NEXT: ret{{[l|q]}}
1730 %and.i.i = and <2 x i64> %__B, %__A
1731 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1732 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1733 %2 = bitcast <8 x i1> %1 to i8
1737 define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1738 ; X86-LABEL: test_mm_mask_testn_epi64_mask:
1739 ; X86: # %bb.0: # %entry
1740 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1741 ; X86-NEXT: kmovw %eax, %k1
1742 ; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1743 ; X86-NEXT: kmovw %k0, %eax
1744 ; X86-NEXT: movzbl %al, %eax
1747 ; X64-LABEL: test_mm_mask_testn_epi64_mask:
1748 ; X64: # %bb.0: # %entry
1749 ; X64-NEXT: kmovw %edi, %k1
1750 ; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1751 ; X64-NEXT: kmovw %k0, %eax
1752 ; X64-NEXT: movzbl %al, %eax
1755 %and.i.i = and <2 x i64> %__B, %__A
1756 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1757 %1 = bitcast i8 %__U to <8 x i1>
1758 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1759 %2 = and <2 x i1> %0, %extract.i
1760 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1761 %4 = bitcast <8 x i1> %3 to i8
1765 define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1766 ; CHECK-LABEL: test_mm256_testn_epi64_mask:
1767 ; CHECK: # %bb.0: # %entry
1768 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0
1769 ; CHECK-NEXT: kmovw %k0, %eax
1770 ; CHECK-NEXT: movzbl %al, %eax
1771 ; CHECK-NEXT: vzeroupper
1772 ; CHECK-NEXT: ret{{[l|q]}}
1774 %and.i.i = and <4 x i64> %__B, %__A
1775 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1776 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1777 %2 = bitcast <8 x i1> %1 to i8
1781 define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1782 ; X86-LABEL: test_mm256_mask_testn_epi64_mask:
1783 ; X86: # %bb.0: # %entry
1784 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1785 ; X86-NEXT: kmovw %eax, %k1
1786 ; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1787 ; X86-NEXT: kmovw %k0, %eax
1788 ; X86-NEXT: movzbl %al, %eax
1789 ; X86-NEXT: vzeroupper
1792 ; X64-LABEL: test_mm256_mask_testn_epi64_mask:
1793 ; X64: # %bb.0: # %entry
1794 ; X64-NEXT: kmovw %edi, %k1
1795 ; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1796 ; X64-NEXT: kmovw %k0, %eax
1797 ; X64-NEXT: movzbl %al, %eax
1798 ; X64-NEXT: vzeroupper
1801 %and.i.i = and <4 x i64> %__B, %__A
1802 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1803 %1 = bitcast i8 %__U to <8 x i1>
1804 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1805 %2 = and <4 x i1> %0, %extract.i
1806 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1807 %4 = bitcast <8 x i1> %3 to i8
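
; Masked set1: broadcast an immediate (5) or a scalar GPR under a write mask.
; On X86 the epi32 constant comes from the constant pool and the i64 scalar
; is assembled from two stack slots with vmovd+vpinsrd; on X64 vpbroadcastq
; can broadcast straight from %rsi.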
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__O to <4 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm256_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__O to <8 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm256_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
%vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
%vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
%vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
%vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
ret <4 x i64> %1
}
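
; Broadcasts of the low vector element. The unmasked forms are free to use
; the FP-domain vbroadcastss/vbroadcastsd/vmovddup, while the masked integer
; forms are expected to stay in the integer domain (vpbroadcastd/vpbroadcastq)
; so the per-element write mask applies.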
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
%res1 = bitcast <4 x i32> %res0 to <2 x i64>
ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <2 x i64> %__O to <4 x i32>
%2 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = bitcast <8 x i32> %res0 to <4 x i64>
ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm256_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast i8 %a1 to <8 x i1>
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
%res2 = bitcast <8 x i32> %res1 to <4 x i64>
ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
%res2 = bitcast <8 x i32> %res1 to <4 x i64>
ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
ret <4 x i64> %1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_mask_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm256_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
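
; Duplicate shuffles: movddup/movshdup/movsldup and their masked forms. The
; fixed shufflevector masks ([0,0], [1,1,3,3], [0,0,2,2], ...) correspond to
; the element patterns printed in the assembly comments.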
define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
ret <2 x double> %1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
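
; permutex: cross-lane permutes of 64-bit elements by immediate. The unmasked
; integer form lowers to vpermpd (FP domain); the masked forms keep vpermq /
; vpermpd so the element-sized write mask is honored.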
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_mask_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__M to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__M to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
ret <4 x i64> %1
}

define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_mask_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
ret <4 x double> %1
}
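
; Two-source SHUFPD/SHUFPS shuffles: shufflevector indices below the source
; width select from %__A, higher indices from %__B, matching the immediates
; in the checks; the <1,3> double case folds to vunpckhpd.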
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_mm_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT: retq
entry:
%shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
%shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
ret <2 x double> %1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_mm_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_maskz_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
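
; Widening 32x32->64 multiplies. _mm_mul_epi32 expresses the sign extension of
; the even elements as shl 32 + ashr exact 32, and _mm_mul_epu32 expresses the
; zero extension as an AND with 4294967295; both patterns should fold into a
; single vpmuldq / vpmuludq under the write mask.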
define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
ret <4 x i64> %tmp6
}

define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
ret <4 x i64> %tmp6
}

define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
ret <2 x i64> %tmp6
}

define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
ret <2 x i64> %tmp6
}

define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
ret <4 x i64> %tmp4
}

define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
ret <4 x i64> %tmp4
}

define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
ret <2 x i64> %tmp4
}

define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
ret <2 x i64> %tmp4
}
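
; Narrowing (truncating) conversions. With 128-bit sources the truncation is
; done with vpshufb/vinsertps shuffles; 256-bit sources use vpmovdw, and the
; masked form goes through the llvm.x86.avx512.mask.pmov.dw.256 intrinsic.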
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = trunc <4 x i32> %0 to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i8>
  %shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i16>
  %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <2 x i64> %__A to <2 x i32>
  %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i32> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

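; Narrowing conversions from 256-bit sources use the native AVX512
; truncating moves (vpmovdw, vpmovqd, vpmovqb, ...), followed by vzeroupper.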
define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i16>
  %1 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <2 x i64> %__O to <8 x i16>
  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <4 x i32> %conv.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i8>
  %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <4 x i64> %__A to <4 x i16>
  %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = trunc <8 x i32> %0 to <8 x i8>
  %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
  ret <2 x i64> %1
}

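; Ternary-logic tests. Every call uses imm8 = 4 as the vpternlog truth
; table; the particular value looks incidental here, since what these tests
; exercise is the mask/maskz select wrapped around the intrinsic.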
define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__B to <4 x i32>
  %2 = bitcast <2 x i64> %__C to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__B to <8 x i32>
  %2 = bitcast <4 x i64> %__C to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

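; The mask2 variants blend the permute result into the index operand %__I,
; so they select the vpermi2* form, which overwrites the index register and
; needs a trailing move into the return register.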
define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast <2 x i64> %__I to <2 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
  ret <2 x double> %3
}

define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast <4 x i64> %__I to <4 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
  ret <4 x double> %3
}

define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast <2 x i64> %__I to <4 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovaps %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovaps %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast <4 x i64> %__I to <8 x float>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
  ret <8 x float> %4
}

define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
  ret <4 x i64> %2
}

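; The plain, mask and maskz permutex2var variants blend into the first
; source (or zero) instead, mapping onto vpermt2* with no extra move.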
define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %1 = bitcast <2 x i64> %__I to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %1 = bitcast <4 x i64> %__I to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; CHECK-LABEL: test_mm_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  ret <2 x double> %0
}

define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  ret <4 x double> %0
}

define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; CHECK-LABEL: test_mm_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  ret <4 x float> %1
}

define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
  ret <4 x float> %3
}

define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__I to <4 x i32>
  %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
  ret <4 x float> %3
}

define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  ret <8 x float> %1
}

define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
  ret <8 x float> %3
}

define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__I to <8 x i32>
  %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
  ret <8 x float> %3
}

define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

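; FMA tests. Each intrinsic expands to llvm.fma plus a masked select, with
; the fmsub/fnmadd/fnmsub flavours formed by negating an operand through an
; fsub from -0.0. The 132/213/231 form chosen depends on which source
; register the masked result is blended into (the first source for mask,
; the accumulator for mask3, a zeroed register for maskz).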
define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}

define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}
define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}
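; NOTE: The mask3 variants merge into %__C, so llc picks the 231 form (which
; accumulates into the register holding xmm2/ymm2) and then copies the result
; into the return register with a vmovaps/vmovapd.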
define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}
define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}
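; NOTE: In the 256-bit ps tests the bitcast <8 x i1> mask covers all eight
; lanes, so the select uses it directly and no shufflevector extract is needed.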
define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}
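; NOTE: fmaddsub below is modeled as two llvm.fma calls, one with %__C negated,
; whose results are interleaved lane-by-lane with a shufflevector; llc
; recognizes the blend and emits a single vfmaddsub instruction.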
define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
ret <2 x double> %5
}

define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
ret <2 x double> %4
}
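; NOTE: fmsubadd uses the same two-FMA pattern, but the two results trade
; places as shufflevector operands, flipping which lanes subtract and which
; add relative to fmaddsub.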
define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
%3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
ret <2 x double> %4
}

define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
ret <4 x double> %5
}

define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
ret <4 x double> %4
}

define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
%3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
ret <4 x double> %4
}
define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
%3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
%3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
ret <8 x float> %4
}
define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
%1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
%2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
%1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
%2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
%1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
%2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
%1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
%2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
ret <8 x float> %4
}
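; NOTE: In the fnmadd tests below the negation lands on a multiplicand (%__B)
; rather than on the addend, which is what distinguishes vfnmadd from vfmsub in
; the generated code.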
define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
%sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}
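; NOTE: fnmsub negates both a multiplicand (%__B) and the addend (%__C), again
; via fsub from a -0.0 splat, and maps onto vfnmsub.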
5737 define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5738 ; X86-LABEL: test_mm_mask3_fnmsub_pd:
5739 ; X86: # %bb.0: # %entry
5740 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5741 ; X86-NEXT: kmovw %eax, %k1
5742 ; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5743 ; X86-NEXT: vmovapd %xmm2, %xmm0
5746 ; X64-LABEL: test_mm_mask3_fnmsub_pd:
5747 ; X64: # %bb.0: # %entry
5748 ; X64-NEXT: kmovw %edi, %k1
5749 ; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5750 ; X64-NEXT: vmovapd %xmm2, %xmm0
5753 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5754 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5755 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5756 %1 = bitcast i8 %__U to <8 x i1>
5757 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5758 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
ret <8 x float> %2
}
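; Expand-load tests. llvm.masked.expandload reads one consecutive element
; from %__P for each set mask bit and places it in the corresponding enabled
; lane; disabled lanes take the pass-through value (%__W for the mask_ forms,
; zeroinitializer for the maskz_ forms). These lower to vexpandpd, vpexpandq
; and vexpandps.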
define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
ret <4 x double> %2
}

define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
ret <4 x i64> %2
}

define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
ret <8 x float> %2
}
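; The epi32 expand-loads follow the same pattern, except the <2 x i64> and
; <4 x i64> intrinsic arguments are bitcast to <4 x i32>/<8 x i32> around the
; call and the result is bitcast back, selecting vpexpandd.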
define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__W to <8 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}
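; Compress-store tests, the inverse operation: llvm.masked.compressstore
; packs the enabled lanes of %__A into consecutive memory at %__P, lowering
; to vcompresspd/vcompressps/vpcompressq/vpcompressd. The 256-bit forms also
; end with vzeroupper.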
define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
ret void
}

define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
ret void
}

define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
ret void
}

define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
ret void
}

define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
ret void
}

define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
ret void
}

define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
ret void
}

define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
ret void
}
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
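; Masked square roots are modeled as a plain llvm.sqrt call followed by a
; select on the mask; the backend folds the select into the {%k1} (merging)
; or {%k1} {z} (zeroing) form of vsqrtpd/vsqrtps.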
define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
ret <2 x double> %2
}

declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
ret <4 x double> %2
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
ret <4 x double> %2
}

define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
ret <4 x float> %2
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
ret <8 x float> %2
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
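; Rotate tests. IR has no rotate intrinsic, so a rotate-left is expressed as
; a funnel shift (llvm.fshl) with both value operands equal; a constant splat
; amount selects the immediate forms vprold/vprolq.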
define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <2 x i64> %__W to <4 x i32>
%3 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
%5 = bitcast <4 x i32> %4 to <2 x i64>
ret <2 x i64> %5
}

define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <4 x i64> %__W to <8 x i32>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
%5 = bitcast <8 x i32> %4 to <4 x i64>
ret <4 x i64> %5
}

define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
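; Variable rotate-left: the same llvm.fshl pattern with the per-element
; amount taken from %__B, which selects vprolvd/vprolvq.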
define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
%3 = bitcast <2 x i64> %__W to <4 x i32>
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
%5 = bitcast <4 x i32> %4 to <2 x i64>
ret <2 x i64> %5
}

define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
%3 = bitcast <4 x i64> %__W to <8 x i32>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = bitcast <8 x i32> %4 to <4 x i64>
ret <4 x i64> %5
}

define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
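; Rotate-right mirrors the rotate-left tests using llvm.fshr; constant splat
; amounts select the immediate forms vprord/vprorq.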
define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <2 x i64> %__W to <4 x i32>
%3 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
%5 = bitcast <4 x i32> %4 to <2 x i64>
ret <2 x i64> %5
}

define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <4 x i64> %__W to <8 x i32>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
%5 = bitcast <8 x i32> %4 to <4 x i64>
ret <4 x i64> %5
}

define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
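; Variable rotate-right: llvm.fshr with the per-element amount from %__B,
; selecting vprorvd/vprorvq.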
7288 define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
7289 ; CHECK-LABEL: test_mm_rorv_epi32:
7290 ; CHECK: # %bb.0: # %entry
7291 ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
7292 ; CHECK-NEXT: ret{{[l|q]}}
7294 %0 = bitcast <2 x i64> %__A to <4 x i32>
7295 %1 = bitcast <2 x i64> %__B to <4 x i32>
7296 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7297 %3 = bitcast <4 x i32> %2 to <2 x i64>
7301 define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7302 ; X86-LABEL: test_mm_mask_rorv_epi32:
7303 ; X86: # %bb.0: # %entry
7304 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7305 ; X86-NEXT: kmovw %eax, %k1
7306 ; X86-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7309 ; X64-LABEL: test_mm_mask_rorv_epi32:
7310 ; X64: # %bb.0: # %entry
7311 ; X64-NEXT: kmovw %edi, %k1
7312 ; X64-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7315 %0 = bitcast <2 x i64> %__A to <4 x i32>
7316 %1 = bitcast <2 x i64> %__B to <4 x i32>
7317 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7318 %3 = bitcast <2 x i64> %__W to <4 x i32>
7319 %4 = bitcast i8 %__U to <8 x i1>
7320 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7321 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
7322 %6 = bitcast <4 x i32> %5 to <2 x i64>
7326 define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7327 ; X86-LABEL: test_mm_maskz_rorv_epi32:
7328 ; X86: # %bb.0: # %entry
7329 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7330 ; X86-NEXT: kmovw %eax, %k1
7331 ; X86-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7334 ; X64-LABEL: test_mm_maskz_rorv_epi32:
7335 ; X64: # %bb.0: # %entry
7336 ; X64-NEXT: kmovw %edi, %k1
7337 ; X64-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7340 %0 = bitcast <2 x i64> %__A to <4 x i32>
7341 %1 = bitcast <2 x i64> %__B to <4 x i32>
7342 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7343 %3 = bitcast i8 %__U to <8 x i1>
7344 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7345 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
7346 %5 = bitcast <4 x i32> %4 to <2 x i64>
7350 define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
7351 ; CHECK-LABEL: test_mm256_rorv_epi32:
7352 ; CHECK: # %bb.0: # %entry
7353 ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
7354 ; CHECK-NEXT: ret{{[l|q]}}
7356 %0 = bitcast <4 x i64> %__A to <8 x i32>
7357 %1 = bitcast <4 x i64> %__B to <8 x i32>
7358 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7359 %3 = bitcast <8 x i32> %2 to <4 x i64>
7363 define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7364 ; X86-LABEL: test_mm256_mask_rorv_epi32:
7365 ; X86: # %bb.0: # %entry
7366 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7367 ; X86-NEXT: kmovw %eax, %k1
7368 ; X86-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7371 ; X64-LABEL: test_mm256_mask_rorv_epi32:
7372 ; X64: # %bb.0: # %entry
7373 ; X64-NEXT: kmovw %edi, %k1
7374 ; X64-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7377 %0 = bitcast <4 x i64> %__A to <8 x i32>
7378 %1 = bitcast <4 x i64> %__B to <8 x i32>
7379 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7380 %3 = bitcast <4 x i64> %__W to <8 x i32>
7381 %4 = bitcast i8 %__U to <8 x i1>
7382 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7383 %6 = bitcast <8 x i32> %5 to <4 x i64>

define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = bitcast <8 x i32> %4 to <4 x i64>
ret <4 x i64> %5
}

define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
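
; The rorv tests above express a variable rotate right as a funnel shift with
; both value operands equal (rotr(x, n) == fshr(x, x, n)), which the backend
; matches to the AVX512VL vprorvd/vprorvq instructions.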

declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)