; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

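; Common pattern in the masked tests below: on i386 the i8 mask argument
; arrives on the stack (movb) and is moved into a mask register with kmovw,
; while on x86-64 it is passed in %edi. "{%k1}" selects merge-masking
; (unselected lanes keep the destination value) and "{%k1} {z}" selects
; zero-masking (unselected lanes are zeroed).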
define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

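; vcvtpd2dq narrows packed doubles to 32-bit integers. The 256-bit source
; forms below produce an xmm result, so llc emits vzeroupper before returning.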
define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
  ret <4 x float> %0
}

define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

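; The unsigned conversions (vcvtpd2udq) have no pre-AVX-512 equivalent, so
; the mask is carried by the @llvm.x86.avx512.mask.* intrinsic itself rather
; than a separate IR select.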
define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

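; For the signed float-to-int conversions the IR uses the plain SSE2/AVX
; intrinsic followed by a select on the bitcast mask; llc folds the select
; into the masked vcvtps2dq.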
define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

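; cvtps_pd widens the low float lanes via fpext; the 128-bit form only uses
; the low two mask bits, extracted from the <8 x i1> bitcast with a
; shufflevector.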
define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

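; Unsigned float-to-int conversions (vcvtps2udq) again go through the
; mask-carrying AVX-512 intrinsics.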
define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

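; The cvtt* tests below cover the truncating (round-toward-zero) conversions.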
define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

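; Unsigned integer-to-floating-point conversions (vcvtudq2pd/vcvtudq2ps) are
; expressed as plain uitofp in IR; the instructions are new with AVX-512VL.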
define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
  ret <2 x double> %conv.i
}

define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x double>
  ret <4 x double> %conv.i
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x float>
  ret <4 x float> %conv.i
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = uitofp <8 x i32> %0 to <8 x float>
  ret <8 x float> %conv.i
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

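; 128-bit-lane shuffles: the unmasked forms match the AVX vperm2f128/
; vperm2i128 patterns, while the masked forms must use vshuff32x4/vshuff64x2/
; vshufi32x4/vshufi64x2 so the lane masking applies.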
define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

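; vptestmd/vptestmq AND the operands and compare nonzero into a k register;
; the i8 result is read back with kmovw and zero-extended with movzbl.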
define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmd %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmq %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

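; vptestnmd/vptestnmq are the complements: lanes where the AND is zero set
; the mask bit (icmp eq in IR).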
define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %ymm0, %ymm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

1722 define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1723 ; CHECK-LABEL: test_mm_testn_epi64_mask:
1724 ; CHECK: # %bb.0: # %entry
1725 ; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0
1726 ; CHECK-NEXT: kmovw %k0, %eax
1727 ; CHECK-NEXT: movzbl %al, %eax
1728 ; CHECK-NEXT: ret{{[l|q]}}
1730 %and.i.i = and <2 x i64> %__B, %__A
1731 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1732 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1733 %2 = bitcast <8 x i1> %1 to i8
1737 define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1738 ; X86-LABEL: test_mm_mask_testn_epi64_mask:
1739 ; X86: # %bb.0: # %entry
1740 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1741 ; X86-NEXT: kmovw %eax, %k1
1742 ; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1743 ; X86-NEXT: kmovw %k0, %eax
1744 ; X86-NEXT: movzbl %al, %eax
1747 ; X64-LABEL: test_mm_mask_testn_epi64_mask:
1748 ; X64: # %bb.0: # %entry
1749 ; X64-NEXT: kmovw %edi, %k1
1750 ; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1751 ; X64-NEXT: kmovw %k0, %eax
1752 ; X64-NEXT: movzbl %al, %eax
1755 %and.i.i = and <2 x i64> %__B, %__A
1756 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1757 %1 = bitcast i8 %__U to <8 x i1>
1758 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1759 %2 = and <2 x i1> %0, %extract.i
1760 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1761 %4 = bitcast <8 x i1> %3 to i8
1765 define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1766 ; CHECK-LABEL: test_mm256_testn_epi64_mask:
1767 ; CHECK: # %bb.0: # %entry
1768 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0
1769 ; CHECK-NEXT: kmovw %k0, %eax
1770 ; CHECK-NEXT: movzbl %al, %eax
1771 ; CHECK-NEXT: vzeroupper
1772 ; CHECK-NEXT: ret{{[l|q]}}
1774 %and.i.i = and <4 x i64> %__B, %__A
1775 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1776 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1777 %2 = bitcast <8 x i1> %1 to i8
1781 define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1782 ; X86-LABEL: test_mm256_mask_testn_epi64_mask:
1783 ; X86: # %bb.0: # %entry
1784 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1785 ; X86-NEXT: kmovw %eax, %k1
1786 ; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1787 ; X86-NEXT: kmovw %k0, %eax
1788 ; X86-NEXT: movzbl %al, %eax
1789 ; X86-NEXT: vzeroupper
1792 ; X64-LABEL: test_mm256_mask_testn_epi64_mask:
1793 ; X64: # %bb.0: # %entry
1794 ; X64-NEXT: kmovw %edi, %k1
1795 ; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1796 ; X64-NEXT: kmovw %k0, %eax
1797 ; X64-NEXT: movzbl %al, %eax
1798 ; X64-NEXT: vzeroupper
1801 %and.i.i = and <4 x i64> %__B, %__A
1802 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1803 %1 = bitcast i8 %__U to <8 x i1>
1804 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1805 %2 = and <4 x i1> %0, %extract.i
1806 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1807 %4 = bitcast <8 x i1> %3 to i8
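
; set1 tests. The splatted constant 5 is loaded from the constant pool on X86
; and rip-relatively on X64; set1_epi64 broadcasts a GPR with VPBROADCASTQ on
; X64, while 32-bit targets must first reassemble the i64 argument from two
; stack slots with VMOVD/VPINSRD.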
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__O to <4 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm256_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__O to <8 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm256_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
%vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
%vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
%vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
%vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
ret <4 x i64> %1
}
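
; Register broadcast tests. The unmasked forms are free to pick an FP-domain
; encoding (VBROADCASTSS/VMOVDDUP); the masked forms are expected to use the
; integer VPBROADCASTD/VPBROADCASTQ encodings instead.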
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
%res1 = bitcast <4 x i32> %res0 to <2 x i64>
ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <2 x i64> %__O to <4 x i32>
%2 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = bitcast <8 x i32> %res0 to <4 x i64>
ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm256_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast i8 %a1 to <8 x i1>
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
%res2 = bitcast <8 x i32> %res1 to <4 x i64>
ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
%res2 = bitcast <8 x i32> %res1 to <4 x i64>
ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
ret <2 x i64> %1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
ret <4 x i64> %1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_mask_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm256_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
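
; Duplication shuffles. The shufflevector masks below ([0,0], [1,1,3,3],
; [0,0,2,2], ...) correspond exactly to MOVDDUP/MOVSHDUP/MOVSLDUP, so the
; masked variants reuse the same opcode with {%k1} or {%k1} {z} added.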
define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
ret <2 x double> %1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_moveldup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%0 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X86-LABEL: test_mm256_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
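
; permutex selects 64-bit elements by immediate. The unmasked permute may come
; out as VPERMPD even for integer vectors; the masked forms are expected to
; use VPERMQ or VPERMPD according to the element type.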
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_mask_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__M to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__M to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
ret <4 x i64> %1
}

define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_mask_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
; X86-LABEL: test_mm256_maskz_permutex_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT: retq
entry:
%perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
ret <4 x double> %1
}
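
; SHUFPD/SHUFPS immediate shuffles. A <1, 3> mask on two v2f64 sources is the
; high-half interleave, so the 128-bit shuffle_pd tests expect VUNPCKHPD
; rather than a generic VSHUFPD.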
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_mm_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT: retq
entry:
%shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
%shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
ret <2 x double> %1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
ret <4 x double> %1
}

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_mm_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_shuffle_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
entry:
%shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
%0 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
ret <4 x float> %1
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X86-LABEL: test_mm256_mask_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
%res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X86-LABEL: test_mm256_maskz_shuffle_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
%res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
ret <8 x float> %res1
}
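
; mul_epi32/mul_epu32 multiply the low 32 bits of each 64-bit lane. Sign
; extension is written as a shl/ashr-by-32 pair and zero extension as an AND
; with 4294967295; both idioms should fold into a single VPMULDQ or VPMULUDQ.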
define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
ret <4 x i64> %tmp6
}

define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
%tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
ret <4 x i64> %tmp6
}

define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
ret <2 x i64> %tmp6
}

define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
%tmp = shl <2 x i64> %__X, <i64 32, i64 32>
%tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
%tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
%tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
%tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
%tmp5 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
ret <2 x i64> %tmp6
}

define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
ret <4 x i64> %tmp4
}

define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
; X86-LABEL: test_mm256_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <4 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
ret <4 x i64> %tmp4
}

define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
ret <2 x i64> %tmp4
}

define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
; X86-LABEL: test_mm_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
%tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <2 x i64> %tmp1, %tmp
%tmp3 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
ret <2 x i64> %tmp4
}
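
; Truncating integer conversions. The 128-bit narrowings can be a single
; in-lane shuffle (VPSHUFB/VINSERTPS); the 256-bit versions use the dedicated
; VPMOVDW down-conversion, with the masked form going through the
; @llvm.x86.avx512.mask.pmov.dw.256 intrinsic.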
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = trunc <4 x i32> %0 to <4 x i8>
%shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%conv.i = trunc <4 x i32> %0 to <4 x i16>
%shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i8>
%shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%0 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i16>
%shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
%0 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i32>
%shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i32> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdw %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i = trunc <8 x i32> %0 to <8 x i16>
%1 = bitcast <8 x i16> %conv.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <2 x i64> %__O to <8 x i16>
%2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
%3 = bitcast <8 x i16> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
%2 = bitcast <8 x i16> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqd %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast <4 x i32> %conv.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%conv.i.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast <2 x i64> %__O to <4 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%conv.i.i = trunc <4 x i64> %__A to <4 x i32>
%0 = bitcast i8 %__M to <8 x i1>
%extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
%2 = bitcast <4 x i32> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqb %ymm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i8>
%shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %ymm0, %xmm0
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <4 x i64> %__A to <4 x i16>
%shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <8 x i16> %shuf.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %ymm0, %xmm0
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%conv.i = trunc <8 x i32> %0 to <8 x i8>
%shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%1 = bitcast <16 x i8> %shuf.i to <2 x i64>
ret <2 x i64> %1
}
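; vpternlog computes an arbitrary three-input boolean function: the imm8 is a
; truth table indexed by the corresponding bits of the three sources. imm8 = 4
; (0b00000100) selects the single minterm ~A & B & ~C, where A is the
; destination operand.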
define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__B to <4 x i32>
%2 = bitcast <2 x i64> %__C to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = bitcast <4 x i64> %__C to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; CHECK-LABEL: test_mm_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
ret <2 x i64> %0
}

declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2

define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; CHECK-LABEL: test_mm256_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
ret <4 x i64> %0
}

declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2

define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
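; permutex2var: vpermi2* overwrites the index register and vpermt2* overwrites
; the first table register. The mask2 variants below blend with the index
; operand %__I, so they select the vpermi2 form plus a register move; the
; mask/maskz variants further down select the vpermt2 form instead.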
define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast <2 x i64> %__I to <2 x double>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
ret <2 x double> %3
}

define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovapd %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovapd %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast <4 x i64> %__I to <4 x double>
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
ret <4 x double> %3
}

define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast <2 x i64> %__I to <4 x float>
%3 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
ret <4 x float> %4
}

define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovaps %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovaps %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast <4 x i64> %__I to <8 x float>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
ret <8 x float> %4
}

define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
ret <4 x i64> %2
}
define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}

define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast <2 x i64> %__I to <4 x i32>
%2 = bitcast <2 x i64> %__B to <4 x i32>
%3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
%6 = bitcast <4 x i32> %5 to <2 x i64>
ret <2 x i64> %6
}

define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}

define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__I to <8 x i32>
%2 = bitcast <4 x i64> %__B to <8 x i32>
%3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
%6 = bitcast <8 x i32> %5 to <4 x i64>
ret <4 x i64> %6
}

define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; CHECK-LABEL: test_mm_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
ret <2 x double> %0
}

define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
ret <2 x double> %2
}

define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
ret <4 x double> %0
}

define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
ret <4 x double> %2
}

define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; CHECK-LABEL: test_mm_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
ret <4 x float> %1
}

define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
ret <4 x float> %3
}

define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__I to <4 x i32>
%1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
ret <4 x float> %3
}

define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
ret <8 x float> %1
}

define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
ret <8 x float> %3
}

define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__I to <8 x i32>
%1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
ret <8 x float> %3
}

define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}
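; Masked FMA: the mask form blends the result over the first multiplicand, the
; mask3 form over the addend, and the maskz form with zero. The 132/213/231
; encodings are picked so that the operand being blended over is the one that
; already lives in the destination register.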
4246 define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4247 ; X86-LABEL: test_mm_mask_fmadd_pd:
4248 ; X86: # %bb.0: # %entry
4249 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4250 ; X86-NEXT: kmovw %eax, %k1
4251 ; X86-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
4254 ; X64-LABEL: test_mm_mask_fmadd_pd:
4255 ; X64: # %bb.0: # %entry
4256 ; X64-NEXT: kmovw %edi, %k1
4257 ; X64-NEXT: vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
4260 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4261 %1 = bitcast i8 %__U to <8 x i1>
4262 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4263 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4267 define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4268 ; X86-LABEL: test_mm_mask_fmsub_pd:
4269 ; X86: # %bb.0: # %entry
4270 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4271 ; X86-NEXT: kmovw %eax, %k1
4272 ; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
4275 ; X64-LABEL: test_mm_mask_fmsub_pd:
4276 ; X64: # %bb.0: # %entry
4277 ; X64-NEXT: kmovw %edi, %k1
4278 ; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
4281 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4282 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4283 %1 = bitcast i8 %__U to <8 x i1>
4284 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4285 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4289 define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4290 ; X86-LABEL: test_mm_mask3_fmadd_pd:
4291 ; X86: # %bb.0: # %entry
4292 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4293 ; X86-NEXT: kmovw %eax, %k1
4294 ; X86-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4295 ; X86-NEXT: vmovapd %xmm2, %xmm0
4298 ; X64-LABEL: test_mm_mask3_fmadd_pd:
4299 ; X64: # %bb.0: # %entry
4300 ; X64-NEXT: kmovw %edi, %k1
4301 ; X64-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
4302 ; X64-NEXT: vmovapd %xmm2, %xmm0
4305 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4306 %1 = bitcast i8 %__U to <8 x i1>
4307 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4308 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4312 define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4313 ; X86-LABEL: test_mm_mask3_fnmadd_pd:
4314 ; X86: # %bb.0: # %entry
4315 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4316 ; X86-NEXT: kmovw %eax, %k1
4317 ; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
4318 ; X86-NEXT: vmovapd %xmm2, %xmm0
4321 ; X64-LABEL: test_mm_mask3_fnmadd_pd:
4322 ; X64: # %bb.0: # %entry
4323 ; X64-NEXT: kmovw %edi, %k1
4324 ; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
4325 ; X64-NEXT: vmovapd %xmm2, %xmm0
4328 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4329 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4330 %1 = bitcast i8 %__U to <8 x i1>
4331 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4332 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4336 define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4337 ; X86-LABEL: test_mm_maskz_fmadd_pd:
4338 ; X86: # %bb.0: # %entry
4339 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4340 ; X86-NEXT: kmovw %eax, %k1
4341 ; X86-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4344 ; X64-LABEL: test_mm_maskz_fmadd_pd:
4345 ; X64: # %bb.0: # %entry
4346 ; X64-NEXT: kmovw %edi, %k1
4347 ; X64-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4350 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4351 %1 = bitcast i8 %__U to <8 x i1>
4352 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4353 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4357 define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4358 ; X86-LABEL: test_mm_maskz_fmsub_pd:
4359 ; X86: # %bb.0: # %entry
4360 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4361 ; X86-NEXT: kmovw %eax, %k1
4362 ; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
4365 ; X64-LABEL: test_mm_maskz_fmsub_pd:
4366 ; X64: # %bb.0: # %entry
4367 ; X64-NEXT: kmovw %edi, %k1
4368 ; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
4371 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4372 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4373 %1 = bitcast i8 %__U to <8 x i1>
4374 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4375 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4379 define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4380 ; X86-LABEL: test_mm_maskz_fnmadd_pd:
4381 ; X86: # %bb.0: # %entry
4382 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4383 ; X86-NEXT: kmovw %eax, %k1
4384 ; X86-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
4387 ; X64-LABEL: test_mm_maskz_fnmadd_pd:
4388 ; X64: # %bb.0: # %entry
4389 ; X64-NEXT: kmovw %edi, %k1
4390 ; X64-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
4393 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4394 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4395 %1 = bitcast i8 %__U to <8 x i1>
4396 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4397 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4401 define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4402 ; X86-LABEL: test_mm_maskz_fnmsub_pd:
4403 ; X86: # %bb.0: # %entry
4404 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4405 ; X86-NEXT: kmovw %eax, %k1
4406 ; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
4409 ; X64-LABEL: test_mm_maskz_fnmsub_pd:
4410 ; X64: # %bb.0: # %entry
4411 ; X64-NEXT: kmovw %edi, %k1
4412 ; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
4415 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4416 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4417 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
4418 %1 = bitcast i8 %__U to <8 x i1>
4419 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4420 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4424 define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4425 ; X86-LABEL: test_mm256_mask_fmadd_pd:
4426 ; X86: # %bb.0: # %entry
4427 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4428 ; X86-NEXT: kmovw %eax, %k1
4429 ; X86-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
4432 ; X64-LABEL: test_mm256_mask_fmadd_pd:
4433 ; X64: # %bb.0: # %entry
4434 ; X64-NEXT: kmovw %edi, %k1
4435 ; X64-NEXT: vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
4438 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4439 %1 = bitcast i8 %__U to <8 x i1>
4440 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4441 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4445 define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4446 ; X86-LABEL: test_mm256_mask_fmsub_pd:
4447 ; X86: # %bb.0: # %entry
4448 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4449 ; X86-NEXT: kmovw %eax, %k1
4450 ; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
4453 ; X64-LABEL: test_mm256_mask_fmsub_pd:
4454 ; X64: # %bb.0: # %entry
4455 ; X64-NEXT: kmovw %edi, %k1
4456 ; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
4459 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4460 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
4461 %1 = bitcast i8 %__U to <8 x i1>
4462 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4463 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4467 define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4468 ; X86-LABEL: test_mm256_mask3_fmadd_pd:
4469 ; X86: # %bb.0: # %entry
4470 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4471 ; X86-NEXT: kmovw %eax, %k1
4472 ; X86-NEXT: vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
4473 ; X86-NEXT: vmovapd %ymm2, %ymm0
4476 ; X64-LABEL: test_mm256_mask3_fmadd_pd:
4477 ; X64: # %bb.0: # %entry
4478 ; X64-NEXT: kmovw %edi, %k1
4479 ; X64-NEXT: vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
4480 ; X64-NEXT: vmovapd %ymm2, %ymm0
4483 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4484 %1 = bitcast i8 %__U to <8 x i1>
4485 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4486 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
4490 define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4491 ; X86-LABEL: test_mm256_mask3_fnmadd_pd:
4492 ; X86: # %bb.0: # %entry
4493 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4494 ; X86-NEXT: kmovw %eax, %k1
4495 ; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
4496 ; X86-NEXT: vmovapd %ymm2, %ymm0
4499 ; X64-LABEL: test_mm256_mask3_fnmadd_pd:
4500 ; X64: # %bb.0: # %entry
4501 ; X64-NEXT: kmovw %edi, %k1
4502 ; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
4503 ; X64-NEXT: vmovapd %ymm2, %ymm0
4506 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4507 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
4508 %1 = bitcast i8 %__U to <8 x i1>
4509 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4510 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

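; There is no separate fmsub/fnmadd/fnmsub intrinsic: each negated operand is
; written as an fsub from -0.0 (a bare sign flip, i.e. an fneg) feeding
; @llvm.fma, and instruction selection folds the negations into
; vfmsub/vfnmadd/vfnmsub.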
define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

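; In these masked forms %__A is both a multiplicand and the select
; passthrough, so codegen can keep it in xmm0 and pick the 132 form with no
; extra register copy.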
define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

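; The mask3 variants select %__C as the passthrough, so codegen picks the 231
; form that accumulates into the register holding %__C and then moves the
; result into the return register with vmovaps/vmovapd.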
define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

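; fmaddsub also has no single intrinsic: the IR computes both a*b+c and a*b-c
; with two @llvm.fma calls and blends them with a shufflevector that takes
; even lanes from the subtract and odd lanes from the add, exactly matching
; vfmaddsub.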
define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
  ret <2 x double> %5
}

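; fmsubadd is the mirror image: the blend takes even lanes from the a*b+c
; result and odd lanes from a*b-c, so even lanes add and odd lanes subtract.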
define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
  ret <2 x double> %4
}

define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
  ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
  %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
  ret <2 x double> %5
}

define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
  ret <4 x double> %5
}

define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
  ret <4 x double> %4
}

define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
  ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
  ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
  ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
  ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %4
}

define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
  ret <8 x float> %4
}

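; In the masked fnmadd/fnmsub tests below the fneg lands on the multiplicand
; %__B rather than %__A (unlike the maskz forms earlier, where %__A was
; negated); since the whole product is negated, -(a*b) = a*(-b), and the
; backend folds the sign flips into a single vfnmadd/vfnmsub either way.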
define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

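; The fnmsub forms negate both the product and the addend: %__B and %__C each
; pass through an fsub from -0.0 before the @llvm.fma call, giving
; -(a*b) - c.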
5741 define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5742 ; X86-LABEL: test_mm_mask3_fnmsub_pd:
5743 ; X86: # %bb.0: # %entry
5744 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5745 ; X86-NEXT: kmovw %eax, %k1
5746 ; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5747 ; X86-NEXT: vmovapd %xmm2, %xmm0
5750 ; X64-LABEL: test_mm_mask3_fnmsub_pd:
5751 ; X64: # %bb.0: # %entry
5752 ; X64-NEXT: kmovw %edi, %k1
5753 ; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5754 ; X64-NEXT: vmovapd %xmm2, %xmm0
5757 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5758 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5759 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5760 %1 = bitcast i8 %__U to <8 x i1>
5761 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5762 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
5766 define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5767 ; X86-LABEL: test_mm256_mask_fnmsub_pd:
5768 ; X86: # %bb.0: # %entry
5769 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5770 ; X86-NEXT: kmovw %eax, %k1
5771 ; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
5774 ; X64-LABEL: test_mm256_mask_fnmsub_pd:
5775 ; X64: # %bb.0: # %entry
5776 ; X64-NEXT: kmovw %edi, %k1
5777 ; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
5780 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5781 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5782 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5783 %1 = bitcast i8 %__U to <8 x i1>
5784 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5785 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
5789 define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5790 ; X86-LABEL: test_mm256_mask3_fnmsub_pd:
5791 ; X86: # %bb.0: # %entry
5792 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5793 ; X86-NEXT: kmovw %eax, %k1
5794 ; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
5795 ; X86-NEXT: vmovapd %ymm2, %ymm0
5798 ; X64-LABEL: test_mm256_mask3_fnmsub_pd:
5799 ; X64: # %bb.0: # %entry
5800 ; X64-NEXT: kmovw %edi, %k1
5801 ; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
5802 ; X64-NEXT: vmovapd %ymm2, %ymm0
5805 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5806 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5807 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5808 %1 = bitcast i8 %__U to <8 x i1>
5809 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5810 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
5814 define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5815 ; X86-LABEL: test_mm_mask_fnmsub_ps:
5816 ; X86: # %bb.0: # %entry
5817 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5818 ; X86-NEXT: kmovw %eax, %k1
5819 ; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
5822 ; X64-LABEL: test_mm_mask_fnmsub_ps:
5823 ; X64: # %bb.0: # %entry
5824 ; X64-NEXT: kmovw %edi, %k1
5825 ; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
5828 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5829 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5830 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5831 %1 = bitcast i8 %__U to <8 x i1>
5832 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5833 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
5837 define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5838 ; X86-LABEL: test_mm_mask3_fnmsub_ps:
5839 ; X86: # %bb.0: # %entry
5840 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5841 ; X86-NEXT: kmovw %eax, %k1
5842 ; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5843 ; X86-NEXT: vmovaps %xmm2, %xmm0
5846 ; X64-LABEL: test_mm_mask3_fnmsub_ps:
5847 ; X64: # %bb.0: # %entry
5848 ; X64-NEXT: kmovw %edi, %k1
5849 ; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5850 ; X64-NEXT: vmovaps %xmm2, %xmm0
5853 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5854 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5855 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5856 %1 = bitcast i8 %__U to <8 x i1>
5857 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5858 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
5862 define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5863 ; X86-LABEL: test_mm256_mask_fnmsub_ps:
5864 ; X86: # %bb.0: # %entry
5865 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5866 ; X86-NEXT: kmovw %eax, %k1
5867 ; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5870 ; X64-LABEL: test_mm256_mask_fnmsub_ps:
5871 ; X64: # %bb.0: # %entry
5872 ; X64-NEXT: kmovw %edi, %k1
5873 ; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5876 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5877 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5878 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5879 %1 = bitcast i8 %__U to <8 x i1>
5880 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
5884 define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5885 ; X86-LABEL: test_mm256_mask3_fnmsub_ps:
5886 ; X86: # %bb.0: # %entry
5887 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5888 ; X86-NEXT: kmovw %eax, %k1
5889 ; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5890 ; X86-NEXT: vmovaps %ymm2, %ymm0
5893 ; X64-LABEL: test_mm256_mask3_fnmsub_ps:
5894 ; X64: # %bb.0: # %entry
5895 ; X64-NEXT: kmovw %edi, %k1
5896 ; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5897 ; X64-NEXT: vmovaps %ymm2, %ymm0
5900 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5901 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5902 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5903 %1 = bitcast i8 %__U to <8 x i1>
5904 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
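; NOTE: The expandloadu tests below exercise @llvm.masked.expandload.*, which
; reads consecutive elements from %__P and expands them into the lanes whose
; mask bit is set; llc should select vexpandpd/vexpandps/vpexpandq/vpexpandd
; with {%k1} (plus {z} for the maskz forms). Illustrative shape of one call,
; with made-up value names, not a checked test:
;   %m = bitcast i8 %u to <8 x i1>
;   %e = shufflevector <8 x i1> %m, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
;   %v = call <2 x double> @llvm.masked.expandload.v2f64(double* %p, <2 x i1> %e, <2 x double> %passthru)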
5908 define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
5909 ; X86-LABEL: test_mm_mask_expandloadu_pd:
5910 ; X86: # %bb.0: # %entry
5911 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5912 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
5913 ; X86-NEXT: kmovw %ecx, %k1
5914 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1}
5917 ; X64-LABEL: test_mm_mask_expandloadu_pd:
5918 ; X64: # %bb.0: # %entry
5919 ; X64-NEXT: kmovw %edi, %k1
5920 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1}
5923 %0 = bitcast i8* %__P to double*
5924 %1 = bitcast i8 %__U to <8 x i1>
5925 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5926 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
5930 define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
5931 ; X86-LABEL: test_mm_maskz_expandloadu_pd:
5932 ; X86: # %bb.0: # %entry
5933 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5934 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
5935 ; X86-NEXT: kmovw %ecx, %k1
5936 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z}
5939 ; X64-LABEL: test_mm_maskz_expandloadu_pd:
5940 ; X64: # %bb.0: # %entry
5941 ; X64-NEXT: kmovw %edi, %k1
5942 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z}
5945 %0 = bitcast i8* %__P to double*
5946 %1 = bitcast i8 %__U to <8 x i1>
5947 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5948 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
5952 define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
5953 ; X86-LABEL: test_mm256_mask_expandloadu_pd:
5954 ; X86: # %bb.0: # %entry
5955 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5956 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
5957 ; X86-NEXT: kmovw %ecx, %k1
5958 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1}
5961 ; X64-LABEL: test_mm256_mask_expandloadu_pd:
5962 ; X64: # %bb.0: # %entry
5963 ; X64-NEXT: kmovw %edi, %k1
5964 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1}
5967 %0 = bitcast i8* %__P to double*
5968 %1 = bitcast i8 %__U to <8 x i1>
5969 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5970 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
5974 define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
5975 ; X86-LABEL: test_mm256_maskz_expandloadu_pd:
5976 ; X86: # %bb.0: # %entry
5977 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
5978 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
5979 ; X86-NEXT: kmovw %ecx, %k1
5980 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z}
5983 ; X64-LABEL: test_mm256_maskz_expandloadu_pd:
5984 ; X64: # %bb.0: # %entry
5985 ; X64-NEXT: kmovw %edi, %k1
5986 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z}
5989 %0 = bitcast i8* %__P to double*
5990 %1 = bitcast i8 %__U to <8 x i1>
5991 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5992 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
5996 define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
5997 ; X86-LABEL: test_mm_mask_expandloadu_epi64:
5998 ; X86: # %bb.0: # %entry
5999 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6000 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6001 ; X86-NEXT: kmovw %ecx, %k1
6002 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1}
6005 ; X64-LABEL: test_mm_mask_expandloadu_epi64:
6006 ; X64: # %bb.0: # %entry
6007 ; X64-NEXT: kmovw %edi, %k1
6008 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1}
6011 %0 = bitcast i8* %__P to i64*
6012 %1 = bitcast i8 %__U to <8 x i1>
6013 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6014 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
6018 define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6019 ; X86-LABEL: test_mm_maskz_expandloadu_epi64:
6020 ; X86: # %bb.0: # %entry
6021 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6022 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6023 ; X86-NEXT: kmovw %ecx, %k1
6024 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z}
6027 ; X64-LABEL: test_mm_maskz_expandloadu_epi64:
6028 ; X64: # %bb.0: # %entry
6029 ; X64-NEXT: kmovw %edi, %k1
6030 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z}
6033 %0 = bitcast i8* %__P to i64*
6034 %1 = bitcast i8 %__U to <8 x i1>
6035 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6036 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
6040 define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6041 ; X86-LABEL: test_mm256_mask_expandloadu_epi64:
6042 ; X86: # %bb.0: # %entry
6043 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6044 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6045 ; X86-NEXT: kmovw %ecx, %k1
6046 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1}
6049 ; X64-LABEL: test_mm256_mask_expandloadu_epi64:
6050 ; X64: # %bb.0: # %entry
6051 ; X64-NEXT: kmovw %edi, %k1
6052 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1}
6055 %0 = bitcast i8* %__P to i64*
6056 %1 = bitcast i8 %__U to <8 x i1>
6057 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6058 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
6062 define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6063 ; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
6064 ; X86: # %bb.0: # %entry
6065 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6066 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6067 ; X86-NEXT: kmovw %ecx, %k1
6068 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z}
6071 ; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
6072 ; X64: # %bb.0: # %entry
6073 ; X64-NEXT: kmovw %edi, %k1
6074 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z}
6077 %0 = bitcast i8* %__P to i64*
6078 %1 = bitcast i8 %__U to <8 x i1>
6079 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6080 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
6084 define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
6085 ; X86-LABEL: test_mm_mask_expandloadu_ps:
6086 ; X86: # %bb.0: # %entry
6087 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6088 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6089 ; X86-NEXT: kmovw %ecx, %k1
6090 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1}
6093 ; X64-LABEL: test_mm_mask_expandloadu_ps:
6094 ; X64: # %bb.0: # %entry
6095 ; X64-NEXT: kmovw %edi, %k1
6096 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1}
6099 %0 = bitcast i8* %__P to float*
6100 %1 = bitcast i8 %__U to <8 x i1>
6101 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6102 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
6106 define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
6107 ; X86-LABEL: test_mm_maskz_expandloadu_ps:
6108 ; X86: # %bb.0: # %entry
6109 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6110 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6111 ; X86-NEXT: kmovw %ecx, %k1
6112 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z}
6115 ; X64-LABEL: test_mm_maskz_expandloadu_ps:
6116 ; X64: # %bb.0: # %entry
6117 ; X64-NEXT: kmovw %edi, %k1
6118 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z}
6121 %0 = bitcast i8* %__P to float*
6122 %1 = bitcast i8 %__U to <8 x i1>
6123 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6124 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
6128 define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
6129 ; X86-LABEL: test_mm256_mask_expandloadu_ps:
6130 ; X86: # %bb.0: # %entry
6131 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6132 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6133 ; X86-NEXT: kmovw %ecx, %k1
6134 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1}
6137 ; X64-LABEL: test_mm256_mask_expandloadu_ps:
6138 ; X64: # %bb.0: # %entry
6139 ; X64-NEXT: kmovw %edi, %k1
6140 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1}
6143 %0 = bitcast i8* %__P to float*
6144 %1 = bitcast i8 %__U to <8 x i1>
6145 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
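; NOTE: With eight f32 lanes the <8 x i1> mask is used directly; the
; shufflevector narrowing step only appears in the 2- and 4-lane tests.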
6149 define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
6150 ; X86-LABEL: test_mm256_maskz_expandloadu_ps:
6151 ; X86: # %bb.0: # %entry
6152 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6153 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6154 ; X86-NEXT: kmovw %ecx, %k1
6155 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z}
6158 ; X64-LABEL: test_mm256_maskz_expandloadu_ps:
6159 ; X64: # %bb.0: # %entry
6160 ; X64-NEXT: kmovw %edi, %k1
6161 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z}
6164 %0 = bitcast i8* %__P to float*
6165 %1 = bitcast i8 %__U to <8 x i1>
6166 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
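; NOTE: The epi32 tests below additionally bitcast between <2 x i64>/<4 x i64>
; (the IR types of __m128i/__m256i) and <4 x i32>/<8 x i32>, since the
; expandload operates on i32 elements.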
6170 define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6171 ; X86-LABEL: test_mm_mask_expandloadu_epi32:
6172 ; X86: # %bb.0: # %entry
6173 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6174 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6175 ; X86-NEXT: kmovw %ecx, %k1
6176 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1}
6179 ; X64-LABEL: test_mm_mask_expandloadu_epi32:
6180 ; X64: # %bb.0: # %entry
6181 ; X64-NEXT: kmovw %edi, %k1
6182 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1}
6185 %0 = bitcast <2 x i64> %__W to <4 x i32>
6186 %1 = bitcast i8* %__P to i32*
6187 %2 = bitcast i8 %__U to <8 x i1>
6188 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6189 %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
6190 %4 = bitcast <4 x i32> %3 to <2 x i64>
6194 define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
6195 ; X86-LABEL: test_mm_maskz_expandloadu_epi32:
6196 ; X86: # %bb.0: # %entry
6197 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6198 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6199 ; X86-NEXT: kmovw %ecx, %k1
6200 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z}
6203 ; X64-LABEL: test_mm_maskz_expandloadu_epi32:
6204 ; X64: # %bb.0: # %entry
6205 ; X64-NEXT: kmovw %edi, %k1
6206 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z}
6209 %0 = bitcast i8* %__P to i32*
6210 %1 = bitcast i8 %__U to <8 x i1>
6211 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6212 %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
6213 %3 = bitcast <4 x i32> %2 to <2 x i64>
6217 define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6218 ; X86-LABEL: test_mm256_mask_expandloadu_epi32:
6219 ; X86: # %bb.0: # %entry
6220 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6221 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6222 ; X86-NEXT: kmovw %ecx, %k1
6223 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1}
6226 ; X64-LABEL: test_mm256_mask_expandloadu_epi32:
6227 ; X64: # %bb.0: # %entry
6228 ; X64-NEXT: kmovw %edi, %k1
6229 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1}
6232 %0 = bitcast <4 x i64> %__W to <8 x i32>
6233 %1 = bitcast i8* %__P to i32*
6234 %2 = bitcast i8 %__U to <8 x i1>
6235 %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
6236 %4 = bitcast <8 x i32> %3 to <4 x i64>
6240 define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
6241 ; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
6242 ; X86: # %bb.0: # %entry
6243 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6244 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6245 ; X86-NEXT: kmovw %ecx, %k1
6246 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z}
6249 ; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
6250 ; X64: # %bb.0: # %entry
6251 ; X64-NEXT: kmovw %edi, %k1
6252 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z}
6255 %0 = bitcast i8* %__P to i32*
6256 %1 = bitcast i8 %__U to <8 x i1>
6257 %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
6258 %3 = bitcast <8 x i32> %2 to <4 x i64>
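; NOTE: The compressstoreu tests below are the store-side mirror:
; @llvm.masked.compressstore.* packs the active lanes of %__A and writes them
; to consecutive memory at %__P, selected as vcompresspd/vcompressps/
; vpcompressq/vpcompressd with {%k1}. The 256-bit variants also check the
; vzeroupper emitted before returning. Illustrative shape with made-up value
; names, not a checked test:
;   %m = bitcast i8 %u to <8 x i1>
;   %e = shufflevector <8 x i1> %m, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
;   call void @llvm.masked.compressstore.v2f64(<2 x double> %v, double* %p, <2 x i1> %e)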
6262 define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
6263 ; X86-LABEL: test_mm_mask_compressstoreu_pd:
6264 ; X86: # %bb.0: # %entry
6265 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6266 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6267 ; X86-NEXT: kmovw %eax, %k1
6268 ; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1}
6271 ; X64-LABEL: test_mm_mask_compressstoreu_pd:
6272 ; X64: # %bb.0: # %entry
6273 ; X64-NEXT: kmovw %esi, %k1
6274 ; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1}
6277 %0 = bitcast i8* %__P to double*
6278 %1 = bitcast i8 %__U to <8 x i1>
6279 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6280 tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
6284 define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
6285 ; X86-LABEL: test_mm256_mask_compressstoreu_pd:
6286 ; X86: # %bb.0: # %entry
6287 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6288 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6289 ; X86-NEXT: kmovw %eax, %k1
6290 ; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1}
6291 ; X86-NEXT: vzeroupper
6294 ; X64-LABEL: test_mm256_mask_compressstoreu_pd:
6295 ; X64: # %bb.0: # %entry
6296 ; X64-NEXT: kmovw %esi, %k1
6297 ; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1}
6298 ; X64-NEXT: vzeroupper
6301 %0 = bitcast i8* %__P to double*
6302 %1 = bitcast i8 %__U to <8 x i1>
6303 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6304 tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
6308 define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
6309 ; X86-LABEL: test_mm_mask_compressstoreu_epi64:
6310 ; X86: # %bb.0: # %entry
6311 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6312 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6313 ; X86-NEXT: kmovw %eax, %k1
6314 ; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1}
6317 ; X64-LABEL: test_mm_mask_compressstoreu_epi64:
6318 ; X64: # %bb.0: # %entry
6319 ; X64-NEXT: kmovw %esi, %k1
6320 ; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
6323 %0 = bitcast i8* %__P to i64*
6324 %1 = bitcast i8 %__U to <8 x i1>
6325 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6326 tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
6330 define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
6331 ; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
6332 ; X86: # %bb.0: # %entry
6333 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6334 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6335 ; X86-NEXT: kmovw %eax, %k1
6336 ; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1}
6337 ; X86-NEXT: vzeroupper
6340 ; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
6341 ; X64: # %bb.0: # %entry
6342 ; X64-NEXT: kmovw %esi, %k1
6343 ; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
6344 ; X64-NEXT: vzeroupper
6347 %0 = bitcast i8* %__P to i64*
6348 %1 = bitcast i8 %__U to <8 x i1>
6349 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6350 tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
6354 define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
6355 ; X86-LABEL: test_mm_mask_compressstoreu_ps:
6356 ; X86: # %bb.0: # %entry
6357 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6358 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6359 ; X86-NEXT: kmovw %eax, %k1
6360 ; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1}
6363 ; X64-LABEL: test_mm_mask_compressstoreu_ps:
6364 ; X64: # %bb.0: # %entry
6365 ; X64-NEXT: kmovw %esi, %k1
6366 ; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1}
6369 %0 = bitcast i8* %__P to float*
6370 %1 = bitcast i8 %__U to <8 x i1>
6371 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6372 tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
6376 define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
6377 ; X86-LABEL: test_mm256_mask_compressstoreu_ps:
6378 ; X86: # %bb.0: # %entry
6379 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6380 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6381 ; X86-NEXT: kmovw %eax, %k1
6382 ; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1}
6383 ; X86-NEXT: vzeroupper
6386 ; X64-LABEL: test_mm256_mask_compressstoreu_ps:
6387 ; X64: # %bb.0: # %entry
6388 ; X64-NEXT: kmovw %esi, %k1
6389 ; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1}
6390 ; X64-NEXT: vzeroupper
6393 %0 = bitcast i8* %__P to float*
6394 %1 = bitcast i8 %__U to <8 x i1>
6395 tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
6399 define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
6400 ; X86-LABEL: test_mm_mask_compressstoreu_epi32:
6401 ; X86: # %bb.0: # %entry
6402 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6403 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6404 ; X86-NEXT: kmovw %eax, %k1
6405 ; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1}
6408 ; X64-LABEL: test_mm_mask_compressstoreu_epi32:
6409 ; X64: # %bb.0: # %entry
6410 ; X64-NEXT: kmovw %esi, %k1
6411 ; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
6414 %0 = bitcast <2 x i64> %__A to <4 x i32>
6415 %1 = bitcast i8* %__P to i32*
6416 %2 = bitcast i8 %__U to <8 x i1>
6417 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6418 tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
6422 define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
6423 ; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
6424 ; X86: # %bb.0: # %entry
6425 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6426 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6427 ; X86-NEXT: kmovw %eax, %k1
6428 ; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1}
6429 ; X86-NEXT: vzeroupper
6432 ; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
6433 ; X64: # %bb.0: # %entry
6434 ; X64-NEXT: kmovw %esi, %k1
6435 ; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1}
6436 ; X64-NEXT: vzeroupper
6439 %0 = bitcast <4 x i64> %__A to <8 x i32>
6440 %1 = bitcast i8* %__P to i32*
6441 %2 = bitcast i8 %__U to <8 x i1>
6442 tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
6447 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
6448 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
6449 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
6450 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
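; NOTE: The sqrt tests below wrap @llvm.sqrt.* in the same mask/maskz select
; pattern, which should lower to vsqrtpd/vsqrtps with {%k1} (and {z} when
; blending against zeroinitializer). Illustrative shape with made-up value
; names, not a checked test:
;   %s = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
;   %r = select <2 x i1> %e, <2 x double> %s, <2 x double> %w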
6452 define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
6453 ; X86-LABEL: test_mm_mask_sqrt_pd:
6454 ; X86: # %bb.0: # %entry
6455 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6456 ; X86-NEXT: kmovw %eax, %k1
6457 ; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
6460 ; X64-LABEL: test_mm_mask_sqrt_pd:
6461 ; X64: # %bb.0: # %entry
6462 ; X64-NEXT: kmovw %edi, %k1
6463 ; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
6466 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6467 %1 = bitcast i8 %__U to <8 x i1>
6468 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6469 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
6473 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
6475 define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
6476 ; X86-LABEL: test_mm_maskz_sqrt_pd:
6477 ; X86: # %bb.0: # %entry
6478 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6479 ; X86-NEXT: kmovw %eax, %k1
6480 ; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
6483 ; X64-LABEL: test_mm_maskz_sqrt_pd:
6484 ; X64: # %bb.0: # %entry
6485 ; X64-NEXT: kmovw %edi, %k1
6486 ; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
6489 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6490 %1 = bitcast i8 %__U to <8 x i1>
6491 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6492 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
6496 define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
6497 ; X86-LABEL: test_mm256_mask_sqrt_pd:
6498 ; X86: # %bb.0: # %entry
6499 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6500 ; X86-NEXT: kmovw %eax, %k1
6501 ; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
6504 ; X64-LABEL: test_mm256_mask_sqrt_pd:
6505 ; X64: # %bb.0: # %entry
6506 ; X64-NEXT: kmovw %edi, %k1
6507 ; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
6510 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6511 %1 = bitcast i8 %__U to <8 x i1>
6512 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6513 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
6517 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
6519 define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
6520 ; X86-LABEL: test_mm256_maskz_sqrt_pd:
6521 ; X86: # %bb.0: # %entry
6522 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6523 ; X86-NEXT: kmovw %eax, %k1
6524 ; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
6527 ; X64-LABEL: test_mm256_maskz_sqrt_pd:
6528 ; X64: # %bb.0: # %entry
6529 ; X64-NEXT: kmovw %edi, %k1
6530 ; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
6533 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6534 %1 = bitcast i8 %__U to <8 x i1>
6535 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6536 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
6540 define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
6541 ; X86-LABEL: test_mm_mask_sqrt_ps:
6542 ; X86: # %bb.0: # %entry
6543 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6544 ; X86-NEXT: kmovw %eax, %k1
6545 ; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
6548 ; X64-LABEL: test_mm_mask_sqrt_ps:
6549 ; X64: # %bb.0: # %entry
6550 ; X64-NEXT: kmovw %edi, %k1
6551 ; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
6554 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6555 %1 = bitcast i8 %__U to <8 x i1>
6556 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6557 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
6561 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
6563 define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
6564 ; X86-LABEL: test_mm_maskz_sqrt_ps:
6565 ; X86: # %bb.0: # %entry
6566 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6567 ; X86-NEXT: kmovw %eax, %k1
6568 ; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
6571 ; X64-LABEL: test_mm_maskz_sqrt_ps:
6572 ; X64: # %bb.0: # %entry
6573 ; X64-NEXT: kmovw %edi, %k1
6574 ; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
6577 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6578 %1 = bitcast i8 %__U to <8 x i1>
6579 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6580 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
6584 define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
6585 ; X86-LABEL: test_mm256_mask_sqrt_ps:
6586 ; X86: # %bb.0: # %entry
6587 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6588 ; X86-NEXT: kmovw %eax, %k1
6589 ; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
6592 ; X64-LABEL: test_mm256_mask_sqrt_ps:
6593 ; X64: # %bb.0: # %entry
6594 ; X64-NEXT: kmovw %edi, %k1
6595 ; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
6598 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6599 %1 = bitcast i8 %__U to <8 x i1>
6600 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
6604 define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
6605 ; X86-LABEL: test_mm256_maskz_sqrt_ps:
6606 ; X86: # %bb.0: # %entry
6607 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6608 ; X86-NEXT: kmovw %eax, %k1
6609 ; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
6612 ; X64-LABEL: test_mm256_maskz_sqrt_ps:
6613 ; X64: # %bb.0: # %entry
6614 ; X64-NEXT: kmovw %edi, %k1
6615 ; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
6618 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6619 %1 = bitcast i8 %__U to <8 x i1>
6620 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
6624 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
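; NOTE: The rotate tests below express rotate-left as a funnel shift with both
; value operands equal: rol(x, n) == fshl(x, x, n), with the count taken
; modulo the element width. llc matches this to vprold/vprolq for immediate
; counts. Illustrative identity with made-up value names, not a checked test:
;   %rol5 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)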
6626 define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
6627 ; CHECK-LABEL: test_mm_rol_epi32:
6628 ; CHECK: # %bb.0: # %entry
6629 ; CHECK-NEXT: vprold $5, %xmm0, %xmm0
6630 ; CHECK-NEXT: ret{{[l|q]}}
6632 %0 = bitcast <2 x i64> %__A to <4 x i32>
6633 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6634 %2 = bitcast <4 x i32> %1 to <2 x i64>
6638 define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6639 ; X86-LABEL: test_mm_mask_rol_epi32:
6640 ; X86: # %bb.0: # %entry
6641 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6642 ; X86-NEXT: kmovw %eax, %k1
6643 ; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
6646 ; X64-LABEL: test_mm_mask_rol_epi32:
6647 ; X64: # %bb.0: # %entry
6648 ; X64-NEXT: kmovw %edi, %k1
6649 ; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
6652 %0 = bitcast <2 x i64> %__A to <4 x i32>
6653 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6654 %2 = bitcast <2 x i64> %__W to <4 x i32>
6655 %3 = bitcast i8 %__U to <8 x i1>
6656 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6657 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
6658 %5 = bitcast <4 x i32> %4 to <2 x i64>
6662 define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
6663 ; X86-LABEL: test_mm_maskz_rol_epi32:
6664 ; X86: # %bb.0: # %entry
6665 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6666 ; X86-NEXT: kmovw %eax, %k1
6667 ; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
6670 ; X64-LABEL: test_mm_maskz_rol_epi32:
6671 ; X64: # %bb.0: # %entry
6672 ; X64-NEXT: kmovw %edi, %k1
6673 ; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
6676 %0 = bitcast <2 x i64> %__A to <4 x i32>
6677 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6678 %2 = bitcast i8 %__U to <8 x i1>
6679 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6680 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
6681 %4 = bitcast <4 x i32> %3 to <2 x i64>
6685 define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
6686 ; CHECK-LABEL: test_mm256_rol_epi32:
6687 ; CHECK: # %bb.0: # %entry
6688 ; CHECK-NEXT: vprold $5, %ymm0, %ymm0
6689 ; CHECK-NEXT: ret{{[l|q]}}
6691 %0 = bitcast <4 x i64> %__A to <8 x i32>
6692 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6693 %2 = bitcast <8 x i32> %1 to <4 x i64>
6697 define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6698 ; X86-LABEL: test_mm256_mask_rol_epi32:
6699 ; X86: # %bb.0: # %entry
6700 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6701 ; X86-NEXT: kmovw %eax, %k1
6702 ; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
6705 ; X64-LABEL: test_mm256_mask_rol_epi32:
6706 ; X64: # %bb.0: # %entry
6707 ; X64-NEXT: kmovw %edi, %k1
6708 ; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
6711 %0 = bitcast <4 x i64> %__A to <8 x i32>
6712 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6713 %2 = bitcast <4 x i64> %__W to <8 x i32>
6714 %3 = bitcast i8 %__U to <8 x i1>
6715 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
6716 %5 = bitcast <8 x i32> %4 to <4 x i64>
6720 define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
6721 ; X86-LABEL: test_mm256_maskz_rol_epi32:
6722 ; X86: # %bb.0: # %entry
6723 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6724 ; X86-NEXT: kmovw %eax, %k1
6725 ; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
6728 ; X64-LABEL: test_mm256_maskz_rol_epi32:
6729 ; X64: # %bb.0: # %entry
6730 ; X64-NEXT: kmovw %edi, %k1
6731 ; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
6734 %0 = bitcast <4 x i64> %__A to <8 x i32>
6735 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6736 %2 = bitcast i8 %__U to <8 x i1>
6737 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
6738 %4 = bitcast <8 x i32> %3 to <4 x i64>
6742 define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
6743 ; CHECK-LABEL: test_mm_rol_epi64:
6744 ; CHECK: # %bb.0: # %entry
6745 ; CHECK-NEXT: vprolq $5, %xmm0, %xmm0
6746 ; CHECK-NEXT: ret{{[l|q]}}
6748 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
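; NOTE: The epi64 rotate tests need no bitcasts because <2 x i64> and
; <4 x i64> are already the IR types of __m128i and __m256i.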
6752 define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6753 ; X86-LABEL: test_mm_mask_rol_epi64:
6754 ; X86: # %bb.0: # %entry
6755 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6756 ; X86-NEXT: kmovw %eax, %k1
6757 ; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
6760 ; X64-LABEL: test_mm_mask_rol_epi64:
6761 ; X64: # %bb.0: # %entry
6762 ; X64-NEXT: kmovw %edi, %k1
6763 ; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
6766 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6767 %1 = bitcast i8 %__U to <8 x i1>
6768 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6769 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
6773 define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
6774 ; X86-LABEL: test_mm_maskz_rol_epi64:
6775 ; X86: # %bb.0: # %entry
6776 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6777 ; X86-NEXT: kmovw %eax, %k1
6778 ; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
6781 ; X64-LABEL: test_mm_maskz_rol_epi64:
6782 ; X64: # %bb.0: # %entry
6783 ; X64-NEXT: kmovw %edi, %k1
6784 ; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
6787 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6788 %1 = bitcast i8 %__U to <8 x i1>
6789 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6790 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
6794 define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
6795 ; CHECK-LABEL: test_mm256_rol_epi64:
6796 ; CHECK: # %bb.0: # %entry
6797 ; CHECK-NEXT: vprolq $5, %ymm0, %ymm0
6798 ; CHECK-NEXT: ret{{[l|q]}}
6800 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
6804 define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6805 ; X86-LABEL: test_mm256_mask_rol_epi64:
6806 ; X86: # %bb.0: # %entry
6807 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6808 ; X86-NEXT: kmovw %eax, %k1
6809 ; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
6812 ; X64-LABEL: test_mm256_mask_rol_epi64:
6813 ; X64: # %bb.0: # %entry
6814 ; X64-NEXT: kmovw %edi, %k1
6815 ; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
6818 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
6819 %1 = bitcast i8 %__U to <8 x i1>
6820 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6821 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
6825 define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
6826 ; X86-LABEL: test_mm256_maskz_rol_epi64:
6827 ; X86: # %bb.0: # %entry
6828 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6829 ; X86-NEXT: kmovw %eax, %k1
6830 ; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
6833 ; X64-LABEL: test_mm256_maskz_rol_epi64:
6834 ; X64: # %bb.0: # %entry
6835 ; X64-NEXT: kmovw %edi, %k1
6836 ; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
6839 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
6840 %1 = bitcast i8 %__U to <8 x i1>
6841 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6842 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
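; NOTE: The rolv tests below take per-lane rotate counts from a second vector
; operand instead of an immediate, selecting vprolvd/vprolvq. Illustrative
; shape with made-up value names, not a checked test:
;   %r = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %counts)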
6846 define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
6847 ; CHECK-LABEL: test_mm_rolv_epi32:
6848 ; CHECK: # %bb.0: # %entry
6849 ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
6850 ; CHECK-NEXT: ret{{[l|q]}}
6852 %0 = bitcast <2 x i64> %__A to <4 x i32>
6853 %1 = bitcast <2 x i64> %__B to <4 x i32>
6854 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6855 %3 = bitcast <4 x i32> %2 to <2 x i64>
6859 define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6860 ; X86-LABEL: test_mm_mask_rolv_epi32:
6861 ; X86: # %bb.0: # %entry
6862 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6863 ; X86-NEXT: kmovw %eax, %k1
6864 ; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6867 ; X64-LABEL: test_mm_mask_rolv_epi32:
6868 ; X64: # %bb.0: # %entry
6869 ; X64-NEXT: kmovw %edi, %k1
6870 ; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6873 %0 = bitcast <2 x i64> %__A to <4 x i32>
6874 %1 = bitcast <2 x i64> %__B to <4 x i32>
6875 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6876 %3 = bitcast <2 x i64> %__W to <4 x i32>
6877 %4 = bitcast i8 %__U to <8 x i1>
6878 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6879 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
6880 %6 = bitcast <4 x i32> %5 to <2 x i64>
6884 define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6885 ; X86-LABEL: test_mm_maskz_rolv_epi32:
6886 ; X86: # %bb.0: # %entry
6887 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6888 ; X86-NEXT: kmovw %eax, %k1
6889 ; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6892 ; X64-LABEL: test_mm_maskz_rolv_epi32:
6893 ; X64: # %bb.0: # %entry
6894 ; X64-NEXT: kmovw %edi, %k1
6895 ; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6898 %0 = bitcast <2 x i64> %__A to <4 x i32>
6899 %1 = bitcast <2 x i64> %__B to <4 x i32>
6900 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6901 %3 = bitcast i8 %__U to <8 x i1>
6902 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6903 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
6904 %5 = bitcast <4 x i32> %4 to <2 x i64>
6908 define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
6909 ; CHECK-LABEL: test_mm256_rolv_epi32:
6910 ; CHECK: # %bb.0: # %entry
6911 ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
6912 ; CHECK-NEXT: ret{{[l|q]}}
6914 %0 = bitcast <4 x i64> %__A to <8 x i32>
6915 %1 = bitcast <4 x i64> %__B to <8 x i32>
6916 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
6917 %3 = bitcast <8 x i32> %2 to <4 x i64>
6921 define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
6922 ; X86-LABEL: test_mm256_mask_rolv_epi32:
6923 ; X86: # %bb.0: # %entry
6924 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6925 ; X86-NEXT: kmovw %eax, %k1
6926 ; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
6929 ; X64-LABEL: test_mm256_mask_rolv_epi32:
6930 ; X64: # %bb.0: # %entry
6931 ; X64-NEXT: kmovw %edi, %k1
6932 ; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
6935 %0 = bitcast <4 x i64> %__A to <8 x i32>
6936 %1 = bitcast <4 x i64> %__B to <8 x i32>
6937 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
6938 %3 = bitcast <4 x i64> %__W to <8 x i32>
6939 %4 = bitcast i8 %__U to <8 x i1>
6940 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
6941 %6 = bitcast <8 x i32> %5 to <4 x i64>
6945 define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
6946 ; X86-LABEL: test_mm256_maskz_rolv_epi32:
6947 ; X86: # %bb.0: # %entry
6948 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6949 ; X86-NEXT: kmovw %eax, %k1
6950 ; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
6953 ; X64-LABEL: test_mm256_maskz_rolv_epi32:
6954 ; X64: # %bb.0: # %entry
6955 ; X64-NEXT: kmovw %edi, %k1
6956 ; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
6959 %0 = bitcast <4 x i64> %__A to <8 x i32>
6960 %1 = bitcast <4 x i64> %__B to <8 x i32>
6961 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
6962 %3 = bitcast i8 %__U to <8 x i1>
6963 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
6964 %5 = bitcast <8 x i32> %4 to <4 x i64>
6968 define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
6969 ; CHECK-LABEL: test_mm_rolv_epi64:
6970 ; CHECK: # %bb.0: # %entry
6971 ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
6972 ; CHECK-NEXT: ret{{[l|q]}}
6974 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
6978 define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6979 ; X86-LABEL: test_mm_mask_rolv_epi64:
6980 ; X86: # %bb.0: # %entry
6981 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6982 ; X86-NEXT: kmovw %eax, %k1
6983 ; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
6986 ; X64-LABEL: test_mm_mask_rolv_epi64:
6987 ; X64: # %bb.0: # %entry
6988 ; X64-NEXT: kmovw %edi, %k1
6989 ; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
6992 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
6993 %1 = bitcast i8 %__U to <8 x i1>
6994 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6995 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
6999 define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7000 ; X86-LABEL: test_mm_maskz_rolv_epi64:
7001 ; X86: # %bb.0: # %entry
7002 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7003 ; X86-NEXT: kmovw %eax, %k1
7004 ; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7007 ; X64-LABEL: test_mm_maskz_rolv_epi64:
7008 ; X64: # %bb.0: # %entry
7009 ; X64-NEXT: kmovw %edi, %k1
7010 ; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7013 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7014 %1 = bitcast i8 %__U to <8 x i1>
7015 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7016 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
7020 define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
7021 ; CHECK-LABEL: test_mm256_rolv_epi64:
7022 ; CHECK: # %bb.0: # %entry
7023 ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
7024 ; CHECK-NEXT: ret{{[l|q]}}
7026 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7030 define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7031 ; X86-LABEL: test_mm256_mask_rolv_epi64:
7032 ; X86: # %bb.0: # %entry
7033 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7034 ; X86-NEXT: kmovw %eax, %k1
7035 ; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7038 ; X64-LABEL: test_mm256_mask_rolv_epi64:
7039 ; X64: # %bb.0: # %entry
7040 ; X64-NEXT: kmovw %edi, %k1
7041 ; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7044 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7045 %1 = bitcast i8 %__U to <8 x i1>
7046 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7047 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
7051 define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7052 ; X86-LABEL: test_mm256_maskz_rolv_epi64:
7053 ; X86: # %bb.0: # %entry
7054 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7055 ; X86-NEXT: kmovw %eax, %k1
7056 ; X86-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7059 ; X64-LABEL: test_mm256_maskz_rolv_epi64:
7060 ; X64: # %bb.0: # %entry
7061 ; X64-NEXT: kmovw %edi, %k1
7062 ; X64-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7065 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7066 %1 = bitcast i8 %__U to <8 x i1>
7067 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7068 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
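; NOTE: The ror tests below mirror the pattern with @llvm.fshr.*:
; ror(x, n) == fshr(x, x, n), matched to vprord/vprorq for immediate counts.
; Illustrative identity with made-up value names, not a checked test:
;   %ror5 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 5, i64 5>)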
7072 define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
7073 ; CHECK-LABEL: test_mm_ror_epi32:
7074 ; CHECK: # %bb.0: # %entry
7075 ; CHECK-NEXT: vprord $5, %xmm0, %xmm0
7076 ; CHECK-NEXT: ret{{[l|q]}}
7078 %0 = bitcast <2 x i64> %__A to <4 x i32>
7079 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7080 %2 = bitcast <4 x i32> %1 to <2 x i64>
7084 define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7085 ; X86-LABEL: test_mm_mask_ror_epi32:
7086 ; X86: # %bb.0: # %entry
7087 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7088 ; X86-NEXT: kmovw %eax, %k1
7089 ; X86-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
7092 ; X64-LABEL: test_mm_mask_ror_epi32:
7093 ; X64: # %bb.0: # %entry
7094 ; X64-NEXT: kmovw %edi, %k1
7095 ; X64-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
7098 %0 = bitcast <2 x i64> %__A to <4 x i32>
7099 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7100 %2 = bitcast <2 x i64> %__W to <4 x i32>
7101 %3 = bitcast i8 %__U to <8 x i1>
7102 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7103 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
7104 %5 = bitcast <4 x i32> %4 to <2 x i64>
7108 define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
7109 ; X86-LABEL: test_mm_maskz_ror_epi32:
7110 ; X86: # %bb.0: # %entry
7111 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7112 ; X86-NEXT: kmovw %eax, %k1
7113 ; X86-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
7116 ; X64-LABEL: test_mm_maskz_ror_epi32:
7117 ; X64: # %bb.0: # %entry
7118 ; X64-NEXT: kmovw %edi, %k1
7119 ; X64-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
7122 %0 = bitcast <2 x i64> %__A to <4 x i32>
7123 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7124 %2 = bitcast i8 %__U to <8 x i1>
7125 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7126 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
7127 %4 = bitcast <4 x i32> %3 to <2 x i64>
7131 define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
7132 ; CHECK-LABEL: test_mm256_ror_epi32:
7133 ; CHECK: # %bb.0: # %entry
7134 ; CHECK-NEXT: vprord $5, %ymm0, %ymm0
7135 ; CHECK-NEXT: ret{{[l|q]}}
7137 %0 = bitcast <4 x i64> %__A to <8 x i32>
7138 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7139 %2 = bitcast <8 x i32> %1 to <4 x i64>
7143 define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7144 ; X86-LABEL: test_mm256_mask_ror_epi32:
7145 ; X86: # %bb.0: # %entry
7146 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7147 ; X86-NEXT: kmovw %eax, %k1
7148 ; X86-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
7151 ; X64-LABEL: test_mm256_mask_ror_epi32:
7152 ; X64: # %bb.0: # %entry
7153 ; X64-NEXT: kmovw %edi, %k1
7154 ; X64-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
7157 %0 = bitcast <4 x i64> %__A to <8 x i32>
7158 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7159 %2 = bitcast <4 x i64> %__W to <8 x i32>
7160 %3 = bitcast i8 %__U to <8 x i1>
7161 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
7162 %5 = bitcast <8 x i32> %4 to <4 x i64>
7166 define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
7167 ; X86-LABEL: test_mm256_maskz_ror_epi32:
7168 ; X86: # %bb.0: # %entry
7169 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7170 ; X86-NEXT: kmovw %eax, %k1
7171 ; X86-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
7174 ; X64-LABEL: test_mm256_maskz_ror_epi32:
7175 ; X64: # %bb.0: # %entry
7176 ; X64-NEXT: kmovw %edi, %k1
7177 ; X64-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
7180 %0 = bitcast <4 x i64> %__A to <8 x i32>
7181 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7182 %2 = bitcast i8 %__U to <8 x i1>
7183 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
7184 %4 = bitcast <8 x i32> %3 to <4 x i64>
7188 define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
7189 ; CHECK-LABEL: test_mm_ror_epi64:
7190 ; CHECK: # %bb.0: # %entry
7191 ; CHECK-NEXT: vprorq $5, %xmm0, %xmm0
7192 ; CHECK-NEXT: ret{{[l|q]}}
7194 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7198 define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7199 ; X86-LABEL: test_mm_mask_ror_epi64:
7200 ; X86: # %bb.0: # %entry
7201 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7202 ; X86-NEXT: kmovw %eax, %k1
7203 ; X86-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
7206 ; X64-LABEL: test_mm_mask_ror_epi64:
7207 ; X64: # %bb.0: # %entry
7208 ; X64-NEXT: kmovw %edi, %k1
7209 ; X64-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
7212 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7213 %1 = bitcast i8 %__U to <8 x i1>
7214 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7215 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
7219 define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
7220 ; X86-LABEL: test_mm_maskz_ror_epi64:
7221 ; X86: # %bb.0: # %entry
7222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7223 ; X86-NEXT: kmovw %eax, %k1
7224 ; X86-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
7227 ; X64-LABEL: test_mm_maskz_ror_epi64:
7228 ; X64: # %bb.0: # %entry
7229 ; X64-NEXT: kmovw %edi, %k1
7230 ; X64-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
7233 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7234 %1 = bitcast i8 %__U to <8 x i1>
7235 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7236 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
7240 define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
7241 ; CHECK-LABEL: test_mm256_ror_epi64:
7242 ; CHECK: # %bb.0: # %entry
7243 ; CHECK-NEXT: vprorq $5, %ymm0, %ymm0
7244 ; CHECK-NEXT: ret{{[l|q]}}
7246 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7250 define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7251 ; X86-LABEL: test_mm256_mask_ror_epi64:
7252 ; X86: # %bb.0: # %entry
7253 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7254 ; X86-NEXT: kmovw %eax, %k1
7255 ; X86-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
7258 ; X64-LABEL: test_mm256_mask_ror_epi64:
7259 ; X64: # %bb.0: # %entry
7260 ; X64-NEXT: kmovw %edi, %k1
7261 ; X64-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
7264 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7265 %1 = bitcast i8 %__U to <8 x i1>
7266 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7267 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
7271 define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
7272 ; X86-LABEL: test_mm256_maskz_ror_epi64:
7273 ; X86: # %bb.0: # %entry
7274 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7275 ; X86-NEXT: kmovw %eax, %k1
7276 ; X86-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
7279 ; X64-LABEL: test_mm256_maskz_ror_epi64:
7280 ; X64: # %bb.0: # %entry
7281 ; X64-NEXT: kmovw %edi, %k1
7282 ; X64-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
7285 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7286 %1 = bitcast i8 %__U to <8 x i1>
7287 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7288 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
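; NOTE: The remaining rorv tests use per-lane counts from a vector operand,
; selecting vprorvd/vprorvq via the same fshr-with-equal-operands rotate
; idiom.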
7292 define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
7293 ; CHECK-LABEL: test_mm_rorv_epi32:
7294 ; CHECK: # %bb.0: # %entry
7295 ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
7296 ; CHECK-NEXT: ret{{[l|q]}}
7298 %0 = bitcast <2 x i64> %__A to <4 x i32>
7299 %1 = bitcast <2 x i64> %__B to <4 x i32>
7300 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7301 %3 = bitcast <4 x i32> %2 to <2 x i64>
7305 define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7306 ; X86-LABEL: test_mm_mask_rorv_epi32:
7307 ; X86: # %bb.0: # %entry
7308 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7309 ; X86-NEXT: kmovw %eax, %k1
7310 ; X86-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7313 ; X64-LABEL: test_mm_mask_rorv_epi32:
7314 ; X64: # %bb.0: # %entry
7315 ; X64-NEXT: kmovw %edi, %k1
7316 ; X64-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7319 %0 = bitcast <2 x i64> %__A to <4 x i32>
7320 %1 = bitcast <2 x i64> %__B to <4 x i32>
7321 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7322 %3 = bitcast <2 x i64> %__W to <4 x i32>
7323 %4 = bitcast i8 %__U to <8 x i1>
7324 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7325 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
7326 %6 = bitcast <4 x i32> %5 to <2 x i64>
7330 define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7331 ; X86-LABEL: test_mm_maskz_rorv_epi32:
7332 ; X86: # %bb.0: # %entry
7333 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7334 ; X86-NEXT: kmovw %eax, %k1
7335 ; X86-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7338 ; X64-LABEL: test_mm_maskz_rorv_epi32:
7339 ; X64: # %bb.0: # %entry
7340 ; X64-NEXT: kmovw %edi, %k1
7341 ; X64-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7344 %0 = bitcast <2 x i64> %__A to <4 x i32>
7345 %1 = bitcast <2 x i64> %__B to <4 x i32>
7346 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7347 %3 = bitcast i8 %__U to <8 x i1>
7348 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7349 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
7350 %5 = bitcast <4 x i32> %4 to <2 x i64>
7354 define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
7355 ; CHECK-LABEL: test_mm256_rorv_epi32:
7356 ; CHECK: # %bb.0: # %entry
7357 ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
7358 ; CHECK-NEXT: ret{{[l|q]}}
7360 %0 = bitcast <4 x i64> %__A to <8 x i32>
7361 %1 = bitcast <4 x i64> %__B to <8 x i32>
7362 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7363 %3 = bitcast <8 x i32> %2 to <4 x i64>
7367 define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7368 ; X86-LABEL: test_mm256_mask_rorv_epi32:
7369 ; X86: # %bb.0: # %entry
7370 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7371 ; X86-NEXT: kmovw %eax, %k1
7372 ; X86-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7375 ; X64-LABEL: test_mm256_mask_rorv_epi32:
7376 ; X64: # %bb.0: # %entry
7377 ; X64-NEXT: kmovw %edi, %k1
7378 ; X64-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7381 %0 = bitcast <4 x i64> %__A to <8 x i32>
7382 %1 = bitcast <4 x i64> %__B to <8 x i32>
7383 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7384 %3 = bitcast <4 x i64> %__W to <8 x i32>
7385 %4 = bitcast i8 %__U to <8 x i1>
7386 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7387 %6 = bitcast <8 x i32> %5 to <4 x i64>
define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast <4 x i64> %__B to <8 x i32>
%2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
%5 = bitcast <8 x i32> %4 to <4 x i64>
ret <4 x i64> %5
}

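; The 64-bit element forms call @llvm.fshr on <2 x i64>/<4 x i64> directly,
; so no bitcasts are needed; only the low two (or four) mask bits are used.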
define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
ret <2 x i64> %0
}

define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
ret <2 x i64> %2
}

define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
ret <4 x i64> %0
}

define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
ret <4 x i64> %2
}

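; Declarations for the intrinsics referenced in this file. Note that the
; rotate tests rely on the generic @llvm.fshl/@llvm.fshr funnel-shift
; intrinsics rather than target-specific rotate intrinsics.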
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)