; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
; Merge-masked sitofp <4 x i32> -> <4 x float> (vcvtdq2ps {%k1}), low 4 mask bits select into %__W.
define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}
; Zero-masked sitofp <4 x i32> -> <4 x float> (vcvtdq2ps {%k1} {z}).
define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = sitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}
; Merge-masked sitofp <8 x i32> -> <8 x float> (256-bit vcvtdq2ps {%k1}), full 8-bit mask.
define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}
; Zero-masked sitofp <8 x i32> -> <8 x float> (256-bit vcvtdq2ps {%k1} {z}).
define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepi32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepi32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = sitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}
; Merge-masked vcvtpd2dq via the avx512.mask intrinsic (mask handled inside the call).
define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked vcvtpd2dq: passthrough operand is zeroinitializer.
define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked 256->128-bit vcvtpd2dq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
; Zero-masked 256->128-bit vcvtpd2dq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
; Merge-masked vcvtpd2ps via the avx512.mask intrinsic.
define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8
  ret <4 x float> %0
}
; Zero-masked vcvtpd2ps via the avx512.mask intrinsic.
define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8
  ret <4 x float> %0
}
; Merge-masked 256->128-bit vcvtpd2ps; vzeroupper follows the ymm use.
define <4 x float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
  ret <4 x float> %2
}
; Zero-masked 256->128-bit vcvtpd2ps; vzeroupper follows the ymm use.
define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
; Unmasked vcvtpd2udq expressed through the mask intrinsic with an all-ones (-1) mask.
define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvtpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked vcvtpd2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked vcvtpd2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Unmasked 256->128-bit vcvtpd2udq with all-ones mask; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvtpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked 256->128-bit vcvtpd2udq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked 256->128-bit vcvtpd2udq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvtpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked fpext of the low 4 halfs to <4 x float> (vcvtph2ps {%k1}).
define <4 x float> @test_mm_mask_cvtph_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtph_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtph_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = fpext <4 x half> %2 to <4 x float>
  %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> %__W
  ret <4 x float> %6
}
; Zero-masked fpext of the low 4 halfs to <4 x float> (vcvtph2ps {%k1} {z}).
define <4 x float> @test_mm_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtph_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtph_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i16> %1 to <4 x half>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = fpext <4 x half> %2 to <4 x float>
  %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> zeroinitializer
  ret <4 x float> %6
}
; Merge-masked fpext <8 x half> -> <8 x float> (128->256-bit vcvtph2ps {%k1}).
define <8 x float> @test_mm256_mask_cvtph_ps(<8 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtph_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtph_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <8 x i16> %0 to <8 x half>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = fpext <8 x half> %1 to <8 x float>
  %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> %__W
  ret <8 x float> %4
}
; Zero-masked fpext <8 x half> -> <8 x float> (128->256-bit vcvtph2ps {%k1} {z}).
define <8 x float> @test_mm256_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtph_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtph_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <8 x i16>
  %1 = bitcast <8 x i16> %0 to <8 x half>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = fpext <8 x half> %1 to <8 x float>
  %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %4
}
; Merge-masked vcvtps2dq: unmasked SSE2 intrinsic plus an explicit 4-bit select.
define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
; Zero-masked vcvtps2dq: unmasked SSE2 intrinsic plus select against zero.
define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
; Merge-masked 256-bit vcvtps2dq: AVX intrinsic plus a full 8-bit select.
define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}
; Zero-masked 256-bit vcvtps2dq: AVX intrinsic plus select against zero.
define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}
; Merge-masked fpext of the low 2 floats to <2 x double> (vcvtps2pd {%k1}).
define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %1
}
; Zero-masked fpext of the low 2 floats to <2 x double> (vcvtps2pd {%k1} {z}).
define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}
; Merge-masked fpext <4 x float> -> <4 x double> (128->256-bit vcvtps2pd {%k1}).
define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %1
}
; Zero-masked fpext <4 x float> -> <4 x double> (128->256-bit vcvtps2pd {%k1} {z}).
define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %conv.i.i = fpext <4 x float> %__A to <4 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}
; Unmasked vcvtps2udq via the mask intrinsic with an all-ones (-1) mask.
define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvtps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked vcvtps2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked vcvtps2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Unmasked 256-bit vcvtps2udq via the mask intrinsic with an all-ones (-1) mask.
define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvtps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}
; Merge-masked 256-bit vcvtps2udq via the avx512.mask intrinsic.
define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}
; Zero-masked 256-bit vcvtps2udq via the avx512.mask intrinsic.
define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvtps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}
; Merge-masked truncating vcvttpd2dq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked truncating vcvttpd2dq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked 256->128-bit vcvttpd2dq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
; Zero-masked 256->128-bit vcvttpd2dq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
; Unmasked truncating vcvttpd2udq via the mask intrinsic with an all-ones (-1) mask.
define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) {
; CHECK-LABEL: test_mm_cvttpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked truncating vcvttpd2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked truncating vcvttpd2udq via the avx512.mask intrinsic.
define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Unmasked 256->128-bit vcvttpd2udq with all-ones mask; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) {
; CHECK-LABEL: test_mm256_cvttpd_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked 256->128-bit vcvttpd2udq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}
; Zero-masked 256->128-bit vcvttpd2udq; vzeroupper follows the ymm use.
define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_cvttpd_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttpd_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}
; Merge-masked truncating vcvttps2dq: SSE2 intrinsic plus an explicit 4-bit select.
define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast <2 x i64> %__W to <4 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}
; Zero-masked truncating vcvttps2dq: SSE2 intrinsic plus select against zero.
define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}
972 define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
973 ; X86-LABEL: test_mm256_mask_cvttps_epi32:
974 ; X86: # %bb.0: # %entry
975 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
976 ; X86-NEXT: kmovw %eax, %k1
977 ; X86-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
980 ; X64-LABEL: test_mm256_mask_cvttps_epi32:
981 ; X64: # %bb.0: # %entry
982 ; X64-NEXT: kmovw %edi, %k1
983 ; X64-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
986 %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
987 %1 = bitcast <4 x i64> %__W to <8 x i32>
988 %2 = bitcast i8 %__U to <8 x i1>
989 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
990 %4 = bitcast <8 x i32> %3 to <4 x i64>
994 define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
995 ; X86-LABEL: test_mm256_maskz_cvttps_epi32:
996 ; X86: # %bb.0: # %entry
997 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
998 ; X86-NEXT: kmovw %eax, %k1
999 ; X86-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
1002 ; X64-LABEL: test_mm256_maskz_cvttps_epi32:
1003 ; X64: # %bb.0: # %entry
1004 ; X64-NEXT: kmovw %edi, %k1
1005 ; X64-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
1008 %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
1009 %1 = bitcast i8 %__U to <8 x i1>
1010 %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
1011 %3 = bitcast <8 x i32> %2 to <4 x i64>
1015 define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
1016 ; CHECK-LABEL: test_mm_cvttps_epu32:
1017 ; CHECK: # %bb.0: # %entry
1018 ; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
1019 ; CHECK-NEXT: ret{{[l|q]}}
1021 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
1022 %1 = bitcast <4 x i32> %0 to <2 x i64>
1026 define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
1027 ; X86-LABEL: test_mm_mask_cvttps_epu32:
1028 ; X86: # %bb.0: # %entry
1029 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1030 ; X86-NEXT: kmovw %eax, %k1
1031 ; X86-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
1034 ; X64-LABEL: test_mm_mask_cvttps_epu32:
1035 ; X64: # %bb.0: # %entry
1036 ; X64-NEXT: kmovw %edi, %k1
1037 ; X64-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
1040 %0 = bitcast <2 x i64> %__W to <4 x i32>
1041 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
1042 %2 = bitcast <4 x i32> %1 to <2 x i64>
1046 define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
1047 ; X86-LABEL: test_mm_maskz_cvttps_epu32:
1048 ; X86: # %bb.0: # %entry
1049 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1050 ; X86-NEXT: kmovw %eax, %k1
1051 ; X86-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
1054 ; X64-LABEL: test_mm_maskz_cvttps_epu32:
1055 ; X64: # %bb.0: # %entry
1056 ; X64-NEXT: kmovw %edi, %k1
1057 ; X64-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
1060 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
1061 %1 = bitcast <4 x i32> %0 to <2 x i64>
1065 define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
1066 ; CHECK-LABEL: test_mm256_cvttps_epu32:
1067 ; CHECK: # %bb.0: # %entry
1068 ; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
1069 ; CHECK-NEXT: ret{{[l|q]}}
1071 %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
1072 %1 = bitcast <8 x i32> %0 to <4 x i64>
1076 define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
1077 ; X86-LABEL: test_mm256_mask_cvttps_epu32:
1078 ; X86: # %bb.0: # %entry
1079 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1080 ; X86-NEXT: kmovw %eax, %k1
1081 ; X86-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
1084 ; X64-LABEL: test_mm256_mask_cvttps_epu32:
1085 ; X64: # %bb.0: # %entry
1086 ; X64-NEXT: kmovw %edi, %k1
1087 ; X64-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
1090 %0 = bitcast <4 x i64> %__W to <8 x i32>
1091 %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
1092 %2 = bitcast <8 x i32> %1 to <4 x i64>
1096 define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
1097 ; X86-LABEL: test_mm256_maskz_cvttps_epu32:
1098 ; X86: # %bb.0: # %entry
1099 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1100 ; X86-NEXT: kmovw %eax, %k1
1101 ; X86-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
1104 ; X64-LABEL: test_mm256_maskz_cvttps_epu32:
1105 ; X64: # %bb.0: # %entry
1106 ; X64-NEXT: kmovw %edi, %k1
1107 ; X64-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
1110 %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
1111 %1 = bitcast <8 x i32> %0 to <4 x i64>
1115 define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
1116 ; CHECK-LABEL: test_mm_cvtepu32_pd:
1117 ; CHECK: # %bb.0: # %entry
1118 ; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
1119 ; CHECK-NEXT: ret{{[l|q]}}
1121 %0 = bitcast <2 x i64> %__A to <4 x i32>
1122 %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1123 %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
1124 ret <2 x double> %conv.i
1127 define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1128 ; X86-LABEL: test_mm_mask_cvtepu32_pd:
1129 ; X86: # %bb.0: # %entry
1130 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1131 ; X86-NEXT: kmovw %eax, %k1
1132 ; X86-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
1135 ; X64-LABEL: test_mm_mask_cvtepu32_pd:
1136 ; X64: # %bb.0: # %entry
1137 ; X64-NEXT: kmovw %edi, %k1
1138 ; X64-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
1141 %0 = bitcast <2 x i64> %__A to <4 x i32>
1142 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1143 %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
1144 %1 = bitcast i8 %__U to <8 x i1>
1145 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1146 %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
1150 define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1151 ; X86-LABEL: test_mm_maskz_cvtepu32_pd:
1152 ; X86: # %bb.0: # %entry
1153 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1154 ; X86-NEXT: kmovw %eax, %k1
1155 ; X86-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
1158 ; X64-LABEL: test_mm_maskz_cvtepu32_pd:
1159 ; X64: # %bb.0: # %entry
1160 ; X64-NEXT: kmovw %edi, %k1
1161 ; X64-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
1164 %0 = bitcast <2 x i64> %__A to <4 x i32>
1165 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1166 %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
1167 %1 = bitcast i8 %__U to <8 x i1>
1168 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1169 %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
1173 define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
1174 ; CHECK-LABEL: test_mm256_cvtepu32_pd:
1175 ; CHECK: # %bb.0: # %entry
1176 ; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
1177 ; CHECK-NEXT: ret{{[l|q]}}
1179 %0 = bitcast <2 x i64> %__A to <4 x i32>
1180 %conv.i = uitofp <4 x i32> %0 to <4 x double>
1181 ret <4 x double> %conv.i
1184 define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1185 ; X86-LABEL: test_mm256_mask_cvtepu32_pd:
1186 ; X86: # %bb.0: # %entry
1187 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1188 ; X86-NEXT: kmovw %eax, %k1
1189 ; X86-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
1192 ; X64-LABEL: test_mm256_mask_cvtepu32_pd:
1193 ; X64: # %bb.0: # %entry
1194 ; X64-NEXT: kmovw %edi, %k1
1195 ; X64-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
1198 %0 = bitcast <2 x i64> %__A to <4 x i32>
1199 %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
1200 %1 = bitcast i8 %__U to <8 x i1>
1201 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1202 %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
1206 define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
1207 ; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
1208 ; X86: # %bb.0: # %entry
1209 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1210 ; X86-NEXT: kmovw %eax, %k1
1211 ; X86-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
1214 ; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
1215 ; X64: # %bb.0: # %entry
1216 ; X64-NEXT: kmovw %edi, %k1
1217 ; X64-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
1220 %0 = bitcast <2 x i64> %__A to <4 x i32>
1221 %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
1222 %1 = bitcast i8 %__U to <8 x i1>
1223 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1224 %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
1228 define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
1229 ; CHECK-LABEL: test_mm_cvtepu32_ps:
1230 ; CHECK: # %bb.0: # %entry
1231 ; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
1232 ; CHECK-NEXT: ret{{[l|q]}}
1234 %0 = bitcast <2 x i64> %__A to <4 x i32>
1235 %conv.i = uitofp <4 x i32> %0 to <4 x float>
1236 ret <4 x float> %conv.i
1239 define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
1240 ; X86-LABEL: test_mm_mask_cvtepu32_ps:
1241 ; X86: # %bb.0: # %entry
1242 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1243 ; X86-NEXT: kmovw %eax, %k1
1244 ; X86-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
1247 ; X64-LABEL: test_mm_mask_cvtepu32_ps:
1248 ; X64: # %bb.0: # %entry
1249 ; X64-NEXT: kmovw %edi, %k1
1250 ; X64-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
1253 %0 = bitcast <2 x i64> %__A to <4 x i32>
1254 %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
1255 %1 = bitcast i8 %__U to <8 x i1>
1256 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1257 %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
1261 define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
1262 ; X86-LABEL: test_mm_maskz_cvtepu32_ps:
1263 ; X86: # %bb.0: # %entry
1264 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1265 ; X86-NEXT: kmovw %eax, %k1
1266 ; X86-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
1269 ; X64-LABEL: test_mm_maskz_cvtepu32_ps:
1270 ; X64: # %bb.0: # %entry
1271 ; X64-NEXT: kmovw %edi, %k1
1272 ; X64-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
1275 %0 = bitcast <2 x i64> %__A to <4 x i32>
1276 %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
1277 %1 = bitcast i8 %__U to <8 x i1>
1278 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1279 %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
1283 define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
1284 ; CHECK-LABEL: test_mm256_cvtepu32_ps:
1285 ; CHECK: # %bb.0: # %entry
1286 ; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
1287 ; CHECK-NEXT: ret{{[l|q]}}
1289 %0 = bitcast <4 x i64> %__A to <8 x i32>
1290 %conv.i = uitofp <8 x i32> %0 to <8 x float>
1291 ret <8 x float> %conv.i
1294 define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
1295 ; X86-LABEL: test_mm256_mask_cvtepu32_ps:
1296 ; X86: # %bb.0: # %entry
1297 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1298 ; X86-NEXT: kmovw %eax, %k1
1299 ; X86-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
1302 ; X64-LABEL: test_mm256_mask_cvtepu32_ps:
1303 ; X64: # %bb.0: # %entry
1304 ; X64-NEXT: kmovw %edi, %k1
1305 ; X64-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
1308 %0 = bitcast <4 x i64> %__A to <8 x i32>
1309 %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
1310 %1 = bitcast i8 %__U to <8 x i1>
1311 %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
1315 define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
1316 ; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
1317 ; X86: # %bb.0: # %entry
1318 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1319 ; X86-NEXT: kmovw %eax, %k1
1320 ; X86-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
1323 ; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
1324 ; X64: # %bb.0: # %entry
1325 ; X64-NEXT: kmovw %edi, %k1
1326 ; X64-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
1329 %0 = bitcast <4 x i64> %__A to <8 x i32>
1330 %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
1331 %1 = bitcast i8 %__U to <8 x i1>
1332 %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
1336 define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
1337 ; CHECK-LABEL: test_mm256_shuffle_f32x4:
1338 ; CHECK: # %bb.0: # %entry
1339 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1340 ; CHECK-NEXT: ret{{[l|q]}}
1342 %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1343 ret <8 x float> %shuffle
1346 define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
1347 ; X86-LABEL: test_mm256_mask_shuffle_f32x4:
1348 ; X86: # %bb.0: # %entry
1349 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1350 ; X86-NEXT: kmovw %eax, %k1
1351 ; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1354 ; X64-LABEL: test_mm256_mask_shuffle_f32x4:
1355 ; X64: # %bb.0: # %entry
1356 ; X64-NEXT: kmovw %edi, %k1
1357 ; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1360 %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1361 %0 = bitcast i8 %__U to <8 x i1>
1362 %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
1366 define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
1367 ; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
1368 ; X86: # %bb.0: # %entry
1369 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1370 ; X86-NEXT: kmovw %eax, %k1
1371 ; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1374 ; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
1375 ; X64: # %bb.0: # %entry
1376 ; X64-NEXT: kmovw %edi, %k1
1377 ; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1380 %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
1381 %0 = bitcast i8 %__U to <8 x i1>
1382 %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
1386 define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
1387 ; CHECK-LABEL: test_mm256_shuffle_f64x2:
1388 ; CHECK: # %bb.0: # %entry
1389 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1390 ; CHECK-NEXT: ret{{[l|q]}}
1392 %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1393 ret <4 x double> %shuffle
1396 define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
1397 ; X86-LABEL: test_mm256_mask_shuffle_f64x2:
1398 ; X86: # %bb.0: # %entry
1399 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1400 ; X86-NEXT: kmovw %eax, %k1
1401 ; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1404 ; X64-LABEL: test_mm256_mask_shuffle_f64x2:
1405 ; X64: # %bb.0: # %entry
1406 ; X64-NEXT: kmovw %edi, %k1
1407 ; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1410 %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1411 %0 = bitcast i8 %__U to <8 x i1>
1412 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1413 %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
1417 define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
1418 ; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
1419 ; X86: # %bb.0: # %entry
1420 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1421 ; X86-NEXT: kmovw %eax, %k1
1422 ; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1425 ; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
1426 ; X64: # %bb.0: # %entry
1427 ; X64-NEXT: kmovw %edi, %k1
1428 ; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1431 %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1432 %0 = bitcast i8 %__U to <8 x i1>
1433 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1434 %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
1438 define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
1439 ; CHECK-LABEL: test_mm256_shuffle_i32x4:
1440 ; CHECK: # %bb.0: # %entry
1441 ; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1442 ; CHECK-NEXT: ret{{[l|q]}}
1444 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1445 ret <4 x i64> %shuffle
1448 define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1449 ; X86-LABEL: test_mm256_mask_shuffle_i32x4:
1450 ; X86: # %bb.0: # %entry
1451 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1452 ; X86-NEXT: kmovw %eax, %k1
1453 ; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1456 ; X64-LABEL: test_mm256_mask_shuffle_i32x4:
1457 ; X64: # %bb.0: # %entry
1458 ; X64-NEXT: kmovw %edi, %k1
1459 ; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
1462 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1463 %0 = bitcast <4 x i64> %shuffle to <8 x i32>
1464 %1 = bitcast <4 x i64> %__W to <8 x i32>
1465 %2 = bitcast i8 %__U to <8 x i1>
1466 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
1467 %4 = bitcast <8 x i32> %3 to <4 x i64>
1471 define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1472 ; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
1473 ; X86: # %bb.0: # %entry
1474 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1475 ; X86-NEXT: kmovw %eax, %k1
1476 ; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1479 ; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
1480 ; X64: # %bb.0: # %entry
1481 ; X64-NEXT: kmovw %edi, %k1
1482 ; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
1485 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1486 %0 = bitcast <4 x i64> %shuffle to <8 x i32>
1487 %1 = bitcast i8 %__U to <8 x i1>
1488 %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
1489 %3 = bitcast <8 x i32> %2 to <4 x i64>
1493 define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
1494 ; CHECK-LABEL: test_mm256_shuffle_i64x2:
1495 ; CHECK: # %bb.0: # %entry
1496 ; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1497 ; CHECK-NEXT: ret{{[l|q]}}
1499 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1500 ret <4 x i64> %shuffle
1503 define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1504 ; X86-LABEL: test_mm256_mask_shuffle_i64x2:
1505 ; X86: # %bb.0: # %entry
1506 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1507 ; X86-NEXT: kmovw %eax, %k1
1508 ; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1511 ; X64-LABEL: test_mm256_mask_shuffle_i64x2:
1512 ; X64: # %bb.0: # %entry
1513 ; X64-NEXT: kmovw %edi, %k1
1514 ; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
1517 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1518 %0 = bitcast i8 %__U to <8 x i1>
1519 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1520 %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
1524 define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1525 ; X86-LABEL: test_mm256_maskz_shuffle_i64x2:
1526 ; X86: # %bb.0: # %entry
1527 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1528 ; X86-NEXT: kmovw %eax, %k1
1529 ; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1532 ; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
1533 ; X64: # %bb.0: # %entry
1534 ; X64-NEXT: kmovw %edi, %k1
1535 ; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
1538 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1539 %0 = bitcast i8 %__U to <8 x i1>
1540 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1541 %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
1545 define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
1546 ; CHECK-LABEL: test_mm_test_epi32_mask:
1547 ; CHECK: # %bb.0: # %entry
1548 ; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k0
1549 ; CHECK-NEXT: kmovw %k0, %eax
1550 ; CHECK-NEXT: movzbl %al, %eax
1551 ; CHECK-NEXT: ret{{[l|q]}}
1553 %and.i.i = and <2 x i64> %__B, %__A
1554 %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1555 %1 = icmp ne <4 x i32> %0, zeroinitializer
1556 %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1557 %3 = bitcast <8 x i1> %2 to i8
1561 define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1562 ; X86-LABEL: test_mm_mask_test_epi32_mask:
1563 ; X86: # %bb.0: # %entry
1564 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1565 ; X86-NEXT: kmovw %eax, %k1
1566 ; X86-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
1567 ; X86-NEXT: kmovw %k0, %eax
1568 ; X86-NEXT: movzbl %al, %eax
1571 ; X64-LABEL: test_mm_mask_test_epi32_mask:
1572 ; X64: # %bb.0: # %entry
1573 ; X64-NEXT: kmovw %edi, %k1
1574 ; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
1575 ; X64-NEXT: kmovw %k0, %eax
1576 ; X64-NEXT: movzbl %al, %eax
1579 %and.i.i = and <2 x i64> %__B, %__A
1580 %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1581 %1 = icmp ne <4 x i32> %0, zeroinitializer
1582 %2 = bitcast i8 %__U to <8 x i1>
1583 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1584 %3 = and <4 x i1> %1, %extract.i
1585 %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1586 %5 = bitcast <8 x i1> %4 to i8
1590 define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
1591 ; CHECK-LABEL: test_mm256_test_epi32_mask:
1592 ; CHECK: # %bb.0: # %entry
1593 ; CHECK-NEXT: vptestmd %ymm0, %ymm1, %k0
1594 ; CHECK-NEXT: kmovw %k0, %eax
1595 ; CHECK-NEXT: movzbl %al, %eax
1596 ; CHECK-NEXT: vzeroupper
1597 ; CHECK-NEXT: ret{{[l|q]}}
1599 %and.i.i = and <4 x i64> %__B, %__A
1600 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1601 %1 = icmp ne <8 x i32> %0, zeroinitializer
1602 %2 = bitcast <8 x i1> %1 to i8
1606 define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1607 ; X86-LABEL: test_mm256_mask_test_epi32_mask:
1608 ; X86: # %bb.0: # %entry
1609 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT: kmovw %eax, %k1
1611 ; X86-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
1612 ; X86-NEXT: kmovw %k0, %eax
1613 ; X86-NEXT: movzbl %al, %eax
1614 ; X86-NEXT: vzeroupper
1617 ; X64-LABEL: test_mm256_mask_test_epi32_mask:
1618 ; X64: # %bb.0: # %entry
1619 ; X64-NEXT: kmovw %edi, %k1
1620 ; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
1621 ; X64-NEXT: kmovw %k0, %eax
1622 ; X64-NEXT: movzbl %al, %eax
1623 ; X64-NEXT: vzeroupper
1626 %and.i.i = and <4 x i64> %__B, %__A
1627 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1628 %1 = icmp ne <8 x i32> %0, zeroinitializer
1629 %2 = bitcast i8 %__U to <8 x i1>
1630 %3 = and <8 x i1> %1, %2
1631 %4 = bitcast <8 x i1> %3 to i8
1635 define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1636 ; CHECK-LABEL: test_mm_test_epi64_mask:
1637 ; CHECK: # %bb.0: # %entry
1638 ; CHECK-NEXT: vptestmq %xmm0, %xmm1, %k0
1639 ; CHECK-NEXT: kmovw %k0, %eax
1640 ; CHECK-NEXT: movzbl %al, %eax
1641 ; CHECK-NEXT: ret{{[l|q]}}
1643 %and.i.i = and <2 x i64> %__B, %__A
1644 %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
1645 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1646 %2 = bitcast <8 x i1> %1 to i8
1650 define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1651 ; X86-LABEL: test_mm_mask_test_epi64_mask:
1652 ; X86: # %bb.0: # %entry
1653 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1654 ; X86-NEXT: kmovw %eax, %k1
1655 ; X86-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
1656 ; X86-NEXT: kmovw %k0, %eax
1657 ; X86-NEXT: movzbl %al, %eax
1660 ; X64-LABEL: test_mm_mask_test_epi64_mask:
1661 ; X64: # %bb.0: # %entry
1662 ; X64-NEXT: kmovw %edi, %k1
1663 ; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
1664 ; X64-NEXT: kmovw %k0, %eax
1665 ; X64-NEXT: movzbl %al, %eax
1668 %and.i.i = and <2 x i64> %__B, %__A
1669 %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
1670 %1 = bitcast i8 %__U to <8 x i1>
1671 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1672 %2 = and <2 x i1> %0, %extract.i
1673 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1674 %4 = bitcast <8 x i1> %3 to i8
1678 define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1679 ; CHECK-LABEL: test_mm256_test_epi64_mask:
1680 ; CHECK: # %bb.0: # %entry
1681 ; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k0
1682 ; CHECK-NEXT: kmovw %k0, %eax
1683 ; CHECK-NEXT: movzbl %al, %eax
1684 ; CHECK-NEXT: vzeroupper
1685 ; CHECK-NEXT: ret{{[l|q]}}
1687 %and.i.i = and <4 x i64> %__B, %__A
1688 %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
1689 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1690 %2 = bitcast <8 x i1> %1 to i8
1694 define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1695 ; X86-LABEL: test_mm256_mask_test_epi64_mask:
1696 ; X86: # %bb.0: # %entry
1697 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1698 ; X86-NEXT: kmovw %eax, %k1
1699 ; X86-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
1700 ; X86-NEXT: kmovw %k0, %eax
1701 ; X86-NEXT: movzbl %al, %eax
1702 ; X86-NEXT: vzeroupper
1705 ; X64-LABEL: test_mm256_mask_test_epi64_mask:
1706 ; X64: # %bb.0: # %entry
1707 ; X64-NEXT: kmovw %edi, %k1
1708 ; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
1709 ; X64-NEXT: kmovw %k0, %eax
1710 ; X64-NEXT: movzbl %al, %eax
1711 ; X64-NEXT: vzeroupper
1714 %and.i.i = and <4 x i64> %__B, %__A
1715 %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
1716 %1 = bitcast i8 %__U to <8 x i1>
1717 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1718 %2 = and <4 x i1> %0, %extract.i
1719 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1720 %4 = bitcast <8 x i1> %3 to i8
1724 define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
1725 ; CHECK-LABEL: test_mm_testn_epi32_mask:
1726 ; CHECK: # %bb.0: # %entry
1727 ; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0
1728 ; CHECK-NEXT: kmovw %k0, %eax
1729 ; CHECK-NEXT: movzbl %al, %eax
1730 ; CHECK-NEXT: ret{{[l|q]}}
1732 %and.i.i = and <2 x i64> %__B, %__A
1733 %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1734 %1 = icmp eq <4 x i32> %0, zeroinitializer
1735 %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1736 %3 = bitcast <8 x i1> %2 to i8
1740 define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1741 ; X86-LABEL: test_mm_mask_testn_epi32_mask:
1742 ; X86: # %bb.0: # %entry
1743 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1744 ; X86-NEXT: kmovw %eax, %k1
1745 ; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
1746 ; X86-NEXT: kmovw %k0, %eax
1747 ; X86-NEXT: movzbl %al, %eax
1750 ; X64-LABEL: test_mm_mask_testn_epi32_mask:
1751 ; X64: # %bb.0: # %entry
1752 ; X64-NEXT: kmovw %edi, %k1
1753 ; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
1754 ; X64-NEXT: kmovw %k0, %eax
1755 ; X64-NEXT: movzbl %al, %eax
1758 %and.i.i = and <2 x i64> %__B, %__A
1759 %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
1760 %1 = icmp eq <4 x i32> %0, zeroinitializer
1761 %2 = bitcast i8 %__U to <8 x i1>
1762 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1763 %3 = and <4 x i1> %1, %extract.i
1764 %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1765 %5 = bitcast <8 x i1> %4 to i8
1769 define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
1770 ; CHECK-LABEL: test_mm256_testn_epi32_mask:
1771 ; CHECK: # %bb.0: # %entry
1772 ; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0
1773 ; CHECK-NEXT: kmovw %k0, %eax
1774 ; CHECK-NEXT: movzbl %al, %eax
1775 ; CHECK-NEXT: vzeroupper
1776 ; CHECK-NEXT: ret{{[l|q]}}
1778 %and.i.i = and <4 x i64> %__B, %__A
1779 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1780 %1 = icmp eq <8 x i32> %0, zeroinitializer
1781 %2 = bitcast <8 x i1> %1 to i8
1785 define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1786 ; X86-LABEL: test_mm256_mask_testn_epi32_mask:
1787 ; X86: # %bb.0: # %entry
1788 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1789 ; X86-NEXT: kmovw %eax, %k1
1790 ; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
1791 ; X86-NEXT: kmovw %k0, %eax
1792 ; X86-NEXT: movzbl %al, %eax
1793 ; X86-NEXT: vzeroupper
1796 ; X64-LABEL: test_mm256_mask_testn_epi32_mask:
1797 ; X64: # %bb.0: # %entry
1798 ; X64-NEXT: kmovw %edi, %k1
1799 ; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
1800 ; X64-NEXT: kmovw %k0, %eax
1801 ; X64-NEXT: movzbl %al, %eax
1802 ; X64-NEXT: vzeroupper
1805 %and.i.i = and <4 x i64> %__B, %__A
1806 %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
1807 %1 = icmp eq <8 x i32> %0, zeroinitializer
1808 %2 = bitcast i8 %__U to <8 x i1>
1809 %3 = and <8 x i1> %1, %2
1810 %4 = bitcast <8 x i1> %3 to i8
1814 define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
1815 ; CHECK-LABEL: test_mm_testn_epi64_mask:
1816 ; CHECK: # %bb.0: # %entry
1817 ; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0
1818 ; CHECK-NEXT: kmovw %k0, %eax
1819 ; CHECK-NEXT: movzbl %al, %eax
1820 ; CHECK-NEXT: ret{{[l|q]}}
1822 %and.i.i = and <2 x i64> %__B, %__A
1823 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1824 %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1825 %2 = bitcast <8 x i1> %1 to i8
1829 define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1830 ; X86-LABEL: test_mm_mask_testn_epi64_mask:
1831 ; X86: # %bb.0: # %entry
1832 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1833 ; X86-NEXT: kmovw %eax, %k1
1834 ; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1835 ; X86-NEXT: kmovw %k0, %eax
1836 ; X86-NEXT: movzbl %al, %eax
1839 ; X64-LABEL: test_mm_mask_testn_epi64_mask:
1840 ; X64: # %bb.0: # %entry
1841 ; X64-NEXT: kmovw %edi, %k1
1842 ; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
1843 ; X64-NEXT: kmovw %k0, %eax
1844 ; X64-NEXT: movzbl %al, %eax
1847 %and.i.i = and <2 x i64> %__B, %__A
1848 %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
1849 %1 = bitcast i8 %__U to <8 x i1>
1850 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1851 %2 = and <2 x i1> %0, %extract.i
1852 %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1853 %4 = bitcast <8 x i1> %3 to i8
1857 define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
1858 ; CHECK-LABEL: test_mm256_testn_epi64_mask:
1859 ; CHECK: # %bb.0: # %entry
1860 ; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0
1861 ; CHECK-NEXT: kmovw %k0, %eax
1862 ; CHECK-NEXT: movzbl %al, %eax
1863 ; CHECK-NEXT: vzeroupper
1864 ; CHECK-NEXT: ret{{[l|q]}}
1866 %and.i.i = and <4 x i64> %__B, %__A
1867 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1868 %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1869 %2 = bitcast <8 x i1> %1 to i8
1873 define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1874 ; X86-LABEL: test_mm256_mask_testn_epi64_mask:
1875 ; X86: # %bb.0: # %entry
1876 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1877 ; X86-NEXT: kmovw %eax, %k1
1878 ; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1879 ; X86-NEXT: kmovw %k0, %eax
1880 ; X86-NEXT: movzbl %al, %eax
1881 ; X86-NEXT: vzeroupper
1884 ; X64-LABEL: test_mm256_mask_testn_epi64_mask:
1885 ; X64: # %bb.0: # %entry
1886 ; X64-NEXT: kmovw %edi, %k1
1887 ; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
1888 ; X64-NEXT: kmovw %k0, %eax
1889 ; X64-NEXT: movzbl %al, %eax
1890 ; X64-NEXT: vzeroupper
1893 %and.i.i = and <4 x i64> %__B, %__A
1894 %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
1895 %1 = bitcast i8 %__U to <8 x i1>
1896 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1897 %2 = and <4 x i1> %0, %extract.i
1898 %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1899 %4 = bitcast <8 x i1> %3 to i8
1903 define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
1904 ; X86-LABEL: test_mm_mask_set1_epi32:
1905 ; X86: # %bb.0: # %entry
1906 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1907 ; X86-NEXT: kmovw %eax, %k1
1908 ; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1}
1911 ; X64-LABEL: test_mm_mask_set1_epi32:
1912 ; X64: # %bb.0: # %entry
1913 ; X64-NEXT: kmovw %edi, %k1
1914 ; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
1917 %0 = bitcast <2 x i64> %__O to <4 x i32>
1918 %1 = bitcast i8 %__M to <8 x i1>
1919 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1920 %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
1921 %3 = bitcast <4 x i32> %2 to <2 x i64>
1925 define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
1926 ; X86-LABEL: test_mm_maskz_set1_epi32:
1927 ; X86: # %bb.0: # %entry
1928 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1929 ; X86-NEXT: kmovw %eax, %k1
1930 ; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z}
1933 ; X64-LABEL: test_mm_maskz_set1_epi32:
1934 ; X64: # %bb.0: # %entry
1935 ; X64-NEXT: kmovw %edi, %k1
1936 ; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
1939 %0 = bitcast i8 %__M to <8 x i1>
1940 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1941 %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
1942 %2 = bitcast <4 x i32> %1 to <2 x i64>
1946 define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
1947 ; X86-LABEL: test_mm256_mask_set1_epi32:
1948 ; X86: # %bb.0: # %entry
1949 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1950 ; X86-NEXT: kmovw %eax, %k1
1951 ; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1}
1954 ; X64-LABEL: test_mm256_mask_set1_epi32:
1955 ; X64: # %bb.0: # %entry
1956 ; X64-NEXT: kmovw %edi, %k1
1957 ; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1}
1960 %0 = bitcast <4 x i64> %__O to <8 x i32>
1961 %1 = bitcast i8 %__M to <8 x i1>
1962 %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
1963 %3 = bitcast <8 x i32> %2 to <4 x i64>
1967 define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
1968 ; X86-LABEL: test_mm256_maskz_set1_epi32:
1969 ; X86: # %bb.0: # %entry
1970 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1971 ; X86-NEXT: kmovw %eax, %k1
1972 ; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1} {z}
1975 ; X64-LABEL: test_mm256_maskz_set1_epi32:
1976 ; X64: # %bb.0: # %entry
1977 ; X64-NEXT: kmovw %edi, %k1
1978 ; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
1981 %0 = bitcast i8 %__M to <8 x i1>
1982 %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
1983 %2 = bitcast <8 x i32> %1 to <4 x i64>
1987 define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
1988 ; X86-LABEL: test_mm_mask_set1_epi64:
1989 ; X86: # %bb.0: # %entry
1990 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1991 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1992 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1993 ; X86-NEXT: kmovw %eax, %k1
1994 ; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
1997 ; X64-LABEL: test_mm_mask_set1_epi64:
1998 ; X64: # %bb.0: # %entry
1999 ; X64-NEXT: kmovw %edi, %k1
2000 ; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1}
2003 %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
2004 %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
2005 %0 = bitcast i8 %__M to <8 x i1>
2006 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2007 %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
2011 define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
2012 ; X86-LABEL: test_mm_maskz_set1_epi64:
2013 ; X86: # %bb.0: # %entry
2014 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2015 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2016 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2017 ; X86-NEXT: kmovw %eax, %k1
2018 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2021 ; X64-LABEL: test_mm_maskz_set1_epi64:
2022 ; X64: # %bb.0: # %entry
2023 ; X64-NEXT: kmovw %edi, %k1
2024 ; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z}
2027 %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
2028 %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
2029 %0 = bitcast i8 %__M to <8 x i1>
2030 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2031 %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
2036 define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
2037 ; X86-LABEL: test_mm256_mask_set1_epi64:
2038 ; X86: # %bb.0: # %entry
2039 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2040 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2041 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2042 ; X86-NEXT: kmovw %eax, %k1
2043 ; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
2046 ; X64-LABEL: test_mm256_mask_set1_epi64:
2047 ; X64: # %bb.0: # %entry
2048 ; X64-NEXT: kmovw %edi, %k1
2049 ; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
2052 %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
2053 %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
2054 %0 = bitcast i8 %__M to <8 x i1>
2055 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2056 %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
2060 define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
2061 ; X86-LABEL: test_mm256_maskz_set1_epi64:
2062 ; X86: # %bb.0: # %entry
2063 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2064 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2065 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2066 ; X86-NEXT: kmovw %eax, %k1
2067 ; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2070 ; X64-LABEL: test_mm256_maskz_set1_epi64:
2071 ; X64: # %bb.0: # %entry
2072 ; X64-NEXT: kmovw %edi, %k1
2073 ; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
2076 %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
2077 %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
2078 %0 = bitcast i8 %__M to <8 x i1>
2079 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2080 %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
2084 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
2085 ; CHECK-LABEL: test_mm_broadcastd_epi32:
2087 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
2088 ; CHECK-NEXT: ret{{[l|q]}}
2089 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2090 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
2091 %res1 = bitcast <4 x i32> %res0 to <2 x i64>
2095 define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2096 ; X86-LABEL: test_mm_mask_broadcastd_epi32:
2097 ; X86: # %bb.0: # %entry
2098 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2099 ; X86-NEXT: kmovw %eax, %k1
2100 ; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
2103 ; X64-LABEL: test_mm_mask_broadcastd_epi32:
2104 ; X64: # %bb.0: # %entry
2105 ; X64-NEXT: kmovw %edi, %k1
2106 ; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
2109 %0 = bitcast <2 x i64> %__A to <4 x i32>
2110 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
2111 %1 = bitcast <2 x i64> %__O to <4 x i32>
2112 %2 = bitcast i8 %__M to <8 x i1>
2113 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2114 %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
2115 %4 = bitcast <4 x i32> %3 to <2 x i64>
2119 define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
2120 ; X86-LABEL: test_mm_maskz_broadcastd_epi32:
2121 ; X86: # %bb.0: # %entry
2122 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2123 ; X86-NEXT: kmovw %eax, %k1
2124 ; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
2127 ; X64-LABEL: test_mm_maskz_broadcastd_epi32:
2128 ; X64: # %bb.0: # %entry
2129 ; X64-NEXT: kmovw %edi, %k1
2130 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
2133 %0 = bitcast <2 x i64> %__A to <4 x i32>
2134 %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
2135 %1 = bitcast i8 %__M to <8 x i1>
2136 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2137 %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
2138 %3 = bitcast <4 x i32> %2 to <2 x i64>
2142 define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
2143 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
2145 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
2146 ; CHECK-NEXT: ret{{[l|q]}}
2147 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2148 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
2149 %res1 = bitcast <8 x i32> %res0 to <4 x i64>
2153 define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
2154 ; X86-LABEL: test_mm256_mask_broadcastd_epi32:
2156 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2157 ; X86-NEXT: kmovw %eax, %k1
2158 ; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
2161 ; X64-LABEL: test_mm256_mask_broadcastd_epi32:
2163 ; X64-NEXT: kmovw %edi, %k1
2164 ; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
2166 %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2167 %arg1 = bitcast i8 %a1 to <8 x i1>
2168 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
2169 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
2170 %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
2171 %res2 = bitcast <8 x i32> %res1 to <4 x i64>
2175 define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
2176 ; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
2178 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2179 ; X86-NEXT: kmovw %eax, %k1
2180 ; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
2183 ; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
2185 ; X64-NEXT: kmovw %edi, %k1
2186 ; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
2188 %arg0 = bitcast i8 %a0 to <8 x i1>
2189 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2190 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
2191 %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
2192 %res2 = bitcast <8 x i32> %res1 to <4 x i64>
2196 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
2197 ; CHECK-LABEL: test_mm_broadcastq_epi64:
2199 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2200 ; CHECK-NEXT: ret{{[l|q]}}
2201 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
2205 define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2206 ; X86-LABEL: test_mm_mask_broadcastq_epi64:
2207 ; X86: # %bb.0: # %entry
2208 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2209 ; X86-NEXT: kmovw %eax, %k1
2210 ; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
2213 ; X64-LABEL: test_mm_mask_broadcastq_epi64:
2214 ; X64: # %bb.0: # %entry
2215 ; X64-NEXT: kmovw %edi, %k1
2216 ; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
2219 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
2220 %0 = bitcast i8 %__M to <8 x i1>
2221 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2222 %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
2226 define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
2227 ; X86-LABEL: test_mm_maskz_broadcastq_epi64:
2228 ; X86: # %bb.0: # %entry
2229 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2230 ; X86-NEXT: kmovw %eax, %k1
2231 ; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2234 ; X64-LABEL: test_mm_maskz_broadcastq_epi64:
2235 ; X64: # %bb.0: # %entry
2236 ; X64-NEXT: kmovw %edi, %k1
2237 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
2240 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
2241 %0 = bitcast i8 %__M to <8 x i1>
2242 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2243 %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
2247 define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
2248 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
2250 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
2251 ; CHECK-NEXT: ret{{[l|q]}}
2252 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
2256 define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
2257 ; X86-LABEL: test_mm256_mask_broadcastq_epi64:
2258 ; X86: # %bb.0: # %entry
2259 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2260 ; X86-NEXT: kmovw %eax, %k1
2261 ; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
2264 ; X64-LABEL: test_mm256_mask_broadcastq_epi64:
2265 ; X64: # %bb.0: # %entry
2266 ; X64-NEXT: kmovw %edi, %k1
2267 ; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
2270 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
2271 %0 = bitcast i8 %__M to <8 x i1>
2272 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2273 %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
2277 define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
2278 ; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
2279 ; X86: # %bb.0: # %entry
2280 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2281 ; X86-NEXT: kmovw %eax, %k1
2282 ; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2285 ; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
2286 ; X64: # %bb.0: # %entry
2287 ; X64-NEXT: kmovw %edi, %k1
2288 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
2291 %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
2292 %0 = bitcast i8 %__M to <8 x i1>
2293 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2294 %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
2298 define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
2299 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
2301 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
2302 ; CHECK-NEXT: ret{{[l|q]}}
2303 %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
2304 ret <4 x double> %res
2307 define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
2308 ; X86-LABEL: test_mm256_mask_broadcastsd_pd:
2309 ; X86: # %bb.0: # %entry
2310 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2311 ; X86-NEXT: kmovw %eax, %k1
2312 ; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
2315 ; X64-LABEL: test_mm256_mask_broadcastsd_pd:
2316 ; X64: # %bb.0: # %entry
2317 ; X64-NEXT: kmovw %edi, %k1
2318 ; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
2321 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
2322 %0 = bitcast i8 %__M to <8 x i1>
2323 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2324 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
2328 define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
2329 ; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
2330 ; X86: # %bb.0: # %entry
2331 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2332 ; X86-NEXT: kmovw %eax, %k1
2333 ; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
2336 ; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
2337 ; X64: # %bb.0: # %entry
2338 ; X64-NEXT: kmovw %edi, %k1
2339 ; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
2342 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
2343 %0 = bitcast i8 %__M to <8 x i1>
2344 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2345 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
2349 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
2350 ; CHECK-LABEL: test_mm_broadcastss_ps:
2352 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
2353 ; CHECK-NEXT: ret{{[l|q]}}
2354 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
2355 ret <4 x float> %res
2358 define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
2359 ; X86-LABEL: test_mm_mask_broadcastss_ps:
2360 ; X86: # %bb.0: # %entry
2361 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2362 ; X86-NEXT: kmovw %eax, %k1
2363 ; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
2366 ; X64-LABEL: test_mm_mask_broadcastss_ps:
2367 ; X64: # %bb.0: # %entry
2368 ; X64-NEXT: kmovw %edi, %k1
2369 ; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
2372 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
2373 %0 = bitcast i8 %__M to <8 x i1>
2374 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2375 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
2379 define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
2380 ; X86-LABEL: test_mm_maskz_broadcastss_ps:
2381 ; X86: # %bb.0: # %entry
2382 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2383 ; X86-NEXT: kmovw %eax, %k1
2384 ; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
2387 ; X64-LABEL: test_mm_maskz_broadcastss_ps:
2388 ; X64: # %bb.0: # %entry
2389 ; X64-NEXT: kmovw %edi, %k1
2390 ; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
2393 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
2394 %0 = bitcast i8 %__M to <8 x i1>
2395 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2396 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2400 define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
2401 ; CHECK-LABEL: test_mm256_broadcastss_ps:
2403 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
2404 ; CHECK-NEXT: ret{{[l|q]}}
2405 %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
2406 ret <8 x float> %res
2409 define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
2410 ; X86-LABEL: test_mm256_mask_broadcastss_ps:
2412 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2413 ; X86-NEXT: kmovw %eax, %k1
2414 ; X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
2417 ; X64-LABEL: test_mm256_mask_broadcastss_ps:
2419 ; X64-NEXT: kmovw %edi, %k1
2420 ; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
2422 %arg1 = bitcast i8 %a1 to <8 x i1>
2423 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
2424 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2425 ret <8 x float> %res1
2428 define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
2429 ; X86-LABEL: test_mm256_maskz_broadcastss_ps:
2431 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2432 ; X86-NEXT: kmovw %eax, %k1
2433 ; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
2436 ; X64-LABEL: test_mm256_maskz_broadcastss_ps:
2438 ; X64-NEXT: kmovw %edi, %k1
2439 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
2441 %arg0 = bitcast i8 %a0 to <8 x i1>
2442 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
2443 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2444 ret <8 x float> %res1
2447 define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
2448 ; CHECK-LABEL: test_mm_movddup_pd:
2450 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2451 ; CHECK-NEXT: ret{{[l|q]}}
2452 %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
2453 ret <2 x double> %res
2456 define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
2457 ; X86-LABEL: test_mm_mask_movedup_pd:
2458 ; X86: # %bb.0: # %entry
2459 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2460 ; X86-NEXT: kmovw %eax, %k1
2461 ; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2464 ; X64-LABEL: test_mm_mask_movedup_pd:
2465 ; X64: # %bb.0: # %entry
2466 ; X64-NEXT: kmovw %edi, %k1
2467 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
2470 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2471 %0 = bitcast i8 %__U to <8 x i1>
2472 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2473 %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
2477 define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
2478 ; X86-LABEL: test_mm_maskz_movedup_pd:
2479 ; X86: # %bb.0: # %entry
2480 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2481 ; X86-NEXT: kmovw %eax, %k1
2482 ; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2485 ; X64-LABEL: test_mm_maskz_movedup_pd:
2486 ; X64: # %bb.0: # %entry
2487 ; X64-NEXT: kmovw %edi, %k1
2488 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
2491 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
2492 %0 = bitcast i8 %__U to <8 x i1>
2493 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2494 %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
2498 define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
2499 ; CHECK-LABEL: test_mm256_movddup_pd:
2501 ; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
2502 ; CHECK-NEXT: ret{{[l|q]}}
2503 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2504 ret <4 x double> %res
2507 define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
2508 ; X86-LABEL: test_mm256_mask_movedup_pd:
2509 ; X86: # %bb.0: # %entry
2510 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2511 ; X86-NEXT: kmovw %eax, %k1
2512 ; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2515 ; X64-LABEL: test_mm256_mask_movedup_pd:
2516 ; X64: # %bb.0: # %entry
2517 ; X64-NEXT: kmovw %edi, %k1
2518 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
2521 %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2522 %0 = bitcast i8 %__U to <8 x i1>
2523 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2524 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
2528 define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
2529 ; X86-LABEL: test_mm256_maskz_movedup_pd:
2530 ; X86: # %bb.0: # %entry
2531 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2532 ; X86-NEXT: kmovw %eax, %k1
2533 ; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2536 ; X64-LABEL: test_mm256_maskz_movedup_pd:
2537 ; X64: # %bb.0: # %entry
2538 ; X64-NEXT: kmovw %edi, %k1
2539 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
2542 %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2543 %0 = bitcast i8 %__U to <8 x i1>
2544 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2545 %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
2549 define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
2550 ; CHECK-LABEL: test_mm_movehdup_ps:
2552 ; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
2553 ; CHECK-NEXT: ret{{[l|q]}}
2554 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2555 ret <4 x float> %res
2558 define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2559 ; X86-LABEL: test_mm_mask_movehdup_ps:
2560 ; X86: # %bb.0: # %entry
2561 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2562 ; X86-NEXT: kmovw %eax, %k1
2563 ; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2566 ; X64-LABEL: test_mm_mask_movehdup_ps:
2567 ; X64: # %bb.0: # %entry
2568 ; X64-NEXT: kmovw %edi, %k1
2569 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
2572 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2573 %0 = bitcast i8 %__U to <8 x i1>
2574 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2575 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2579 define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) {
2580 ; X86-LABEL: test_mm_maskz_movehdup_ps:
2581 ; X86: # %bb.0: # %entry
2582 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2583 ; X86-NEXT: kmovw %eax, %k1
2584 ; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2587 ; X64-LABEL: test_mm_maskz_movehdup_ps:
2588 ; X64: # %bb.0: # %entry
2589 ; X64-NEXT: kmovw %edi, %k1
2590 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
2593 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
2594 %0 = bitcast i8 %__U to <8 x i1>
2595 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2596 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2600 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
2601 ; CHECK-LABEL: test_mm256_movehdup_ps:
2603 ; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
2604 ; CHECK-NEXT: ret{{[l|q]}}
2605 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2606 ret <8 x float> %res
2609 define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2610 ; X86-LABEL: test_mm256_mask_movehdup_ps:
2612 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2613 ; X86-NEXT: kmovw %eax, %k1
2614 ; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2617 ; X64-LABEL: test_mm256_mask_movehdup_ps:
2619 ; X64-NEXT: kmovw %edi, %k1
2620 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
2622 %arg1 = bitcast i8 %a1 to <8 x i1>
2623 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2624 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2625 ret <8 x float> %res1
2628 define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
2629 ; X86-LABEL: test_mm256_maskz_movehdup_ps:
2631 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2632 ; X86-NEXT: kmovw %eax, %k1
2633 ; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2636 ; X64-LABEL: test_mm256_maskz_movehdup_ps:
2638 ; X64-NEXT: kmovw %edi, %k1
2639 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
2641 %arg0 = bitcast i8 %a0 to <8 x i1>
2642 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
2643 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2644 ret <8 x float> %res1
2647 define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
2648 ; CHECK-LABEL: test_mm_moveldup_ps:
2650 ; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
2651 ; CHECK-NEXT: ret{{[l|q]}}
2652 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2653 ret <4 x float> %res
2656 define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
2657 ; X86-LABEL: test_mm_mask_moveldup_ps:
2658 ; X86: # %bb.0: # %entry
2659 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2660 ; X86-NEXT: kmovw %eax, %k1
2661 ; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2664 ; X64-LABEL: test_mm_mask_moveldup_ps:
2665 ; X64: # %bb.0: # %entry
2666 ; X64-NEXT: kmovw %edi, %k1
2667 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
2670 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2671 %0 = bitcast i8 %__U to <8 x i1>
2672 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2673 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W
2677 define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) {
2678 ; X86-LABEL: test_mm_maskz_moveldup_ps:
2679 ; X86: # %bb.0: # %entry
2680 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2681 ; X86-NEXT: kmovw %eax, %k1
2682 ; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2685 ; X64-LABEL: test_mm_maskz_moveldup_ps:
2686 ; X64: # %bb.0: # %entry
2687 ; X64-NEXT: kmovw %edi, %k1
2688 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
2691 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2692 %0 = bitcast i8 %__U to <8 x i1>
2693 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2694 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
2698 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
2699 ; CHECK-LABEL: test_mm256_moveldup_ps:
2701 ; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
2702 ; CHECK-NEXT: ret{{[l|q]}}
2703 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2704 ret <8 x float> %res
2707 define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
2708 ; X86-LABEL: test_mm256_mask_moveldup_ps:
2710 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2711 ; X86-NEXT: kmovw %eax, %k1
2712 ; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2715 ; X64-LABEL: test_mm256_mask_moveldup_ps:
2717 ; X64-NEXT: kmovw %edi, %k1
2718 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
2720 %arg1 = bitcast i8 %a1 to <8 x i1>
2721 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2722 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
2723 ret <8 x float> %res1
2726 define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
2727 ; X86-LABEL: test_mm256_maskz_moveldup_ps:
2729 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2730 ; X86-NEXT: kmovw %eax, %k1
2731 ; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2734 ; X64-LABEL: test_mm256_maskz_moveldup_ps:
2736 ; X64-NEXT: kmovw %edi, %k1
2737 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
2739 %arg0 = bitcast i8 %a0 to <8 x i1>
2740 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
2741 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
2742 ret <8 x float> %res1
2745 define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
2746 ; CHECK-LABEL: test_mm256_permutex_epi64:
2748 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2749 ; CHECK-NEXT: ret{{[l|q]}}
2750 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2754 define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) {
2755 ; X86-LABEL: test_mm256_mask_permutex_epi64:
2756 ; X86: # %bb.0: # %entry
2757 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2758 ; X86-NEXT: kmovw %eax, %k1
2759 ; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2762 ; X64-LABEL: test_mm256_mask_permutex_epi64:
2763 ; X64: # %bb.0: # %entry
2764 ; X64-NEXT: kmovw %edi, %k1
2765 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0]
2768 %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2769 %0 = bitcast i8 %__M to <8 x i1>
2770 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2771 %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W
2775 define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) {
2776 ; X86-LABEL: test_mm256_maskz_permutex_epi64:
2777 ; X86: # %bb.0: # %entry
2778 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2779 ; X86-NEXT: kmovw %eax, %k1
2780 ; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2783 ; X64-LABEL: test_mm256_maskz_permutex_epi64:
2784 ; X64: # %bb.0: # %entry
2785 ; X64-NEXT: kmovw %edi, %k1
2786 ; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
2789 %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2790 %0 = bitcast i8 %__M to <8 x i1>
2791 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2792 %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer
2796 define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
2797 ; CHECK-LABEL: test_mm256_permutex_pd:
2799 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
2800 ; CHECK-NEXT: ret{{[l|q]}}
2801 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
2802 ret <4 x double> %res
2805 define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) {
2806 ; X86-LABEL: test_mm256_mask_permutex_pd:
2807 ; X86: # %bb.0: # %entry
2808 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2809 ; X86-NEXT: kmovw %eax, %k1
2810 ; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2813 ; X64-LABEL: test_mm256_mask_permutex_pd:
2814 ; X64: # %bb.0: # %entry
2815 ; X64-NEXT: kmovw %edi, %k1
2816 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
2819 %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2820 %0 = bitcast i8 %__U to <8 x i1>
2821 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2822 %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W
2826 define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) {
2827 ; X86-LABEL: test_mm256_maskz_permutex_pd:
2828 ; X86: # %bb.0: # %entry
2829 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2830 ; X86-NEXT: kmovw %eax, %k1
2831 ; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2834 ; X64-LABEL: test_mm256_maskz_permutex_pd:
2835 ; X64: # %bb.0: # %entry
2836 ; X64-NEXT: kmovw %edi, %k1
2837 ; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
2840 %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
2841 %0 = bitcast i8 %__U to <8 x i1>
2842 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2843 %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer
2847 define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
2848 ; CHECK-LABEL: test_mm_shuffle_pd:
2850 ; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2851 ; CHECK-NEXT: ret{{[l|q]}}
2852 %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
2853 ret <2 x double> %res
2856 define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2857 ; X86-LABEL: test_mm_mask_shuffle_pd:
2858 ; X86: # %bb.0: # %entry
2859 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2860 ; X86-NEXT: kmovw %eax, %k1
2861 ; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2864 ; X64-LABEL: test_mm_mask_shuffle_pd:
2865 ; X64: # %bb.0: # %entry
2866 ; X64-NEXT: kmovw %edi, %k1
2867 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
2870 %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2871 %0 = bitcast i8 %__U to <8 x i1>
2872 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2873 %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W
2877 define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2878 ; X86-LABEL: test_mm_maskz_shuffle_pd:
2879 ; X86: # %bb.0: # %entry
2880 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2881 ; X86-NEXT: kmovw %eax, %k1
2882 ; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2885 ; X64-LABEL: test_mm_maskz_shuffle_pd:
2886 ; X64: # %bb.0: # %entry
2887 ; X64-NEXT: kmovw %edi, %k1
2888 ; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
2891 %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3>
2892 %0 = bitcast i8 %__U to <8 x i1>
2893 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
2894 %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer
2898 define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
2899 ; CHECK-LABEL: test_mm256_shuffle_pd:
2901 ; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2902 ; CHECK-NEXT: ret{{[l|q]}}
2903 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2904 ret <4 x double> %res
2907 define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2908 ; X86-LABEL: test_mm256_mask_shuffle_pd:
2909 ; X86: # %bb.0: # %entry
2910 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2911 ; X86-NEXT: kmovw %eax, %k1
2912 ; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2915 ; X64-LABEL: test_mm256_mask_shuffle_pd:
2916 ; X64: # %bb.0: # %entry
2917 ; X64-NEXT: kmovw %edi, %k1
2918 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
2921 %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2922 %0 = bitcast i8 %__U to <8 x i1>
2923 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2924 %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W
2928 define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
2929 ; X86-LABEL: test_mm256_maskz_shuffle_pd:
2930 ; X86: # %bb.0: # %entry
2931 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2932 ; X86-NEXT: kmovw %eax, %k1
2933 ; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2936 ; X64-LABEL: test_mm256_maskz_shuffle_pd:
2937 ; X64: # %bb.0: # %entry
2938 ; X64-NEXT: kmovw %edi, %k1
2939 ; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
2942 %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
2943 %0 = bitcast i8 %__U to <8 x i1>
2944 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2945 %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer
2949 define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
2950 ; CHECK-LABEL: test_mm_shuffle_ps:
2952 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
2953 ; CHECK-NEXT: ret{{[l|q]}}
2954 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2955 ret <4 x float> %res
2958 define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2959 ; X86-LABEL: test_mm_mask_shuffle_ps:
2960 ; X86: # %bb.0: # %entry
2961 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2962 ; X86-NEXT: kmovw %eax, %k1
2963 ; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2966 ; X64-LABEL: test_mm_mask_shuffle_ps:
2967 ; X64: # %bb.0: # %entry
2968 ; X64-NEXT: kmovw %edi, %k1
2969 ; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
2972 %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2973 %0 = bitcast i8 %__U to <8 x i1>
2974 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2975 %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W
2979 define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2980 ; X86-LABEL: test_mm_maskz_shuffle_ps:
2981 ; X86: # %bb.0: # %entry
2982 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2983 ; X86-NEXT: kmovw %eax, %k1
2984 ; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2987 ; X64-LABEL: test_mm_maskz_shuffle_ps:
2988 ; X64: # %bb.0: # %entry
2989 ; X64-NEXT: kmovw %edi, %k1
2990 ; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
2993 %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
2994 %0 = bitcast i8 %__U to <8 x i1>
2995 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2996 %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer
3000 define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
3001 ; CHECK-LABEL: test_mm256_shuffle_ps:
3003 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3004 ; CHECK-NEXT: ret{{[l|q]}}
3005 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3006 ret <8 x float> %res
3009 define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
3010 ; X86-LABEL: test_mm256_mask_shuffle_ps:
3012 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3013 ; X86-NEXT: kmovw %eax, %k1
3014 ; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
3017 ; X64-LABEL: test_mm256_mask_shuffle_ps:
3019 ; X64-NEXT: kmovw %edi, %k1
3020 ; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
3022 %arg1 = bitcast i8 %a1 to <8 x i1>
3023 %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3024 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
3025 ret <8 x float> %res1
3028 define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
3029 ; X86-LABEL: test_mm256_maskz_shuffle_ps:
3031 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3032 ; X86-NEXT: kmovw %eax, %k1
3033 ; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3036 ; X64-LABEL: test_mm256_maskz_shuffle_ps:
3038 ; X64-NEXT: kmovw %edi, %k1
3039 ; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
3041 %arg0 = bitcast i8 %a0 to <8 x i1>
3042 %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
3043 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
3044 ret <8 x float> %res1
3047 define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3048 ; X86-LABEL: test_mm256_mask_mul_epi32:
3049 ; X86: # %bb.0: # %entry
3050 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3051 ; X86-NEXT: kmovw %eax, %k1
3052 ; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
3055 ; X64-LABEL: test_mm256_mask_mul_epi32:
3056 ; X64: # %bb.0: # %entry
3057 ; X64-NEXT: kmovw %edi, %k1
3058 ; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
3061 %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
3062 %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
3063 %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
3064 %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
3065 %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
3066 %tmp5 = bitcast i8 %__M to <8 x i1>
3067 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3068 %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
3072 define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3073 ; X86-LABEL: test_mm256_maskz_mul_epi32:
3075 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3076 ; X86-NEXT: kmovw %eax, %k1
3077 ; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
3080 ; X64-LABEL: test_mm256_maskz_mul_epi32:
3082 ; X64-NEXT: kmovw %edi, %k1
3083 ; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
3085 %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
3086 %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
3087 %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
3088 %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
3089 %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
3090 %tmp5 = bitcast i8 %__M to <8 x i1>
3091 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3092 %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
3096 define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3097 ; X86-LABEL: test_mm_mask_mul_epi32:
3099 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3100 ; X86-NEXT: kmovw %eax, %k1
3101 ; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3104 ; X64-LABEL: test_mm_mask_mul_epi32:
3106 ; X64-NEXT: kmovw %edi, %k1
3107 ; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
3109 %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3110 %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3111 %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3112 %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3113 %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3114 %tmp5 = bitcast i8 %__M to <8 x i1>
3115 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3116 %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
3120 define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3121 ; X86-LABEL: test_mm_maskz_mul_epi32:
3123 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3124 ; X86-NEXT: kmovw %eax, %k1
3125 ; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3128 ; X64-LABEL: test_mm_maskz_mul_epi32:
3130 ; X64-NEXT: kmovw %edi, %k1
3131 ; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
3133 %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
3134 %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
3135 %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
3136 %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
3137 %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
3138 %tmp5 = bitcast i8 %__M to <8 x i1>
3139 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3140 %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
3144 define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3145 ; X86-LABEL: test_mm256_mask_mul_epu32:
3146 ; X86: # %bb.0: # %entry
3147 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3148 ; X86-NEXT: kmovw %eax, %k1
3149 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3152 ; X64-LABEL: test_mm256_mask_mul_epu32:
3153 ; X64: # %bb.0: # %entry
3154 ; X64-NEXT: kmovw %edi, %k1
3155 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
3158 %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3159 %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3160 %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3161 %tmp3 = bitcast i8 %__M to <8 x i1>
3162 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3163 %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
3167 define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
3168 ; X86-LABEL: test_mm256_maskz_mul_epu32:
3169 ; X86: # %bb.0: # %entry
3170 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3171 ; X86-NEXT: kmovw %eax, %k1
3172 ; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3175 ; X64-LABEL: test_mm256_maskz_mul_epu32:
3176 ; X64: # %bb.0: # %entry
3177 ; X64-NEXT: kmovw %edi, %k1
3178 ; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
3181 %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3182 %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
3183 %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
3184 %tmp3 = bitcast i8 %__M to <8 x i1>
3185 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3186 %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
3190 define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3191 ; X86-LABEL: test_mm_mask_mul_epu32:
3192 ; X86: # %bb.0: # %entry
3193 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3194 ; X86-NEXT: kmovw %eax, %k1
3195 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
3198 ; X64-LABEL: test_mm_mask_mul_epu32:
3199 ; X64: # %bb.0: # %entry
3200 ; X64-NEXT: kmovw %edi, %k1
3201 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
3204 %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
3205 %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
3206 %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
3207 %tmp3 = bitcast i8 %__M to <8 x i1>
3208 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3209 %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
3213 define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
3214 ; X86-LABEL: test_mm_maskz_mul_epu32:
3215 ; X86: # %bb.0: # %entry
3216 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3217 ; X86-NEXT: kmovw %eax, %k1
3218 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
3221 ; X64-LABEL: test_mm_maskz_mul_epu32:
3222 ; X64: # %bb.0: # %entry
3223 ; X64-NEXT: kmovw %edi, %k1
3224 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
3227 %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
3228 %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
3229 %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
3230 %tmp3 = bitcast i8 %__M to <8 x i1>
3231 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3232 %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
3236 define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
3237 ; CHECK-LABEL: test_mm_cvtepi32_epi8:
3238 ; CHECK: # %bb.0: # %entry
3239 ; CHECK-NEXT: vpmovdb %xmm0, %xmm0
3240 ; CHECK-NEXT: ret{{[l|q]}}
3242 %0 = bitcast <2 x i64> %__A to <4 x i32>
3243 %conv.i = trunc <4 x i32> %0 to <4 x i8>
3244 %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
3245 %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
3249 define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
3250 ; CHECK-LABEL: test_mm_cvtepi32_epi16:
3251 ; CHECK: # %bb.0: # %entry
3252 ; CHECK-NEXT: vpmovdw %xmm0, %xmm0
3253 ; CHECK-NEXT: ret{{[l|q]}}
3255 %0 = bitcast <2 x i64> %__A to <4 x i32>
3256 %conv.i = trunc <4 x i32> %0 to <4 x i16>
3257 %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3258 %1 = bitcast <8 x i16> %shuf.i to <2 x i64>
3262 define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
3263 ; CHECK-LABEL: test_mm_cvtepi64_epi8:
3264 ; CHECK: # %bb.0: # %entry
3265 ; CHECK-NEXT: vpmovqb %xmm0, %xmm0
3266 ; CHECK-NEXT: ret{{[l|q]}}
3268 %conv.i = trunc <2 x i64> %__A to <2 x i8>
3269 %shuf.i = shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
3270 %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
3274 define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
3275 ; CHECK-LABEL: test_mm_cvtepi64_epi16:
3276 ; CHECK: # %bb.0: # %entry
3277 ; CHECK-NEXT: vpmovqw %xmm0, %xmm0
3278 ; CHECK-NEXT: ret{{[l|q]}}
3280 %conv.i = trunc <2 x i64> %__A to <2 x i16>
3281 %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
3282 %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
3286 define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
3287 ; CHECK-LABEL: test_mm_cvtepi64_epi32:
3288 ; CHECK: # %bb.0: # %entry
3289 ; CHECK-NEXT: vpmovqd %xmm0, %xmm0
3290 ; CHECK-NEXT: ret{{[l|q]}}
3292 %conv.i = trunc <2 x i64> %__A to <2 x i32>
3293 %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3294 %0 = bitcast <4 x i32> %shuf.i to <2 x i64>
3298 define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 {
3299 ; CHECK-LABEL: test_mm256_cvtepi32_epi16:
3300 ; CHECK: # %bb.0: # %entry
3301 ; CHECK-NEXT: vpmovdw %ymm0, %xmm0
3302 ; CHECK-NEXT: vzeroupper
3303 ; CHECK-NEXT: ret{{[l|q]}}
3305 %0 = bitcast <4 x i64> %__A to <8 x i32>
3306 %conv.i = trunc <8 x i32> %0 to <8 x i16>
3307 %1 = bitcast <8 x i16> %conv.i to <2 x i64>
3311 define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
3312 ; X86-LABEL: test_mm256_mask_cvtepi32_epi16:
3313 ; X86: # %bb.0: # %entry
3314 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3315 ; X86-NEXT: kmovw %eax, %k1
3316 ; X86-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
3317 ; X86-NEXT: vzeroupper
3320 ; X64-LABEL: test_mm256_mask_cvtepi32_epi16:
3321 ; X64: # %bb.0: # %entry
3322 ; X64-NEXT: kmovw %edi, %k1
3323 ; X64-NEXT: vpmovdw %ymm1, %xmm0 {%k1}
3324 ; X64-NEXT: vzeroupper
3327 %0 = bitcast <4 x i64> %__A to <8 x i32>
3328 %1 = bitcast <2 x i64> %__O to <8 x i16>
3329 %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M)
3330 %3 = bitcast <8 x i16> %2 to <2 x i64>
3334 define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) {
3335 ; X86-LABEL: test_mm256_maskz_cvtepi32_epi16:
3336 ; X86: # %bb.0: # %entry
3337 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3338 ; X86-NEXT: kmovw %eax, %k1
3339 ; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
3340 ; X86-NEXT: vzeroupper
3343 ; X64-LABEL: test_mm256_maskz_cvtepi32_epi16:
3344 ; X64: # %bb.0: # %entry
3345 ; X64-NEXT: kmovw %edi, %k1
3346 ; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
3347 ; X64-NEXT: vzeroupper
3350 %0 = bitcast <4 x i64> %__A to <8 x i32>
3351 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M)
3352 %2 = bitcast <8 x i16> %1 to <2 x i64>
3356 define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 {
3357 ; CHECK-LABEL: test_mm256_cvtepi64_epi32:
3358 ; CHECK: # %bb.0: # %entry
3359 ; CHECK-NEXT: vpmovqd %ymm0, %xmm0
3360 ; CHECK-NEXT: vzeroupper
3361 ; CHECK-NEXT: ret{{[l|q]}}
3363 %conv.i = trunc <4 x i64> %__A to <4 x i32>
3364 %0 = bitcast <4 x i32> %conv.i to <2 x i64>
3368 define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) {
3369 ; X86-LABEL: test_mm256_mask_cvtepi64_epi32:
3370 ; X86: # %bb.0: # %entry
3371 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3372 ; X86-NEXT: kmovw %eax, %k1
3373 ; X86-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
3374 ; X86-NEXT: vzeroupper
3377 ; X64-LABEL: test_mm256_mask_cvtepi64_epi32:
3378 ; X64: # %bb.0: # %entry
3379 ; X64-NEXT: kmovw %edi, %k1
3380 ; X64-NEXT: vpmovqd %ymm1, %xmm0 {%k1}
3381 ; X64-NEXT: vzeroupper
3384 %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
3385 %0 = bitcast <2 x i64> %__O to <4 x i32>
3386 %1 = bitcast i8 %__M to <8 x i1>
3387 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3388 %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0
3389 %3 = bitcast <4 x i32> %2 to <2 x i64>
3393 define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) {
3394 ; X86-LABEL: test_mm256_maskz_cvtepi64_epi32:
3395 ; X86: # %bb.0: # %entry
3396 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3397 ; X86-NEXT: kmovw %eax, %k1
3398 ; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
3399 ; X86-NEXT: vzeroupper
3402 ; X64-LABEL: test_mm256_maskz_cvtepi64_epi32:
3403 ; X64: # %bb.0: # %entry
3404 ; X64-NEXT: kmovw %edi, %k1
3405 ; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z}
3406 ; X64-NEXT: vzeroupper
3409 %conv.i.i = trunc <4 x i64> %__A to <4 x i32>
3410 %0 = bitcast i8 %__M to <8 x i1>
3411 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3412 %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer
3413 %2 = bitcast <4 x i32> %1 to <2 x i64>
3417 define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) {
3418 ; CHECK-LABEL: test_mm256_cvtepi64_epi8:
3419 ; CHECK: # %bb.0: # %entry
3420 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0
3421 ; CHECK-NEXT: vzeroupper
3422 ; CHECK-NEXT: ret{{[l|q]}}
3424 %conv.i = trunc <4 x i64> %__A to <4 x i8>
3425 %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
3426 %0 = bitcast <16 x i8> %shuf.i to <2 x i64>
3430 define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) {
3431 ; CHECK-LABEL: test_mm256_cvtepi64_epi16:
3432 ; CHECK: # %bb.0: # %entry
3433 ; CHECK-NEXT: vpmovqw %ymm0, %xmm0
3434 ; CHECK-NEXT: vzeroupper
3435 ; CHECK-NEXT: ret{{[l|q]}}
3437 %conv.i = trunc <4 x i64> %__A to <4 x i16>
3438 %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3439 %0 = bitcast <8 x i16> %shuf.i to <2 x i64>
3443 define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) {
3444 ; CHECK-LABEL: test_mm256_cvtepi32_epi8:
3445 ; CHECK: # %bb.0: # %entry
3446 ; CHECK-NEXT: vpmovdb %ymm0, %xmm0
3447 ; CHECK-NEXT: vzeroupper
3448 ; CHECK-NEXT: ret{{[l|q]}}
3450 %0 = bitcast <4 x i64> %__A to <8 x i32>
3451 %conv.i = trunc <8 x i32> %0 to <8 x i8>
3452 %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3453 %1 = bitcast <16 x i8> %shuf.i to <2 x i64>
3457 define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3458 ; CHECK-LABEL: test_mm_ternarylogic_epi32:
3459 ; CHECK: # %bb.0: # %entry
3460 ; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0
3461 ; CHECK-NEXT: ret{{[l|q]}}
3463 %0 = bitcast <2 x i64> %__A to <4 x i32>
3464 %1 = bitcast <2 x i64> %__B to <4 x i32>
3465 %2 = bitcast <2 x i64> %__C to <4 x i32>
3466 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3467 %4 = bitcast <4 x i32> %3 to <2 x i64>
; Declaration of the 128-bit ternary-logic intrinsic (lowers to vpternlogd $imm, %xmm, %xmm, %xmm);
; exercised by the test_mm_{,mask_,maskz_}ternarylogic_epi32 tests above with imm8 = 4.
3471 declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32) #2
3473 define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
3474 ; X86-LABEL: test_mm_mask_ternarylogic_epi32:
3475 ; X86: # %bb.0: # %entry
3476 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3477 ; X86-NEXT: kmovw %eax, %k1
3478 ; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
3481 ; X64-LABEL: test_mm_mask_ternarylogic_epi32:
3482 ; X64: # %bb.0: # %entry
3483 ; X64-NEXT: kmovw %edi, %k1
3484 ; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1}
3487 %0 = bitcast <2 x i64> %__A to <4 x i32>
3488 %1 = bitcast <2 x i64> %__B to <4 x i32>
3489 %2 = bitcast <2 x i64> %__C to <4 x i32>
3490 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3491 %4 = bitcast i8 %__U to <8 x i1>
3492 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3493 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0
3494 %6 = bitcast <4 x i32> %5 to <2 x i64>
3498 define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3499 ; X86-LABEL: test_mm_maskz_ternarylogic_epi32:
3500 ; X86: # %bb.0: # %entry
3501 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3502 ; X86-NEXT: kmovw %eax, %k1
3503 ; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3506 ; X64-LABEL: test_mm_maskz_ternarylogic_epi32:
3507 ; X64: # %bb.0: # %entry
3508 ; X64-NEXT: kmovw %edi, %k1
3509 ; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3512 %0 = bitcast <2 x i64> %__A to <4 x i32>
3513 %1 = bitcast <2 x i64> %__B to <4 x i32>
3514 %2 = bitcast <2 x i64> %__C to <4 x i32>
3515 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4)
3516 %4 = bitcast i8 %__U to <8 x i1>
3517 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3518 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer
3519 %6 = bitcast <4 x i32> %5 to <2 x i64>
3523 define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3524 ; CHECK-LABEL: test_mm256_ternarylogic_epi32:
3525 ; CHECK: # %bb.0: # %entry
3526 ; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0
3527 ; CHECK-NEXT: ret{{[l|q]}}
3529 %0 = bitcast <4 x i64> %__A to <8 x i32>
3530 %1 = bitcast <4 x i64> %__B to <8 x i32>
3531 %2 = bitcast <4 x i64> %__C to <8 x i32>
3532 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3533 %4 = bitcast <8 x i32> %3 to <4 x i64>
3537 declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2
3539 define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
3540 ; X86-LABEL: test_mm256_mask_ternarylogic_epi32:
3541 ; X86: # %bb.0: # %entry
3542 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3543 ; X86-NEXT: kmovw %eax, %k1
3544 ; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
3547 ; X64-LABEL: test_mm256_mask_ternarylogic_epi32:
3548 ; X64: # %bb.0: # %entry
3549 ; X64-NEXT: kmovw %edi, %k1
3550 ; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1}
3553 %0 = bitcast <4 x i64> %__A to <8 x i32>
3554 %1 = bitcast <4 x i64> %__B to <8 x i32>
3555 %2 = bitcast <4 x i64> %__C to <8 x i32>
3556 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3557 %4 = bitcast i8 %__U to <8 x i1>
3558 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
3559 %6 = bitcast <8 x i32> %5 to <4 x i64>
3563 define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3564 ; X86-LABEL: test_mm256_maskz_ternarylogic_epi32:
3565 ; X86: # %bb.0: # %entry
3566 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3567 ; X86-NEXT: kmovw %eax, %k1
3568 ; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3571 ; X64-LABEL: test_mm256_maskz_ternarylogic_epi32:
3572 ; X64: # %bb.0: # %entry
3573 ; X64-NEXT: kmovw %edi, %k1
3574 ; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3577 %0 = bitcast <4 x i64> %__A to <8 x i32>
3578 %1 = bitcast <4 x i64> %__B to <8 x i32>
3579 %2 = bitcast <4 x i64> %__C to <8 x i32>
3580 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4)
3581 %4 = bitcast i8 %__U to <8 x i1>
3582 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
3583 %6 = bitcast <8 x i32> %5 to <4 x i64>
3587 define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3588 ; CHECK-LABEL: test_mm_ternarylogic_epi64:
3589 ; CHECK: # %bb.0: # %entry
3590 ; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0
3591 ; CHECK-NEXT: ret{{[l|q]}}
3593 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3597 declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2
3599 define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) {
3600 ; X86-LABEL: test_mm_mask_ternarylogic_epi64:
3601 ; X86: # %bb.0: # %entry
3602 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3603 ; X86-NEXT: kmovw %eax, %k1
3604 ; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
3607 ; X64-LABEL: test_mm_mask_ternarylogic_epi64:
3608 ; X64: # %bb.0: # %entry
3609 ; X64-NEXT: kmovw %edi, %k1
3610 ; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1}
3613 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3614 %1 = bitcast i8 %__U to <8 x i1>
3615 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3616 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A
3620 define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) {
3621 ; X86-LABEL: test_mm_maskz_ternarylogic_epi64:
3622 ; X86: # %bb.0: # %entry
3623 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3624 ; X86-NEXT: kmovw %eax, %k1
3625 ; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3628 ; X64-LABEL: test_mm_maskz_ternarylogic_epi64:
3629 ; X64: # %bb.0: # %entry
3630 ; X64-NEXT: kmovw %edi, %k1
3631 ; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z}
3634 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4)
3635 %1 = bitcast i8 %__U to <8 x i1>
3636 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3637 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
3641 define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3642 ; CHECK-LABEL: test_mm256_ternarylogic_epi64:
3643 ; CHECK: # %bb.0: # %entry
3644 ; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0
3645 ; CHECK-NEXT: ret{{[l|q]}}
3647 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3651 declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2
3653 define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) {
3654 ; X86-LABEL: test_mm256_mask_ternarylogic_epi64:
3655 ; X86: # %bb.0: # %entry
3656 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3657 ; X86-NEXT: kmovw %eax, %k1
3658 ; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
3661 ; X64-LABEL: test_mm256_mask_ternarylogic_epi64:
3662 ; X64: # %bb.0: # %entry
3663 ; X64-NEXT: kmovw %edi, %k1
3664 ; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1}
3667 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3668 %1 = bitcast i8 %__U to <8 x i1>
3669 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3670 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A
3674 define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) {
3675 ; X86-LABEL: test_mm256_maskz_ternarylogic_epi64:
3676 ; X86: # %bb.0: # %entry
3677 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3678 ; X86-NEXT: kmovw %eax, %k1
3679 ; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3682 ; X64-LABEL: test_mm256_maskz_ternarylogic_epi64:
3683 ; X64: # %bb.0: # %entry
3684 ; X64-NEXT: kmovw %edi, %k1
3685 ; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z}
3688 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4)
3689 %1 = bitcast i8 %__U to <8 x i1>
3690 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3691 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
3695 define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
3696 ; X86-LABEL: test_mm_mask2_permutex2var_epi32:
3697 ; X86: # %bb.0: # %entry
3698 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3699 ; X86-NEXT: kmovw %eax, %k1
3700 ; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
3701 ; X86-NEXT: vmovdqa %xmm1, %xmm0
3704 ; X64-LABEL: test_mm_mask2_permutex2var_epi32:
3705 ; X64: # %bb.0: # %entry
3706 ; X64-NEXT: kmovw %edi, %k1
3707 ; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1}
3708 ; X64-NEXT: vmovdqa %xmm1, %xmm0
3711 %0 = bitcast <2 x i64> %__A to <4 x i32>
3712 %1 = bitcast <2 x i64> %__I to <4 x i32>
3713 %2 = bitcast <2 x i64> %__B to <4 x i32>
3714 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3715 %4 = bitcast i8 %__U to <8 x i1>
3716 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3717 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1
3718 %6 = bitcast <4 x i32> %5 to <2 x i64>
3722 define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
3723 ; X86-LABEL: test_mm256_mask2_permutex2var_epi32:
3724 ; X86: # %bb.0: # %entry
3725 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3726 ; X86-NEXT: kmovw %eax, %k1
3727 ; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
3728 ; X86-NEXT: vmovdqa %ymm1, %ymm0
3731 ; X64-LABEL: test_mm256_mask2_permutex2var_epi32:
3732 ; X64: # %bb.0: # %entry
3733 ; X64-NEXT: kmovw %edi, %k1
3734 ; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1}
3735 ; X64-NEXT: vmovdqa %ymm1, %ymm0
3738 %0 = bitcast <4 x i64> %__A to <8 x i32>
3739 %1 = bitcast <4 x i64> %__I to <8 x i32>
3740 %2 = bitcast <4 x i64> %__B to <8 x i32>
3741 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3742 %4 = bitcast i8 %__U to <8 x i1>
3743 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1
3744 %6 = bitcast <8 x i32> %5 to <4 x i64>
3748 define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) {
3749 ; X86-LABEL: test_mm_mask2_permutex2var_pd:
3750 ; X86: # %bb.0: # %entry
3751 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3752 ; X86-NEXT: kmovw %eax, %k1
3753 ; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
3754 ; X86-NEXT: vmovapd %xmm1, %xmm0
3757 ; X64-LABEL: test_mm_mask2_permutex2var_pd:
3758 ; X64: # %bb.0: # %entry
3759 ; X64-NEXT: kmovw %edi, %k1
3760 ; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1}
3761 ; X64-NEXT: vmovapd %xmm1, %xmm0
3764 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
3765 %1 = bitcast <2 x i64> %__I to <2 x double>
3766 %2 = bitcast i8 %__U to <8 x i1>
3767 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3768 %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1
3772 define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) {
3773 ; X86-LABEL: test_mm256_mask2_permutex2var_pd:
3774 ; X86: # %bb.0: # %entry
3775 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3776 ; X86-NEXT: kmovw %eax, %k1
3777 ; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
3778 ; X86-NEXT: vmovapd %ymm1, %ymm0
3781 ; X64-LABEL: test_mm256_mask2_permutex2var_pd:
3782 ; X64: # %bb.0: # %entry
3783 ; X64-NEXT: kmovw %edi, %k1
3784 ; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1}
3785 ; X64-NEXT: vmovapd %ymm1, %ymm0
3788 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
3789 %1 = bitcast <4 x i64> %__I to <4 x double>
3790 %2 = bitcast i8 %__U to <8 x i1>
3791 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3792 %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1
3796 define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) {
3797 ; X86-LABEL: test_mm_mask2_permutex2var_ps:
3798 ; X86: # %bb.0: # %entry
3799 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3800 ; X86-NEXT: kmovw %eax, %k1
3801 ; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
3802 ; X86-NEXT: vmovaps %xmm1, %xmm0
3805 ; X64-LABEL: test_mm_mask2_permutex2var_ps:
3806 ; X64: # %bb.0: # %entry
3807 ; X64-NEXT: kmovw %edi, %k1
3808 ; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1}
3809 ; X64-NEXT: vmovaps %xmm1, %xmm0
3812 %0 = bitcast <2 x i64> %__I to <4 x i32>
3813 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
3814 %2 = bitcast <2 x i64> %__I to <4 x float>
3815 %3 = bitcast i8 %__U to <8 x i1>
3816 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3817 %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2
3821 define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) {
3822 ; X86-LABEL: test_mm256_mask2_permutex2var_ps:
3823 ; X86: # %bb.0: # %entry
3824 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3825 ; X86-NEXT: kmovw %eax, %k1
3826 ; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
3827 ; X86-NEXT: vmovaps %ymm1, %ymm0
3830 ; X64-LABEL: test_mm256_mask2_permutex2var_ps:
3831 ; X64: # %bb.0: # %entry
3832 ; X64-NEXT: kmovw %edi, %k1
3833 ; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1}
3834 ; X64-NEXT: vmovaps %ymm1, %ymm0
3837 %0 = bitcast <4 x i64> %__I to <8 x i32>
3838 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
3839 %2 = bitcast <4 x i64> %__I to <8 x float>
3840 %3 = bitcast i8 %__U to <8 x i1>
3841 %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
3845 define <2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
3846 ; X86-LABEL: test_mm_mask2_permutex2var_epi64:
3847 ; X86: # %bb.0: # %entry
3848 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3849 ; X86-NEXT: kmovw %eax, %k1
3850 ; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
3851 ; X86-NEXT: vmovdqa %xmm1, %xmm0
3854 ; X64-LABEL: test_mm_mask2_permutex2var_epi64:
3855 ; X64: # %bb.0: # %entry
3856 ; X64-NEXT: kmovw %edi, %k1
3857 ; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1}
3858 ; X64-NEXT: vmovdqa %xmm1, %xmm0
3861 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
3862 %1 = bitcast i8 %__U to <8 x i1>
3863 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
3864 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I
3868 define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) {
3869 ; X86-LABEL: test_mm256_mask2_permutex2var_epi64:
3870 ; X86: # %bb.0: # %entry
3871 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3872 ; X86-NEXT: kmovw %eax, %k1
3873 ; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
3874 ; X86-NEXT: vmovdqa %ymm1, %ymm0
3877 ; X64-LABEL: test_mm256_mask2_permutex2var_epi64:
3878 ; X64: # %bb.0: # %entry
3879 ; X64-NEXT: kmovw %edi, %k1
3880 ; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1}
3881 ; X64-NEXT: vmovdqa %ymm1, %ymm0
3884 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
3885 %1 = bitcast i8 %__U to <8 x i1>
3886 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3887 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I
3891 define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
3892 ; CHECK-LABEL: test_mm_permutex2var_epi32:
3893 ; CHECK: # %bb.0: # %entry
3894 ; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
3895 ; CHECK-NEXT: ret{{[l|q]}}
3897 %0 = bitcast <2 x i64> %__A to <4 x i32>
3898 %1 = bitcast <2 x i64> %__I to <4 x i32>
3899 %2 = bitcast <2 x i64> %__B to <4 x i32>
3900 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3901 %4 = bitcast <4 x i32> %3 to <2 x i64>
3905 define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
3906 ; X86-LABEL: test_mm_mask_permutex2var_epi32:
3907 ; X86: # %bb.0: # %entry
3908 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3909 ; X86-NEXT: kmovw %eax, %k1
3910 ; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
3913 ; X64-LABEL: test_mm_mask_permutex2var_epi32:
3914 ; X64: # %bb.0: # %entry
3915 ; X64-NEXT: kmovw %edi, %k1
3916 ; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1}
3919 %0 = bitcast <2 x i64> %__A to <4 x i32>
3920 %1 = bitcast <2 x i64> %__I to <4 x i32>
3921 %2 = bitcast <2 x i64> %__B to <4 x i32>
3922 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3923 %4 = bitcast i8 %__U to <8 x i1>
3924 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3925 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
3926 %6 = bitcast <4 x i32> %5 to <2 x i64>
3930 define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
3931 ; X86-LABEL: test_mm_maskz_permutex2var_epi32:
3932 ; X86: # %bb.0: # %entry
3933 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3934 ; X86-NEXT: kmovw %eax, %k1
3935 ; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3938 ; X64-LABEL: test_mm_maskz_permutex2var_epi32:
3939 ; X64: # %bb.0: # %entry
3940 ; X64-NEXT: kmovw %edi, %k1
3941 ; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z}
3944 %0 = bitcast <2 x i64> %__A to <4 x i32>
3945 %1 = bitcast <2 x i64> %__I to <4 x i32>
3946 %2 = bitcast <2 x i64> %__B to <4 x i32>
3947 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
3948 %4 = bitcast i8 %__U to <8 x i1>
3949 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3950 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
3951 %6 = bitcast <4 x i32> %5 to <2 x i64>
3955 define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
3956 ; CHECK-LABEL: test_mm256_permutex2var_epi32:
3957 ; CHECK: # %bb.0: # %entry
3958 ; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
3959 ; CHECK-NEXT: ret{{[l|q]}}
3961 %0 = bitcast <4 x i64> %__A to <8 x i32>
3962 %1 = bitcast <4 x i64> %__I to <8 x i32>
3963 %2 = bitcast <4 x i64> %__B to <8 x i32>
3964 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3965 %4 = bitcast <8 x i32> %3 to <4 x i64>
3969 define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
3970 ; X86-LABEL: test_mm256_mask_permutex2var_epi32:
3971 ; X86: # %bb.0: # %entry
3972 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3973 ; X86-NEXT: kmovw %eax, %k1
3974 ; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
3977 ; X64-LABEL: test_mm256_mask_permutex2var_epi32:
3978 ; X64: # %bb.0: # %entry
3979 ; X64-NEXT: kmovw %edi, %k1
3980 ; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1}
3983 %0 = bitcast <4 x i64> %__A to <8 x i32>
3984 %1 = bitcast <4 x i64> %__I to <8 x i32>
3985 %2 = bitcast <4 x i64> %__B to <8 x i32>
3986 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
3987 %4 = bitcast i8 %__U to <8 x i1>
3988 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
3989 %6 = bitcast <8 x i32> %5 to <4 x i64>
3993 define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
3994 ; X86-LABEL: test_mm256_maskz_permutex2var_epi32:
3995 ; X86: # %bb.0: # %entry
3996 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3997 ; X86-NEXT: kmovw %eax, %k1
3998 ; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
4001 ; X64-LABEL: test_mm256_maskz_permutex2var_epi32:
4002 ; X64: # %bb.0: # %entry
4003 ; X64-NEXT: kmovw %edi, %k1
4004 ; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z}
4007 %0 = bitcast <4 x i64> %__A to <8 x i32>
4008 %1 = bitcast <4 x i64> %__I to <8 x i32>
4009 %2 = bitcast <4 x i64> %__B to <8 x i32>
4010 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
4011 %4 = bitcast i8 %__U to <8 x i1>
4012 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
4013 %6 = bitcast <8 x i32> %5 to <4 x i64>
4017 define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
4018 ; CHECK-LABEL: test_mm_permutex2var_pd:
4019 ; CHECK: # %bb.0: # %entry
4020 ; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0
4021 ; CHECK-NEXT: ret{{[l|q]}}
4023 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4027 define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) {
4028 ; X86-LABEL: test_mm_mask_permutex2var_pd:
4029 ; X86: # %bb.0: # %entry
4030 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4031 ; X86-NEXT: kmovw %eax, %k1
4032 ; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
4035 ; X64-LABEL: test_mm_mask_permutex2var_pd:
4036 ; X64: # %bb.0: # %entry
4037 ; X64-NEXT: kmovw %edi, %k1
4038 ; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1}
4041 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4042 %1 = bitcast i8 %__U to <8 x i1>
4043 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4044 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4048 define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) {
4049 ; X86-LABEL: test_mm_maskz_permutex2var_pd:
4050 ; X86: # %bb.0: # %entry
4051 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4052 ; X86-NEXT: kmovw %eax, %k1
4053 ; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
4056 ; X64-LABEL: test_mm_maskz_permutex2var_pd:
4057 ; X64: # %bb.0: # %entry
4058 ; X64-NEXT: kmovw %edi, %k1
4059 ; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z}
4062 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B)
4063 %1 = bitcast i8 %__U to <8 x i1>
4064 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4065 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4069 define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
4070 ; CHECK-LABEL: test_mm256_permutex2var_pd:
4071 ; CHECK: # %bb.0: # %entry
4072 ; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0
4073 ; CHECK-NEXT: ret{{[l|q]}}
4075 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4079 define <4 x double> @test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) {
4080 ; X86-LABEL: test_mm256_mask_permutex2var_pd:
4081 ; X86: # %bb.0: # %entry
4082 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4083 ; X86-NEXT: kmovw %eax, %k1
4084 ; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
4087 ; X64-LABEL: test_mm256_mask_permutex2var_pd:
4088 ; X64: # %bb.0: # %entry
4089 ; X64-NEXT: kmovw %edi, %k1
4090 ; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1}
4093 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4094 %1 = bitcast i8 %__U to <8 x i1>
4095 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4096 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4100 define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) {
4101 ; X86-LABEL: test_mm256_maskz_permutex2var_pd:
4102 ; X86: # %bb.0: # %entry
4103 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4104 ; X86-NEXT: kmovw %eax, %k1
4105 ; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
4108 ; X64-LABEL: test_mm256_maskz_permutex2var_pd:
4109 ; X64: # %bb.0: # %entry
4110 ; X64-NEXT: kmovw %edi, %k1
4111 ; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z}
4114 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B)
4115 %1 = bitcast i8 %__U to <8 x i1>
4116 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4117 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4121 define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
4122 ; CHECK-LABEL: test_mm_permutex2var_ps:
4123 ; CHECK: # %bb.0: # %entry
4124 ; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0
4125 ; CHECK-NEXT: ret{{[l|q]}}
4127 %0 = bitcast <2 x i64> %__I to <4 x i32>
4128 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4132 define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) {
4133 ; X86-LABEL: test_mm_mask_permutex2var_ps:
4134 ; X86: # %bb.0: # %entry
4135 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4136 ; X86-NEXT: kmovw %eax, %k1
4137 ; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
4140 ; X64-LABEL: test_mm_mask_permutex2var_ps:
4141 ; X64: # %bb.0: # %entry
4142 ; X64-NEXT: kmovw %edi, %k1
4143 ; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1}
4146 %0 = bitcast <2 x i64> %__I to <4 x i32>
4147 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4148 %2 = bitcast i8 %__U to <8 x i1>
4149 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4150 %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A
4154 define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) {
4155 ; X86-LABEL: test_mm_maskz_permutex2var_ps:
4156 ; X86: # %bb.0: # %entry
4157 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4158 ; X86-NEXT: kmovw %eax, %k1
4159 ; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
4162 ; X64-LABEL: test_mm_maskz_permutex2var_ps:
4163 ; X64: # %bb.0: # %entry
4164 ; X64-NEXT: kmovw %edi, %k1
4165 ; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z}
4168 %0 = bitcast <2 x i64> %__I to <4 x i32>
4169 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B)
4170 %2 = bitcast i8 %__U to <8 x i1>
4171 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4172 %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer
4176 define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
4177 ; CHECK-LABEL: test_mm256_permutex2var_ps:
4178 ; CHECK: # %bb.0: # %entry
4179 ; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0
4180 ; CHECK-NEXT: ret{{[l|q]}}
4182 %0 = bitcast <4 x i64> %__I to <8 x i32>
4183 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4187 define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) {
4188 ; X86-LABEL: test_mm256_mask_permutex2var_ps:
4189 ; X86: # %bb.0: # %entry
4190 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4191 ; X86-NEXT: kmovw %eax, %k1
4192 ; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
4195 ; X64-LABEL: test_mm256_mask_permutex2var_ps:
4196 ; X64: # %bb.0: # %entry
4197 ; X64-NEXT: kmovw %edi, %k1
4198 ; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1}
4201 %0 = bitcast <4 x i64> %__I to <8 x i32>
4202 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4203 %2 = bitcast i8 %__U to <8 x i1>
4204 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A
4208 define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) {
4209 ; X86-LABEL: test_mm256_maskz_permutex2var_ps:
4210 ; X86: # %bb.0: # %entry
4211 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4212 ; X86-NEXT: kmovw %eax, %k1
4213 ; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
4216 ; X64-LABEL: test_mm256_maskz_permutex2var_ps:
4217 ; X64: # %bb.0: # %entry
4218 ; X64-NEXT: kmovw %edi, %k1
4219 ; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z}
4222 %0 = bitcast <4 x i64> %__I to <8 x i32>
4223 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B)
4224 %2 = bitcast i8 %__U to <8 x i1>
4225 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
4229 define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
4230 ; CHECK-LABEL: test_mm_permutex2var_epi64:
4231 ; CHECK: # %bb.0: # %entry
4232 ; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0
4233 ; CHECK-NEXT: ret{{[l|q]}}
4235 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4239 define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
4240 ; X86-LABEL: test_mm_mask_permutex2var_epi64:
4241 ; X86: # %bb.0: # %entry
4242 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4243 ; X86-NEXT: kmovw %eax, %k1
4244 ; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
4247 ; X64-LABEL: test_mm_mask_permutex2var_epi64:
4248 ; X64: # %bb.0: # %entry
4249 ; X64-NEXT: kmovw %edi, %k1
4250 ; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1}
4253 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4254 %1 = bitcast i8 %__U to <8 x i1>
4255 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4256 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A
4260 define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
4261 ; X86-LABEL: test_mm_maskz_permutex2var_epi64:
4262 ; X86: # %bb.0: # %entry
4263 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4264 ; X86-NEXT: kmovw %eax, %k1
4265 ; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
4268 ; X64-LABEL: test_mm_maskz_permutex2var_epi64:
4269 ; X64: # %bb.0: # %entry
4270 ; X64-NEXT: kmovw %edi, %k1
4271 ; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z}
4274 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B)
4275 %1 = bitcast i8 %__U to <8 x i1>
4276 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4277 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
4281 define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
4282 ; CHECK-LABEL: test_mm256_permutex2var_epi64:
4283 ; CHECK: # %bb.0: # %entry
4284 ; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0
4285 ; CHECK-NEXT: ret{{[l|q]}}
4287 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4291 define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
4292 ; X86-LABEL: test_mm256_mask_permutex2var_epi64:
4293 ; X86: # %bb.0: # %entry
4294 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4295 ; X86-NEXT: kmovw %eax, %k1
4296 ; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
4299 ; X64-LABEL: test_mm256_mask_permutex2var_epi64:
4300 ; X64: # %bb.0: # %entry
4301 ; X64-NEXT: kmovw %edi, %k1
4302 ; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1}
4305 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4306 %1 = bitcast i8 %__U to <8 x i1>
4307 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4308 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A
4312 define <4 x i64> @test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
4313 ; X86-LABEL: test_mm256_maskz_permutex2var_epi64:
4314 ; X86: # %bb.0: # %entry
4315 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4316 ; X86-NEXT: kmovw %eax, %k1
4317 ; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
4320 ; X64-LABEL: test_mm256_maskz_permutex2var_epi64:
4321 ; X64: # %bb.0: # %entry
4322 ; X64-NEXT: kmovw %edi, %k1
4323 ; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z}
4326 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B)
4327 %1 = bitcast i8 %__U to <8 x i1>
4328 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4329 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
4334 define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4335 ; X86-LABEL: test_mm_mask_fmadd_pd:
4336 ; X86: # %bb.0: # %entry
4337 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4338 ; X86-NEXT: kmovw %eax, %k1
4339 ; X86-NEXT: vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4342 ; X64-LABEL: test_mm_mask_fmadd_pd:
4343 ; X64: # %bb.0: # %entry
4344 ; X64-NEXT: kmovw %edi, %k1
4345 ; X64-NEXT: vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4348 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4349 %1 = bitcast i8 %__U to <8 x i1>
4350 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4351 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4355 define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
4356 ; X86-LABEL: test_mm_mask_fmsub_pd:
4357 ; X86: # %bb.0: # %entry
4358 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4359 ; X86-NEXT: kmovw %eax, %k1
4360 ; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4363 ; X64-LABEL: test_mm_mask_fmsub_pd:
4364 ; X64: # %bb.0: # %entry
4365 ; X64-NEXT: kmovw %edi, %k1
4366 ; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4369 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4370 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4371 %1 = bitcast i8 %__U to <8 x i1>
4372 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4373 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
4377 define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4378 ; X86-LABEL: test_mm_mask3_fmadd_pd:
4379 ; X86: # %bb.0: # %entry
4380 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4381 ; X86-NEXT: kmovw %eax, %k1
4382 ; X86-NEXT: vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4383 ; X86-NEXT: vmovapd %xmm2, %xmm0
4386 ; X64-LABEL: test_mm_mask3_fmadd_pd:
4387 ; X64: # %bb.0: # %entry
4388 ; X64-NEXT: kmovw %edi, %k1
4389 ; X64-NEXT: vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4390 ; X64-NEXT: vmovapd %xmm2, %xmm0
4393 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4394 %1 = bitcast i8 %__U to <8 x i1>
4395 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4396 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4400 define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
4401 ; X86-LABEL: test_mm_mask3_fnmadd_pd:
4402 ; X86: # %bb.0: # %entry
4403 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4404 ; X86-NEXT: kmovw %eax, %k1
4405 ; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4406 ; X86-NEXT: vmovapd %xmm2, %xmm0
4409 ; X64-LABEL: test_mm_mask3_fnmadd_pd:
4410 ; X64: # %bb.0: # %entry
4411 ; X64-NEXT: kmovw %edi, %k1
4412 ; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4413 ; X64-NEXT: vmovapd %xmm2, %xmm0
4416 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4417 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4418 %1 = bitcast i8 %__U to <8 x i1>
4419 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4420 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
4424 define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4425 ; X86-LABEL: test_mm_maskz_fmadd_pd:
4426 ; X86: # %bb.0: # %entry
4427 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4428 ; X86-NEXT: kmovw %eax, %k1
4429 ; X86-NEXT: vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4432 ; X64-LABEL: test_mm_maskz_fmadd_pd:
4433 ; X64: # %bb.0: # %entry
4434 ; X64-NEXT: kmovw %edi, %k1
4435 ; X64-NEXT: vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4438 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
4439 %1 = bitcast i8 %__U to <8 x i1>
4440 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4441 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4445 define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4446 ; X86-LABEL: test_mm_maskz_fmsub_pd:
4447 ; X86: # %bb.0: # %entry
4448 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4449 ; X86-NEXT: kmovw %eax, %k1
4450 ; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4453 ; X64-LABEL: test_mm_maskz_fmsub_pd:
4454 ; X64: # %bb.0: # %entry
4455 ; X64-NEXT: kmovw %edi, %k1
4456 ; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4459 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4460 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
4461 %1 = bitcast i8 %__U to <8 x i1>
4462 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4463 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4467 define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4468 ; X86-LABEL: test_mm_maskz_fnmadd_pd:
4469 ; X86: # %bb.0: # %entry
4470 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4471 ; X86-NEXT: kmovw %eax, %k1
4472 ; X86-NEXT: vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4475 ; X64-LABEL: test_mm_maskz_fnmadd_pd:
4476 ; X64: # %bb.0: # %entry
4477 ; X64-NEXT: kmovw %edi, %k1
4478 ; X64-NEXT: vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4481 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4482 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
4483 %1 = bitcast i8 %__U to <8 x i1>
4484 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4485 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4489 define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
4490 ; X86-LABEL: test_mm_maskz_fnmsub_pd:
4491 ; X86: # %bb.0: # %entry
4492 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4493 ; X86-NEXT: kmovw %eax, %k1
4494 ; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4497 ; X64-LABEL: test_mm_maskz_fnmsub_pd:
4498 ; X64: # %bb.0: # %entry
4499 ; X64-NEXT: kmovw %edi, %k1
4500 ; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4503 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
4504 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
4505 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
4506 %1 = bitcast i8 %__U to <8 x i1>
4507 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
4508 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
4512 define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4513 ; X86-LABEL: test_mm256_mask_fmadd_pd:
4514 ; X86: # %bb.0: # %entry
4515 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4516 ; X86-NEXT: kmovw %eax, %k1
4517 ; X86-NEXT: vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4520 ; X64-LABEL: test_mm256_mask_fmadd_pd:
4521 ; X64: # %bb.0: # %entry
4522 ; X64-NEXT: kmovw %edi, %k1
4523 ; X64-NEXT: vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4526 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4527 %1 = bitcast i8 %__U to <8 x i1>
4528 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4529 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4533 define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
4534 ; X86-LABEL: test_mm256_mask_fmsub_pd:
4535 ; X86: # %bb.0: # %entry
4536 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4537 ; X86-NEXT: kmovw %eax, %k1
4538 ; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4541 ; X64-LABEL: test_mm256_mask_fmsub_pd:
4542 ; X64: # %bb.0: # %entry
4543 ; X64-NEXT: kmovw %edi, %k1
4544 ; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4547 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4548 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
4549 %1 = bitcast i8 %__U to <8 x i1>
4550 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4551 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
4555 define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4556 ; X86-LABEL: test_mm256_mask3_fmadd_pd:
4557 ; X86: # %bb.0: # %entry
4558 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4559 ; X86-NEXT: kmovw %eax, %k1
4560 ; X86-NEXT: vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4561 ; X86-NEXT: vmovapd %ymm2, %ymm0
4564 ; X64-LABEL: test_mm256_mask3_fmadd_pd:
4565 ; X64: # %bb.0: # %entry
4566 ; X64-NEXT: kmovw %edi, %k1
4567 ; X64-NEXT: vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4568 ; X64-NEXT: vmovapd %ymm2, %ymm0
4571 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4572 %1 = bitcast i8 %__U to <8 x i1>
4573 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4574 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
4578 define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
4579 ; X86-LABEL: test_mm256_mask3_fnmadd_pd:
4580 ; X86: # %bb.0: # %entry
4581 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4582 ; X86-NEXT: kmovw %eax, %k1
4583 ; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4584 ; X86-NEXT: vmovapd %ymm2, %ymm0
4587 ; X64-LABEL: test_mm256_mask3_fnmadd_pd:
4588 ; X64: # %bb.0: # %entry
4589 ; X64-NEXT: kmovw %edi, %k1
4590 ; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4591 ; X64-NEXT: vmovapd %ymm2, %ymm0
4594 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4595 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
4596 %1 = bitcast i8 %__U to <8 x i1>
4597 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4598 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
4602 define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4603 ; X86-LABEL: test_mm256_maskz_fmadd_pd:
4604 ; X86: # %bb.0: # %entry
4605 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4606 ; X86-NEXT: kmovw %eax, %k1
4607 ; X86-NEXT: vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4610 ; X64-LABEL: test_mm256_maskz_fmadd_pd:
4611 ; X64: # %bb.0: # %entry
4612 ; X64-NEXT: kmovw %edi, %k1
4613 ; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4616 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
4617 %1 = bitcast i8 %__U to <8 x i1>
4618 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4619 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4623 define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4624 ; X86-LABEL: test_mm256_maskz_fmsub_pd:
4625 ; X86: # %bb.0: # %entry
4626 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4627 ; X86-NEXT: kmovw %eax, %k1
4628 ; X86-NEXT: vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4631 ; X64-LABEL: test_mm256_maskz_fmsub_pd:
4632 ; X64: # %bb.0: # %entry
4633 ; X64-NEXT: kmovw %edi, %k1
4634 ; X64-NEXT: vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4637 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4638 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
4639 %1 = bitcast i8 %__U to <8 x i1>
4640 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4641 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4645 define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4646 ; X86-LABEL: test_mm256_maskz_fnmadd_pd:
4647 ; X86: # %bb.0: # %entry
4648 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4649 ; X86-NEXT: kmovw %eax, %k1
4650 ; X86-NEXT: vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
4653 ; X64-LABEL: test_mm256_maskz_fnmadd_pd:
4654 ; X64: # %bb.0: # %entry
4655 ; X64-NEXT: kmovw %edi, %k1
4656 ; X64-NEXT: vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
4659 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4660 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
4661 %1 = bitcast i8 %__U to <8 x i1>
4662 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4663 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4667 define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
4668 ; X86-LABEL: test_mm256_maskz_fnmsub_pd:
4669 ; X86: # %bb.0: # %entry
4670 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4671 ; X86-NEXT: kmovw %eax, %k1
4672 ; X86-NEXT: vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
4675 ; X64-LABEL: test_mm256_maskz_fnmsub_pd:
4676 ; X64: # %bb.0: # %entry
4677 ; X64-NEXT: kmovw %edi, %k1
4678 ; X64-NEXT: vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
4681 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4682 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4683 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
4684 %1 = bitcast i8 %__U to <8 x i1>
4685 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4686 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
4690 define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
4691 ; X86-LABEL: test_mm_mask_fmadd_ps:
4692 ; X86: # %bb.0: # %entry
4693 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4694 ; X86-NEXT: kmovw %eax, %k1
4695 ; X86-NEXT: vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4698 ; X64-LABEL: test_mm_mask_fmadd_ps:
4699 ; X64: # %bb.0: # %entry
4700 ; X64-NEXT: kmovw %edi, %k1
4701 ; X64-NEXT: vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2
4704 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4705 %1 = bitcast i8 %__U to <8 x i1>
4706 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4707 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
4711 define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
4712 ; X86-LABEL: test_mm_mask_fmsub_ps:
4713 ; X86: # %bb.0: # %entry
4714 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4715 ; X86-NEXT: kmovw %eax, %k1
4716 ; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4719 ; X64-LABEL: test_mm_mask_fmsub_ps:
4720 ; X64: # %bb.0: # %entry
4721 ; X64-NEXT: kmovw %edi, %k1
4722 ; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2
4725 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4726 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
4727 %1 = bitcast i8 %__U to <8 x i1>
4728 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4729 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
4733 define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
4734 ; X86-LABEL: test_mm_mask3_fmadd_ps:
4735 ; X86: # %bb.0: # %entry
4736 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4737 ; X86-NEXT: kmovw %eax, %k1
4738 ; X86-NEXT: vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4739 ; X86-NEXT: vmovaps %xmm2, %xmm0
4742 ; X64-LABEL: test_mm_mask3_fmadd_ps:
4743 ; X64: # %bb.0: # %entry
4744 ; X64-NEXT: kmovw %edi, %k1
4745 ; X64-NEXT: vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
4746 ; X64-NEXT: vmovaps %xmm2, %xmm0
4749 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4750 %1 = bitcast i8 %__U to <8 x i1>
4751 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4752 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
4756 define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
4757 ; X86-LABEL: test_mm_mask3_fnmadd_ps:
4758 ; X86: # %bb.0: # %entry
4759 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4760 ; X86-NEXT: kmovw %eax, %k1
4761 ; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4762 ; X86-NEXT: vmovaps %xmm2, %xmm0
4765 ; X64-LABEL: test_mm_mask3_fnmadd_ps:
4766 ; X64: # %bb.0: # %entry
4767 ; X64-NEXT: kmovw %edi, %k1
4768 ; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
4769 ; X64-NEXT: vmovaps %xmm2, %xmm0
4772 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4773 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
4774 %1 = bitcast i8 %__U to <8 x i1>
4775 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4776 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
4780 define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4781 ; X86-LABEL: test_mm_maskz_fmadd_ps:
4782 ; X86: # %bb.0: # %entry
4783 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4784 ; X86-NEXT: kmovw %eax, %k1
4785 ; X86-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4788 ; X64-LABEL: test_mm_maskz_fmadd_ps:
4789 ; X64: # %bb.0: # %entry
4790 ; X64-NEXT: kmovw %edi, %k1
4791 ; X64-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
4794 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
4795 %1 = bitcast i8 %__U to <8 x i1>
4796 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4797 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4801 define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4802 ; X86-LABEL: test_mm_maskz_fmsub_ps:
4803 ; X86: # %bb.0: # %entry
4804 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4805 ; X86-NEXT: kmovw %eax, %k1
4806 ; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4809 ; X64-LABEL: test_mm_maskz_fmsub_ps:
4810 ; X64: # %bb.0: # %entry
4811 ; X64-NEXT: kmovw %edi, %k1
4812 ; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
4815 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4816 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
4817 %1 = bitcast i8 %__U to <8 x i1>
4818 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4819 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4823 define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4824 ; X86-LABEL: test_mm_maskz_fnmadd_ps:
4825 ; X86: # %bb.0: # %entry
4826 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4827 ; X86-NEXT: kmovw %eax, %k1
4828 ; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4831 ; X64-LABEL: test_mm_maskz_fnmadd_ps:
4832 ; X64: # %bb.0: # %entry
4833 ; X64-NEXT: kmovw %edi, %k1
4834 ; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
4837 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4838 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
4839 %1 = bitcast i8 %__U to <8 x i1>
4840 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4841 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4845 define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4846 ; X86-LABEL: test_mm_maskz_fnmsub_ps:
4847 ; X86: # %bb.0: # %entry
4848 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4849 ; X86-NEXT: kmovw %eax, %k1
4850 ; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4853 ; X64-LABEL: test_mm_maskz_fnmsub_ps:
4854 ; X64: # %bb.0: # %entry
4855 ; X64-NEXT: kmovw %edi, %k1
4856 ; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
4859 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4860 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4861 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
4862 %1 = bitcast i8 %__U to <8 x i1>
4863 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4864 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
4868 define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
4869 ; X86-LABEL: test_mm256_mask_fmadd_ps:
4870 ; X86: # %bb.0: # %entry
4871 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4872 ; X86-NEXT: kmovw %eax, %k1
4873 ; X86-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4876 ; X64-LABEL: test_mm256_mask_fmadd_ps:
4877 ; X64: # %bb.0: # %entry
4878 ; X64-NEXT: kmovw %edi, %k1
4879 ; X64-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2
4882 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4883 %1 = bitcast i8 %__U to <8 x i1>
4884 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
4888 define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
4889 ; X86-LABEL: test_mm256_mask_fmsub_ps:
4890 ; X86: # %bb.0: # %entry
4891 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4892 ; X86-NEXT: kmovw %eax, %k1
4893 ; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4896 ; X64-LABEL: test_mm256_mask_fmsub_ps:
4897 ; X64: # %bb.0: # %entry
4898 ; X64-NEXT: kmovw %edi, %k1
4899 ; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2
4902 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4903 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
4904 %1 = bitcast i8 %__U to <8 x i1>
4905 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
4909 define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
4910 ; X86-LABEL: test_mm256_mask3_fmadd_ps:
4911 ; X86: # %bb.0: # %entry
4912 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4913 ; X86-NEXT: kmovw %eax, %k1
4914 ; X86-NEXT: vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4915 ; X86-NEXT: vmovaps %ymm2, %ymm0
4918 ; X64-LABEL: test_mm256_mask3_fmadd_ps:
4919 ; X64: # %bb.0: # %entry
4920 ; X64-NEXT: kmovw %edi, %k1
4921 ; X64-NEXT: vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2
4922 ; X64-NEXT: vmovaps %ymm2, %ymm0
4925 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4926 %1 = bitcast i8 %__U to <8 x i1>
4927 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
4931 define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
4932 ; X86-LABEL: test_mm256_mask3_fnmadd_ps:
4933 ; X86: # %bb.0: # %entry
4934 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4935 ; X86-NEXT: kmovw %eax, %k1
4936 ; X86-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4937 ; X86-NEXT: vmovaps %ymm2, %ymm0
4940 ; X64-LABEL: test_mm256_mask3_fnmadd_ps:
4941 ; X64: # %bb.0: # %entry
4942 ; X64-NEXT: kmovw %edi, %k1
4943 ; X64-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
4944 ; X64-NEXT: vmovaps %ymm2, %ymm0
4947 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4948 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
4949 %1 = bitcast i8 %__U to <8 x i1>
4950 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
4954 define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4955 ; X86-LABEL: test_mm256_maskz_fmadd_ps:
4956 ; X86: # %bb.0: # %entry
4957 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4958 ; X86-NEXT: kmovw %eax, %k1
4959 ; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4962 ; X64-LABEL: test_mm256_maskz_fmadd_ps:
4963 ; X64: # %bb.0: # %entry
4964 ; X64-NEXT: kmovw %edi, %k1
4965 ; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
4968 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
4969 %1 = bitcast i8 %__U to <8 x i1>
4970 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
4974 define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4975 ; X86-LABEL: test_mm256_maskz_fmsub_ps:
4976 ; X86: # %bb.0: # %entry
4977 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4978 ; X86-NEXT: kmovw %eax, %k1
4979 ; X86-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4982 ; X64-LABEL: test_mm256_maskz_fmsub_ps:
4983 ; X64: # %bb.0: # %entry
4984 ; X64-NEXT: kmovw %edi, %k1
4985 ; X64-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
4988 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4989 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
4990 %1 = bitcast i8 %__U to <8 x i1>
4991 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
4995 define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
4996 ; X86-LABEL: test_mm256_maskz_fnmadd_ps:
4997 ; X86: # %bb.0: # %entry
4998 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4999 ; X86-NEXT: kmovw %eax, %k1
5000 ; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
5003 ; X64-LABEL: test_mm256_maskz_fnmadd_ps:
5004 ; X64: # %bb.0: # %entry
5005 ; X64-NEXT: kmovw %edi, %k1
5006 ; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
5009 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5010 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
5011 %1 = bitcast i8 %__U to <8 x i1>
5012 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
5016 define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5017 ; X86-LABEL: test_mm256_maskz_fnmsub_ps:
5018 ; X86: # %bb.0: # %entry
5019 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5020 ; X86-NEXT: kmovw %eax, %k1
5021 ; X86-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
5024 ; X64-LABEL: test_mm256_maskz_fnmsub_ps:
5025 ; X64: # %bb.0: # %entry
5026 ; X64-NEXT: kmovw %edi, %k1
5027 ; X64-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
5030 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
5031 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5032 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
5033 %1 = bitcast i8 %__U to <8 x i1>
5034 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
5038 define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5039 ; X86-LABEL: test_mm_mask_fmaddsub_pd:
5040 ; X86: # %bb.0: # %entry
5041 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5042 ; X86-NEXT: kmovw %eax, %k1
5043 ; X86-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5046 ; X64-LABEL: test_mm_mask_fmaddsub_pd:
5047 ; X64: # %bb.0: # %entry
5048 ; X64-NEXT: kmovw %edi, %k1
5049 ; X64-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5052 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5053 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5054 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5055 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5056 %4 = bitcast i8 %__U to <8 x i1>
5057 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5058 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
5062 define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5063 ; X86-LABEL: test_mm_mask_fmsubadd_pd:
5064 ; X86: # %bb.0: # %entry
5065 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5066 ; X86-NEXT: kmovw %eax, %k1
5067 ; X86-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5070 ; X64-LABEL: test_mm_mask_fmsubadd_pd:
5071 ; X64: # %bb.0: # %entry
5072 ; X64-NEXT: kmovw %edi, %k1
5073 ; X64-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5076 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5077 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5078 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5079 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5080 %3 = bitcast i8 %__U to <8 x i1>
5081 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5082 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
; _mm_mask3_fmaddsub_pd: fmaddsub result merge-masked into __C (the addend);
; expects vfmaddsub231pd writing xmm2 plus a vmovapd to return it in xmm0.
5086 define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5087 ; X86-LABEL: test_mm_mask3_fmaddsub_pd:
5088 ; X86: # %bb.0: # %entry
5089 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5090 ; X86-NEXT: kmovw %eax, %k1
5091 ; X86-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5092 ; X86-NEXT: vmovapd %xmm2, %xmm0
5095 ; X64-LABEL: test_mm_mask3_fmaddsub_pd:
5096 ; X64: # %bb.0: # %entry
5097 ; X64-NEXT: kmovw %edi, %k1
5098 ; X64-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5099 ; X64-NEXT: vmovapd %xmm2, %xmm0
5102 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5103 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5104 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5105 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5106 %4 = bitcast i8 %__U to <8 x i1>
5107 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5108 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
; _mm_maskz_fmaddsub_pd: fmaddsub result zero-masked (select vs.
; zeroinitializer); expects vfmaddsub213pd with the {z} modifier.
5112 define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5113 ; X86-LABEL: test_mm_maskz_fmaddsub_pd:
5114 ; X86: # %bb.0: # %entry
5115 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5116 ; X86-NEXT: kmovw %eax, %k1
5117 ; X86-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5120 ; X64-LABEL: test_mm_maskz_fmaddsub_pd:
5121 ; X64: # %bb.0: # %entry
5122 ; X64-NEXT: kmovw %edi, %k1
5123 ; X64-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5126 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5127 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5128 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
5129 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5130 %4 = bitcast i8 %__U to <8 x i1>
5131 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5132 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
; _mm_maskz_fmsubadd_pd: blended fma(A,B,-C)/fma(A,B,C) result zero-masked;
; expects vfmsubadd213pd with {z}.
5136 define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5137 ; X86-LABEL: test_mm_maskz_fmsubadd_pd:
5138 ; X86: # %bb.0: # %entry
5139 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5140 ; X86-NEXT: kmovw %eax, %k1
5141 ; X86-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5144 ; X64-LABEL: test_mm_maskz_fmsubadd_pd:
5145 ; X64: # %bb.0: # %entry
5146 ; X64-NEXT: kmovw %edi, %k1
5147 ; X64-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5150 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5151 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5152 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5153 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5154 %3 = bitcast i8 %__U to <8 x i1>
5155 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5156 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
; _mm256_mask_fmaddsub_pd: 256-bit fmaddsub merge-masked into __A using the
; low 4 bits of __U; expects a masked vfmaddsub132pd on ymm registers.
5160 define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5161 ; X86-LABEL: test_mm256_mask_fmaddsub_pd:
5162 ; X86: # %bb.0: # %entry
5163 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5164 ; X86-NEXT: kmovw %eax, %k1
5165 ; X86-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5168 ; X64-LABEL: test_mm256_mask_fmaddsub_pd:
5169 ; X64: # %bb.0: # %entry
5170 ; X64-NEXT: kmovw %edi, %k1
5171 ; X64-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5174 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5175 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5176 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5177 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5178 %4 = bitcast i8 %__U to <8 x i1>
5179 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5180 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
; _mm256_mask_fmsubadd_pd: 256-bit fmsubadd merge-masked into __A; expects a
; masked vfmsubadd132pd on ymm registers.
5184 define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5185 ; X86-LABEL: test_mm256_mask_fmsubadd_pd:
5186 ; X86: # %bb.0: # %entry
5187 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5188 ; X86-NEXT: kmovw %eax, %k1
5189 ; X86-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5192 ; X64-LABEL: test_mm256_mask_fmsubadd_pd:
5193 ; X64: # %bb.0: # %entry
5194 ; X64-NEXT: kmovw %edi, %k1
5195 ; X64-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5198 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5199 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5200 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5201 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5202 %3 = bitcast i8 %__U to <8 x i1>
5203 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5204 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
; _mm256_mask3_fmaddsub_pd: fmaddsub result merge-masked into __C; expects
; vfmaddsub231pd writing ymm2 plus a vmovapd into ymm0 for the return.
5208 define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5209 ; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
5210 ; X86: # %bb.0: # %entry
5211 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5212 ; X86-NEXT: kmovw %eax, %k1
5213 ; X86-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5214 ; X86-NEXT: vmovapd %ymm2, %ymm0
5217 ; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
5218 ; X64: # %bb.0: # %entry
5219 ; X64-NEXT: kmovw %edi, %k1
5220 ; X64-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5221 ; X64-NEXT: vmovapd %ymm2, %ymm0
5224 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5225 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5226 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5227 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5228 %4 = bitcast i8 %__U to <8 x i1>
5229 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5230 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
; _mm256_maskz_fmaddsub_pd: 256-bit fmaddsub zero-masked; expects
; vfmaddsub213pd with {z}.
5234 define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
5235 ; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
5236 ; X86: # %bb.0: # %entry
5237 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5238 ; X86-NEXT: kmovw %eax, %k1
5239 ; X86-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5242 ; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
5243 ; X64: # %bb.0: # %entry
5244 ; X64-NEXT: kmovw %edi, %k1
5245 ; X64-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5248 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5249 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5250 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
5251 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5252 %4 = bitcast i8 %__U to <8 x i1>
5253 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5254 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
; _mm256_maskz_fmsubadd_pd: 256-bit fmsubadd zero-masked; expects
; vfmsubadd213pd with {z}.
5258 define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
5259 ; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
5260 ; X86: # %bb.0: # %entry
5261 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5262 ; X86-NEXT: kmovw %eax, %k1
5263 ; X86-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5266 ; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
5267 ; X64: # %bb.0: # %entry
5268 ; X64-NEXT: kmovw %edi, %k1
5269 ; X64-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5272 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5273 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5274 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5275 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5276 %3 = bitcast i8 %__U to <8 x i1>
5277 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5278 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
; _mm_mask_fmaddsub_ps: single-precision 128-bit fmaddsub merge-masked into
; __A using the low 4 bits of __U; expects a masked vfmaddsub132ps.
5282 define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5283 ; X86-LABEL: test_mm_mask_fmaddsub_ps:
5284 ; X86: # %bb.0: # %entry
5285 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5286 ; X86-NEXT: kmovw %eax, %k1
5287 ; X86-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5290 ; X64-LABEL: test_mm_mask_fmaddsub_ps:
5291 ; X64: # %bb.0: # %entry
5292 ; X64-NEXT: kmovw %edi, %k1
5293 ; X64-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
5296 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5297 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5298 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5299 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5300 %4 = bitcast i8 %__U to <8 x i1>
5301 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5302 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
; _mm_mask_fmsubadd_ps: single-precision 128-bit fmsubadd merge-masked into
; __A; expects a masked vfmsubadd132ps.
5306 define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5307 ; X86-LABEL: test_mm_mask_fmsubadd_ps:
5308 ; X86: # %bb.0: # %entry
5309 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5310 ; X86-NEXT: kmovw %eax, %k1
5311 ; X86-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5314 ; X64-LABEL: test_mm_mask_fmsubadd_ps:
5315 ; X64: # %bb.0: # %entry
5316 ; X64-NEXT: kmovw %edi, %k1
5317 ; X64-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
5320 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5321 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5322 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5323 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5324 %3 = bitcast i8 %__U to <8 x i1>
5325 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5326 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
; _mm_mask3_fmaddsub_ps: fmaddsub result merge-masked into __C; expects
; vfmaddsub231ps writing xmm2 plus a vmovaps into xmm0 for the return.
5330 define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5331 ; X86-LABEL: test_mm_mask3_fmaddsub_ps:
5332 ; X86: # %bb.0: # %entry
5333 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5334 ; X86-NEXT: kmovw %eax, %k1
5335 ; X86-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5336 ; X86-NEXT: vmovaps %xmm2, %xmm0
5339 ; X64-LABEL: test_mm_mask3_fmaddsub_ps:
5340 ; X64: # %bb.0: # %entry
5341 ; X64-NEXT: kmovw %edi, %k1
5342 ; X64-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
5343 ; X64-NEXT: vmovaps %xmm2, %xmm0
5346 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5347 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5348 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5349 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5350 %4 = bitcast i8 %__U to <8 x i1>
5351 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5352 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
; _mm_maskz_fmaddsub_ps: single-precision fmaddsub zero-masked; expects
; vfmaddsub213ps with {z}.
5356 define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5357 ; X86-LABEL: test_mm_maskz_fmaddsub_ps:
5358 ; X86: # %bb.0: # %entry
5359 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5360 ; X86-NEXT: kmovw %eax, %k1
5361 ; X86-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5364 ; X64-LABEL: test_mm_maskz_fmaddsub_ps:
5365 ; X64: # %bb.0: # %entry
5366 ; X64-NEXT: kmovw %edi, %k1
5367 ; X64-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
5370 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5371 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5372 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
5373 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5374 %4 = bitcast i8 %__U to <8 x i1>
5375 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5376 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
; _mm_maskz_fmsubadd_ps: single-precision fmsubadd zero-masked; expects
; vfmsubadd213ps with {z}.
5380 define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5381 ; X86-LABEL: test_mm_maskz_fmsubadd_ps:
5382 ; X86: # %bb.0: # %entry
5383 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5384 ; X86-NEXT: kmovw %eax, %k1
5385 ; X86-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5388 ; X64-LABEL: test_mm_maskz_fmsubadd_ps:
5389 ; X64: # %bb.0: # %entry
5390 ; X64-NEXT: kmovw %edi, %k1
5391 ; X64-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
5394 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5395 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5396 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5397 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5398 %3 = bitcast i8 %__U to <8 x i1>
5399 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5400 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
; _mm256_mask_fmaddsub_ps: 8-lane fmaddsub merge-masked into __A; all 8 bits
; of __U are used (no mask-subvector extract). Expects masked vfmaddsub132ps.
5404 define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5405 ; X86-LABEL: test_mm256_mask_fmaddsub_ps:
5406 ; X86: # %bb.0: # %entry
5407 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5408 ; X86-NEXT: kmovw %eax, %k1
5409 ; X86-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5412 ; X64-LABEL: test_mm256_mask_fmaddsub_ps:
5413 ; X64: # %bb.0: # %entry
5414 ; X64-NEXT: kmovw %edi, %k1
5415 ; X64-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
5418 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5419 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5420 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5421 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5422 %4 = bitcast i8 %__U to <8 x i1>
5423 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
; _mm256_mask_fmsubadd_ps: 8-lane fmsubadd merge-masked into __A using the
; full i8 mask; expects masked vfmsubadd132ps.
5427 define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5428 ; X86-LABEL: test_mm256_mask_fmsubadd_ps:
5429 ; X86: # %bb.0: # %entry
5430 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5431 ; X86-NEXT: kmovw %eax, %k1
5432 ; X86-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5435 ; X64-LABEL: test_mm256_mask_fmsubadd_ps:
5436 ; X64: # %bb.0: # %entry
5437 ; X64-NEXT: kmovw %edi, %k1
5438 ; X64-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
5441 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5442 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5443 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5444 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5445 %3 = bitcast i8 %__U to <8 x i1>
5446 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
; _mm256_mask3_fmaddsub_ps: 8-lane fmaddsub merge-masked into __C; expects
; vfmaddsub231ps writing ymm2 plus a vmovaps into ymm0 for the return.
5450 define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5451 ; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
5452 ; X86: # %bb.0: # %entry
5453 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5454 ; X86-NEXT: kmovw %eax, %k1
5455 ; X86-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5456 ; X86-NEXT: vmovaps %ymm2, %ymm0
5459 ; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
5460 ; X64: # %bb.0: # %entry
5461 ; X64-NEXT: kmovw %edi, %k1
5462 ; X64-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
5463 ; X64-NEXT: vmovaps %ymm2, %ymm0
5466 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5467 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5468 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5469 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5470 %4 = bitcast i8 %__U to <8 x i1>
5471 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
; _mm256_maskz_fmaddsub_ps: 8-lane fmaddsub zero-masked; expects
; vfmaddsub213ps with {z}.
5475 define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5476 ; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
5477 ; X86: # %bb.0: # %entry
5478 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5479 ; X86-NEXT: kmovw %eax, %k1
5480 ; X86-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5483 ; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
5484 ; X64: # %bb.0: # %entry
5485 ; X64-NEXT: kmovw %edi, %k1
5486 ; X64-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
5489 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5490 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5491 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
5492 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5493 %4 = bitcast i8 %__U to <8 x i1>
5494 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
; _mm256_maskz_fmsubadd_ps: 8-lane fmsubadd zero-masked; expects
; vfmsubadd213ps with {z}.
5498 define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
5499 ; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
5500 ; X86: # %bb.0: # %entry
5501 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5502 ; X86-NEXT: kmovw %eax, %k1
5503 ; X86-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5506 ; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
5507 ; X64: # %bb.0: # %entry
5508 ; X64-NEXT: kmovw %edi, %k1
5509 ; X64-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
5512 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5513 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5514 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5515 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5516 %3 = bitcast i8 %__U to <8 x i1>
5517 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
; _mm_mask3_fmsub_pd: fma(A, B, -C) merge-masked into __C; expects
; vfmsub231pd writing xmm2 plus a vmovapd into xmm0 for the return.
5521 define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5522 ; X86-LABEL: test_mm_mask3_fmsub_pd:
5523 ; X86: # %bb.0: # %entry
5524 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5525 ; X86-NEXT: kmovw %eax, %k1
5526 ; X86-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5527 ; X86-NEXT: vmovapd %xmm2, %xmm0
5530 ; X64-LABEL: test_mm_mask3_fmsub_pd:
5531 ; X64: # %bb.0: # %entry
5532 ; X64-NEXT: kmovw %edi, %k1
5533 ; X64-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5534 ; X64-NEXT: vmovapd %xmm2, %xmm0
5537 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5538 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5539 %1 = bitcast i8 %__U to <8 x i1>
5540 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5541 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
; _mm256_mask3_fmsub_pd: 256-bit fma(A, B, -C) merge-masked into __C; expects
; vfmsub231pd writing ymm2 plus a vmovapd into ymm0.
5545 define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5546 ; X86-LABEL: test_mm256_mask3_fmsub_pd:
5547 ; X86: # %bb.0: # %entry
5548 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5549 ; X86-NEXT: kmovw %eax, %k1
5550 ; X86-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5551 ; X86-NEXT: vmovapd %ymm2, %ymm0
5554 ; X64-LABEL: test_mm256_mask3_fmsub_pd:
5555 ; X64: # %bb.0: # %entry
5556 ; X64-NEXT: kmovw %edi, %k1
5557 ; X64-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5558 ; X64-NEXT: vmovapd %ymm2, %ymm0
5561 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5562 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5563 %1 = bitcast i8 %__U to <8 x i1>
5564 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5565 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
; _mm_mask3_fmsub_ps: fma(A, B, -C) merge-masked into __C; expects
; vfmsub231ps writing xmm2 plus a vmovaps into xmm0.
5569 define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5570 ; X86-LABEL: test_mm_mask3_fmsub_ps:
5571 ; X86: # %bb.0: # %entry
5572 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5573 ; X86-NEXT: kmovw %eax, %k1
5574 ; X86-NEXT: vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5575 ; X86-NEXT: vmovaps %xmm2, %xmm0
5578 ; X64-LABEL: test_mm_mask3_fmsub_ps:
5579 ; X64: # %bb.0: # %entry
5580 ; X64-NEXT: kmovw %edi, %k1
5581 ; X64-NEXT: vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5582 ; X64-NEXT: vmovaps %xmm2, %xmm0
5585 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5586 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5587 %1 = bitcast i8 %__U to <8 x i1>
5588 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5589 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
; _mm256_mask3_fmsub_ps: 8-lane fma(A, B, -C) merge-masked into __C using the
; full i8 mask; expects vfmsub231ps writing ymm2 plus a vmovaps into ymm0.
5593 define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5594 ; X86-LABEL: test_mm256_mask3_fmsub_ps:
5595 ; X86: # %bb.0: # %entry
5596 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5597 ; X86-NEXT: kmovw %eax, %k1
5598 ; X86-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5599 ; X86-NEXT: vmovaps %ymm2, %ymm0
5602 ; X64-LABEL: test_mm256_mask3_fmsub_ps:
5603 ; X64: # %bb.0: # %entry
5604 ; X64-NEXT: kmovw %edi, %k1
5605 ; X64-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2
5606 ; X64-NEXT: vmovaps %ymm2, %ymm0
5609 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5610 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5611 %1 = bitcast i8 %__U to <8 x i1>
5612 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
; _mm_mask3_fmsubadd_pd: fmsubadd blend merge-masked into __C; expects
; vfmsubadd231pd writing xmm2 plus a vmovapd into xmm0.
5616 define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5617 ; X86-LABEL: test_mm_mask3_fmsubadd_pd:
5618 ; X86: # %bb.0: # %entry
5619 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5620 ; X86-NEXT: kmovw %eax, %k1
5621 ; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5622 ; X86-NEXT: vmovapd %xmm2, %xmm0
5625 ; X64-LABEL: test_mm_mask3_fmsubadd_pd:
5626 ; X64: # %bb.0: # %entry
5627 ; X64-NEXT: kmovw %edi, %k1
5628 ; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5629 ; X64-NEXT: vmovapd %xmm2, %xmm0
5632 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5633 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
5634 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
5635 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
5636 %3 = bitcast i8 %__U to <8 x i1>
5637 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5638 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
; _mm256_mask3_fmsubadd_pd: 256-bit fmsubadd blend merge-masked into __C;
; expects vfmsubadd231pd writing ymm2 plus a vmovapd into ymm0.
5642 define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5643 ; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
5644 ; X86: # %bb.0: # %entry
5645 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5646 ; X86-NEXT: kmovw %eax, %k1
5647 ; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5648 ; X86-NEXT: vmovapd %ymm2, %ymm0
5651 ; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
5652 ; X64: # %bb.0: # %entry
5653 ; X64-NEXT: kmovw %edi, %k1
5654 ; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5655 ; X64-NEXT: vmovapd %ymm2, %ymm0
5658 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5659 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
5660 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
5661 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5662 %3 = bitcast i8 %__U to <8 x i1>
5663 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5664 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
; _mm_mask3_fmsubadd_ps: single-precision fmsubadd blend merge-masked into
; __C; expects vfmsubadd231ps writing xmm2 plus a vmovaps into xmm0.
5668 define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5669 ; X86-LABEL: test_mm_mask3_fmsubadd_ps:
5670 ; X86: # %bb.0: # %entry
5671 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5672 ; X86-NEXT: kmovw %eax, %k1
5673 ; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5674 ; X86-NEXT: vmovaps %xmm2, %xmm0
5677 ; X64-LABEL: test_mm_mask3_fmsubadd_ps:
5678 ; X64: # %bb.0: # %entry
5679 ; X64-NEXT: kmovw %edi, %k1
5680 ; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
5681 ; X64-NEXT: vmovaps %xmm2, %xmm0
5684 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5685 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
5686 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
5687 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
5688 %3 = bitcast i8 %__U to <8 x i1>
5689 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5690 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
; _mm256_mask3_fmsubadd_ps: 8-lane fmsubadd blend merge-masked into __C using
; the full i8 mask; expects vfmsubadd231ps writing ymm2 plus a vmovaps.
5694 define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5695 ; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
5696 ; X86: # %bb.0: # %entry
5697 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5698 ; X86-NEXT: kmovw %eax, %k1
5699 ; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5700 ; X86-NEXT: vmovaps %ymm2, %ymm0
5703 ; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
5704 ; X64: # %bb.0: # %entry
5705 ; X64-NEXT: kmovw %edi, %k1
5706 ; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
5707 ; X64-NEXT: vmovaps %ymm2, %ymm0
5710 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5711 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
5712 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
5713 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
5714 %3 = bitcast i8 %__U to <8 x i1>
5715 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
; _mm_mask_fnmadd_pd: fma(A, -B, C) merge-masked into __A using the low
; 2 bits of __U; expects a masked vfnmadd132pd.
5719 define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5720 ; X86-LABEL: test_mm_mask_fnmadd_pd:
5721 ; X86: # %bb.0: # %entry
5722 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5723 ; X86-NEXT: kmovw %eax, %k1
5724 ; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5727 ; X64-LABEL: test_mm_mask_fnmadd_pd:
5728 ; X64: # %bb.0: # %entry
5729 ; X64-NEXT: kmovw %edi, %k1
5730 ; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5733 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5734 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
5735 %1 = bitcast i8 %__U to <8 x i1>
5736 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5737 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
; _mm256_mask_fnmadd_pd: 256-bit fma(A, -B, C) merge-masked into __A using
; the low 4 bits of __U; expects a masked vfnmadd132pd.
5741 define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5742 ; X86-LABEL: test_mm256_mask_fnmadd_pd:
5743 ; X86: # %bb.0: # %entry
5744 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5745 ; X86-NEXT: kmovw %eax, %k1
5746 ; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5749 ; X64-LABEL: test_mm256_mask_fnmadd_pd:
5750 ; X64: # %bb.0: # %entry
5751 ; X64-NEXT: kmovw %edi, %k1
5752 ; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5755 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5756 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
5757 %1 = bitcast i8 %__U to <8 x i1>
5758 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5759 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
5763 define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5764 ; X86-LABEL: test_mm_mask_fnmadd_ps:
5765 ; X86: # %bb.0: # %entry
5766 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5767 ; X86-NEXT: kmovw %eax, %k1
5768 ; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5771 ; X64-LABEL: test_mm_mask_fnmadd_ps:
5772 ; X64: # %bb.0: # %entry
5773 ; X64-NEXT: kmovw %edi, %k1
5774 ; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
5777 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5778 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
5779 %1 = bitcast i8 %__U to <8 x i1>
5780 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5781 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
5785 define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5786 ; X86-LABEL: test_mm256_mask_fnmadd_ps:
5787 ; X86: # %bb.0: # %entry
5788 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5789 ; X86-NEXT: kmovw %eax, %k1
5790 ; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5793 ; X64-LABEL: test_mm256_mask_fnmadd_ps:
5794 ; X64: # %bb.0: # %entry
5795 ; X64-NEXT: kmovw %edi, %k1
5796 ; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
5799 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5800 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
5801 %1 = bitcast i8 %__U to <8 x i1>
5802 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
5806 define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
5807 ; X86-LABEL: test_mm_mask_fnmsub_pd:
5808 ; X86: # %bb.0: # %entry
5809 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5810 ; X86-NEXT: kmovw %eax, %k1
5811 ; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5814 ; X64-LABEL: test_mm_mask_fnmsub_pd:
5815 ; X64: # %bb.0: # %entry
5816 ; X64-NEXT: kmovw %edi, %k1
5817 ; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5820 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5821 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5822 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5823 %1 = bitcast i8 %__U to <8 x i1>
5824 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5825 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
5829 define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
5830 ; X86-LABEL: test_mm_mask3_fnmsub_pd:
5831 ; X86: # %bb.0: # %entry
5832 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5833 ; X86-NEXT: kmovw %eax, %k1
5834 ; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5835 ; X86-NEXT: vmovapd %xmm2, %xmm0
5838 ; X64-LABEL: test_mm_mask3_fnmsub_pd:
5839 ; X64: # %bb.0: # %entry
5840 ; X64-NEXT: kmovw %edi, %k1
5841 ; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5842 ; X64-NEXT: vmovapd %xmm2, %xmm0
5845 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
5846 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
5847 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
5848 %1 = bitcast i8 %__U to <8 x i1>
5849 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
5850 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
5854 define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
5855 ; X86-LABEL: test_mm256_mask_fnmsub_pd:
5856 ; X86: # %bb.0: # %entry
5857 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5858 ; X86-NEXT: kmovw %eax, %k1
5859 ; X86-NEXT: vfnmsub132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5862 ; X64-LABEL: test_mm256_mask_fnmsub_pd:
5863 ; X64: # %bb.0: # %entry
5864 ; X64-NEXT: kmovw %edi, %k1
5865 ; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5868 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5869 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5870 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5871 %1 = bitcast i8 %__U to <8 x i1>
5872 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5873 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
5877 define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
5878 ; X86-LABEL: test_mm256_mask3_fnmsub_pd:
5879 ; X86: # %bb.0: # %entry
5880 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5881 ; X86-NEXT: kmovw %eax, %k1
5882 ; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5883 ; X86-NEXT: vmovapd %ymm2, %ymm0
5886 ; X64-LABEL: test_mm256_mask3_fnmsub_pd:
5887 ; X64: # %bb.0: # %entry
5888 ; X64-NEXT: kmovw %edi, %k1
5889 ; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5890 ; X64-NEXT: vmovapd %ymm2, %ymm0
5893 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
5894 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
5895 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
5896 %1 = bitcast i8 %__U to <8 x i1>
5897 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5898 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
5902 define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
5903 ; X86-LABEL: test_mm_mask_fnmsub_ps:
5904 ; X86: # %bb.0: # %entry
5905 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5906 ; X86-NEXT: kmovw %eax, %k1
5907 ; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5910 ; X64-LABEL: test_mm_mask_fnmsub_ps:
5911 ; X64: # %bb.0: # %entry
5912 ; X64-NEXT: kmovw %edi, %k1
5913 ; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
5916 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5917 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5918 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5919 %1 = bitcast i8 %__U to <8 x i1>
5920 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5921 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
5925 define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
5926 ; X86-LABEL: test_mm_mask3_fnmsub_ps:
5927 ; X86: # %bb.0: # %entry
5928 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5929 ; X86-NEXT: kmovw %eax, %k1
5930 ; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5931 ; X86-NEXT: vmovaps %xmm2, %xmm0
5934 ; X64-LABEL: test_mm_mask3_fnmsub_ps:
5935 ; X64: # %bb.0: # %entry
5936 ; X64-NEXT: kmovw %edi, %k1
5937 ; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5938 ; X64-NEXT: vmovaps %xmm2, %xmm0
5941 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5942 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5943 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
5944 %1 = bitcast i8 %__U to <8 x i1>
5945 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
5946 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
5950 define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
5951 ; X86-LABEL: test_mm256_mask_fnmsub_ps:
5952 ; X86: # %bb.0: # %entry
5953 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5954 ; X86-NEXT: kmovw %eax, %k1
5955 ; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5958 ; X64-LABEL: test_mm256_mask_fnmsub_ps:
5959 ; X64: # %bb.0: # %entry
5960 ; X64-NEXT: kmovw %edi, %k1
5961 ; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
5964 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5965 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5966 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5967 %1 = bitcast i8 %__U to <8 x i1>
5968 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
5972 define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
5973 ; X86-LABEL: test_mm256_mask3_fnmsub_ps:
5974 ; X86: # %bb.0: # %entry
5975 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5976 ; X86-NEXT: kmovw %eax, %k1
5977 ; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5978 ; X86-NEXT: vmovaps %ymm2, %ymm0
5981 ; X64-LABEL: test_mm256_mask3_fnmsub_ps:
5982 ; X64: # %bb.0: # %entry
5983 ; X64-NEXT: kmovw %edi, %k1
5984 ; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
5985 ; X64-NEXT: vmovaps %ymm2, %ymm0
5988 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
5989 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
5990 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
5991 %1 = bitcast i8 %__U to <8 x i1>
5992 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
5996 define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, ptr readonly %__P) {
5997 ; X86-LABEL: test_mm_mask_expandloadu_pd:
5998 ; X86: # %bb.0: # %entry
5999 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6000 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6001 ; X86-NEXT: kmovw %ecx, %k1
6002 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1}
6005 ; X64-LABEL: test_mm_mask_expandloadu_pd:
6006 ; X64: # %bb.0: # %entry
6007 ; X64-NEXT: kmovw %edi, %k1
6008 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1}
6011 %0 = bitcast i8 %__U to <8 x i1>
6012 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6013 %1 = tail call <2 x double> @llvm.masked.expandload.v2f64(ptr %__P, <2 x i1> %extract.i, <2 x double> %__W)
6017 define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, ptr readonly %__P) {
6018 ; X86-LABEL: test_mm_maskz_expandloadu_pd:
6019 ; X86: # %bb.0: # %entry
6020 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6021 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6022 ; X86-NEXT: kmovw %ecx, %k1
6023 ; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z}
6026 ; X64-LABEL: test_mm_maskz_expandloadu_pd:
6027 ; X64: # %bb.0: # %entry
6028 ; X64-NEXT: kmovw %edi, %k1
6029 ; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z}
6032 %0 = bitcast i8 %__U to <8 x i1>
6033 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6034 %1 = tail call <2 x double> @llvm.masked.expandload.v2f64(ptr %__P, <2 x i1> %extract.i, <2 x double> zeroinitializer)
6038 define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, ptr readonly %__P) {
6039 ; X86-LABEL: test_mm256_mask_expandloadu_pd:
6040 ; X86: # %bb.0: # %entry
6041 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6042 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6043 ; X86-NEXT: kmovw %ecx, %k1
6044 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1}
6047 ; X64-LABEL: test_mm256_mask_expandloadu_pd:
6048 ; X64: # %bb.0: # %entry
6049 ; X64-NEXT: kmovw %edi, %k1
6050 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1}
6053 %0 = bitcast i8 %__U to <8 x i1>
6054 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6055 %1 = tail call <4 x double> @llvm.masked.expandload.v4f64(ptr %__P, <4 x i1> %extract.i, <4 x double> %__W)
6059 define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, ptr readonly %__P) {
6060 ; X86-LABEL: test_mm256_maskz_expandloadu_pd:
6061 ; X86: # %bb.0: # %entry
6062 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6063 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6064 ; X86-NEXT: kmovw %ecx, %k1
6065 ; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z}
6068 ; X64-LABEL: test_mm256_maskz_expandloadu_pd:
6069 ; X64: # %bb.0: # %entry
6070 ; X64-NEXT: kmovw %edi, %k1
6071 ; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z}
6074 %0 = bitcast i8 %__U to <8 x i1>
6075 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6076 %1 = tail call <4 x double> @llvm.masked.expandload.v4f64(ptr %__P, <4 x i1> %extract.i, <4 x double> zeroinitializer)
6080 define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
6081 ; X86-LABEL: test_mm_mask_expandloadu_epi64:
6082 ; X86: # %bb.0: # %entry
6083 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6084 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6085 ; X86-NEXT: kmovw %ecx, %k1
6086 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1}
6089 ; X64-LABEL: test_mm_mask_expandloadu_epi64:
6090 ; X64: # %bb.0: # %entry
6091 ; X64-NEXT: kmovw %edi, %k1
6092 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1}
6095 %0 = bitcast i8 %__U to <8 x i1>
6096 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6097 %1 = tail call <2 x i64> @llvm.masked.expandload.v2i64(ptr %__P, <2 x i1> %extract.i, <2 x i64> %__W) #10
6101 define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, ptr readonly %__P) {
6102 ; X86-LABEL: test_mm_maskz_expandloadu_epi64:
6103 ; X86: # %bb.0: # %entry
6104 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6105 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6106 ; X86-NEXT: kmovw %ecx, %k1
6107 ; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z}
6110 ; X64-LABEL: test_mm_maskz_expandloadu_epi64:
6111 ; X64: # %bb.0: # %entry
6112 ; X64-NEXT: kmovw %edi, %k1
6113 ; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z}
6116 %0 = bitcast i8 %__U to <8 x i1>
6117 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6118 %1 = tail call <2 x i64> @llvm.masked.expandload.v2i64(ptr %__P, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
6122 define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
6123 ; X86-LABEL: test_mm256_mask_expandloadu_epi64:
6124 ; X86: # %bb.0: # %entry
6125 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6126 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6127 ; X86-NEXT: kmovw %ecx, %k1
6128 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1}
6131 ; X64-LABEL: test_mm256_mask_expandloadu_epi64:
6132 ; X64: # %bb.0: # %entry
6133 ; X64-NEXT: kmovw %edi, %k1
6134 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1}
6137 %0 = bitcast i8 %__U to <8 x i1>
6138 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6139 %1 = tail call <4 x i64> @llvm.masked.expandload.v4i64(ptr %__P, <4 x i1> %extract.i, <4 x i64> %__W) #10
6143 define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, ptr readonly %__P) {
6144 ; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
6145 ; X86: # %bb.0: # %entry
6146 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6147 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6148 ; X86-NEXT: kmovw %ecx, %k1
6149 ; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z}
6152 ; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
6153 ; X64: # %bb.0: # %entry
6154 ; X64-NEXT: kmovw %edi, %k1
6155 ; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z}
6158 %0 = bitcast i8 %__U to <8 x i1>
6159 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6160 %1 = tail call <4 x i64> @llvm.masked.expandload.v4i64(ptr %__P, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
6164 define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, ptr readonly %__P) {
6165 ; X86-LABEL: test_mm_mask_expandloadu_ps:
6166 ; X86: # %bb.0: # %entry
6167 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6168 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6169 ; X86-NEXT: kmovw %ecx, %k1
6170 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1}
6173 ; X64-LABEL: test_mm_mask_expandloadu_ps:
6174 ; X64: # %bb.0: # %entry
6175 ; X64-NEXT: kmovw %edi, %k1
6176 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1}
6179 %0 = bitcast i8 %__U to <8 x i1>
6180 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6181 %1 = tail call <4 x float> @llvm.masked.expandload.v4f32(ptr %__P, <4 x i1> %extract.i, <4 x float> %__W)
6185 define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, ptr readonly %__P) {
6186 ; X86-LABEL: test_mm_maskz_expandloadu_ps:
6187 ; X86: # %bb.0: # %entry
6188 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6189 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6190 ; X86-NEXT: kmovw %ecx, %k1
6191 ; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z}
6194 ; X64-LABEL: test_mm_maskz_expandloadu_ps:
6195 ; X64: # %bb.0: # %entry
6196 ; X64-NEXT: kmovw %edi, %k1
6197 ; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z}
6200 %0 = bitcast i8 %__U to <8 x i1>
6201 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6202 %1 = tail call <4 x float> @llvm.masked.expandload.v4f32(ptr %__P, <4 x i1> %extract.i, <4 x float> zeroinitializer)
6206 define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, ptr readonly %__P) {
6207 ; X86-LABEL: test_mm256_mask_expandloadu_ps:
6208 ; X86: # %bb.0: # %entry
6209 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6210 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6211 ; X86-NEXT: kmovw %ecx, %k1
6212 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1}
6215 ; X64-LABEL: test_mm256_mask_expandloadu_ps:
6216 ; X64: # %bb.0: # %entry
6217 ; X64-NEXT: kmovw %edi, %k1
6218 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1}
6221 %0 = bitcast i8 %__U to <8 x i1>
6222 %1 = tail call <8 x float> @llvm.masked.expandload.v8f32(ptr %__P, <8 x i1> %0, <8 x float> %__W)
6226 define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, ptr readonly %__P) {
6227 ; X86-LABEL: test_mm256_maskz_expandloadu_ps:
6228 ; X86: # %bb.0: # %entry
6229 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6230 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6231 ; X86-NEXT: kmovw %ecx, %k1
6232 ; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z}
6235 ; X64-LABEL: test_mm256_maskz_expandloadu_ps:
6236 ; X64: # %bb.0: # %entry
6237 ; X64-NEXT: kmovw %edi, %k1
6238 ; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z}
6241 %0 = bitcast i8 %__U to <8 x i1>
6242 %1 = tail call <8 x float> @llvm.masked.expandload.v8f32(ptr %__P, <8 x i1> %0, <8 x float> zeroinitializer)
6246 define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
6247 ; X86-LABEL: test_mm_mask_expandloadu_epi32:
6248 ; X86: # %bb.0: # %entry
6249 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6250 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6251 ; X86-NEXT: kmovw %ecx, %k1
6252 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1}
6255 ; X64-LABEL: test_mm_mask_expandloadu_epi32:
6256 ; X64: # %bb.0: # %entry
6257 ; X64-NEXT: kmovw %edi, %k1
6258 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1}
6261 %0 = bitcast <2 x i64> %__W to <4 x i32>
6262 %1 = bitcast i8 %__U to <8 x i1>
6263 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6264 %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(ptr %__P, <4 x i1> %extract.i, <4 x i32> %0)
6265 %3 = bitcast <4 x i32> %2 to <2 x i64>
6269 define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, ptr readonly %__P) {
6270 ; X86-LABEL: test_mm_maskz_expandloadu_epi32:
6271 ; X86: # %bb.0: # %entry
6272 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6273 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6274 ; X86-NEXT: kmovw %ecx, %k1
6275 ; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z}
6278 ; X64-LABEL: test_mm_maskz_expandloadu_epi32:
6279 ; X64: # %bb.0: # %entry
6280 ; X64-NEXT: kmovw %edi, %k1
6281 ; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z}
6284 %0 = bitcast i8 %__U to <8 x i1>
6285 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6286 %1 = tail call <4 x i32> @llvm.masked.expandload.v4i32(ptr %__P, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
6287 %2 = bitcast <4 x i32> %1 to <2 x i64>
6291 define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
6292 ; X86-LABEL: test_mm256_mask_expandloadu_epi32:
6293 ; X86: # %bb.0: # %entry
6294 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6295 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6296 ; X86-NEXT: kmovw %ecx, %k1
6297 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1}
6300 ; X64-LABEL: test_mm256_mask_expandloadu_epi32:
6301 ; X64: # %bb.0: # %entry
6302 ; X64-NEXT: kmovw %edi, %k1
6303 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1}
6306 %0 = bitcast <4 x i64> %__W to <8 x i32>
6307 %1 = bitcast i8 %__U to <8 x i1>
6308 %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(ptr %__P, <8 x i1> %1, <8 x i32> %0)
6309 %3 = bitcast <8 x i32> %2 to <4 x i64>
6313 define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, ptr readonly %__P) {
6314 ; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
6315 ; X86: # %bb.0: # %entry
6316 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6317 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6318 ; X86-NEXT: kmovw %ecx, %k1
6319 ; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z}
6322 ; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
6323 ; X64: # %bb.0: # %entry
6324 ; X64-NEXT: kmovw %edi, %k1
6325 ; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z}
6328 %0 = bitcast i8 %__U to <8 x i1>
6329 %1 = tail call <8 x i32> @llvm.masked.expandload.v8i32(ptr %__P, <8 x i1> %0, <8 x i32> zeroinitializer)
6330 %2 = bitcast <8 x i32> %1 to <4 x i64>
6334 define void @test_mm_mask_compressstoreu_pd(ptr %__P, i8 zeroext %__U, <2 x double> %__A) {
6335 ; X86-LABEL: test_mm_mask_compressstoreu_pd:
6336 ; X86: # %bb.0: # %entry
6337 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6338 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6339 ; X86-NEXT: kmovw %eax, %k1
6340 ; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1}
6343 ; X64-LABEL: test_mm_mask_compressstoreu_pd:
6344 ; X64: # %bb.0: # %entry
6345 ; X64-NEXT: kmovw %esi, %k1
6346 ; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1}
6349 %0 = bitcast i8 %__U to <8 x i1>
6350 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6351 tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, ptr %__P, <2 x i1> %extract.i)
6355 define void @test_mm256_mask_compressstoreu_pd(ptr %__P, i8 zeroext %__U, <4 x double> %__A) {
6356 ; X86-LABEL: test_mm256_mask_compressstoreu_pd:
6357 ; X86: # %bb.0: # %entry
6358 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6359 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6360 ; X86-NEXT: kmovw %eax, %k1
6361 ; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1}
6362 ; X86-NEXT: vzeroupper
6365 ; X64-LABEL: test_mm256_mask_compressstoreu_pd:
6366 ; X64: # %bb.0: # %entry
6367 ; X64-NEXT: kmovw %esi, %k1
6368 ; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1}
6369 ; X64-NEXT: vzeroupper
6372 %0 = bitcast i8 %__U to <8 x i1>
6373 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6374 tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, ptr %__P, <4 x i1> %extract.i)
6378 define void @test_mm_mask_compressstoreu_epi64(ptr %__P, i8 zeroext %__U, <2 x i64> %__A) {
6379 ; X86-LABEL: test_mm_mask_compressstoreu_epi64:
6380 ; X86: # %bb.0: # %entry
6381 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6382 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6383 ; X86-NEXT: kmovw %eax, %k1
6384 ; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1}
6387 ; X64-LABEL: test_mm_mask_compressstoreu_epi64:
6388 ; X64: # %bb.0: # %entry
6389 ; X64-NEXT: kmovw %esi, %k1
6390 ; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
6393 %0 = bitcast i8 %__U to <8 x i1>
6394 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6395 tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, ptr %__P, <2 x i1> %extract.i)
6399 define void @test_mm256_mask_compressstoreu_epi64(ptr %__P, i8 zeroext %__U, <4 x i64> %__A) {
6400 ; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
6401 ; X86: # %bb.0: # %entry
6402 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6403 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6404 ; X86-NEXT: kmovw %eax, %k1
6405 ; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1}
6406 ; X86-NEXT: vzeroupper
6409 ; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
6410 ; X64: # %bb.0: # %entry
6411 ; X64-NEXT: kmovw %esi, %k1
6412 ; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
6413 ; X64-NEXT: vzeroupper
6416 %0 = bitcast i8 %__U to <8 x i1>
6417 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6418 tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, ptr %__P, <4 x i1> %extract.i)
6422 define void @test_mm_mask_compressstoreu_ps(ptr %__P, i8 zeroext %__U, <4 x float> %__A) {
6423 ; X86-LABEL: test_mm_mask_compressstoreu_ps:
6424 ; X86: # %bb.0: # %entry
6425 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6426 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6427 ; X86-NEXT: kmovw %eax, %k1
6428 ; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1}
6431 ; X64-LABEL: test_mm_mask_compressstoreu_ps:
6432 ; X64: # %bb.0: # %entry
6433 ; X64-NEXT: kmovw %esi, %k1
6434 ; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1}
6437 %0 = bitcast i8 %__U to <8 x i1>
6438 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6439 tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, ptr %__P, <4 x i1> %extract.i)
6443 define void @test_mm256_mask_compressstoreu_ps(ptr %__P, i8 zeroext %__U, <8 x float> %__A) {
6444 ; X86-LABEL: test_mm256_mask_compressstoreu_ps:
6445 ; X86: # %bb.0: # %entry
6446 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6447 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6448 ; X86-NEXT: kmovw %eax, %k1
6449 ; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1}
6450 ; X86-NEXT: vzeroupper
6453 ; X64-LABEL: test_mm256_mask_compressstoreu_ps:
6454 ; X64: # %bb.0: # %entry
6455 ; X64-NEXT: kmovw %esi, %k1
6456 ; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1}
6457 ; X64-NEXT: vzeroupper
6460 %0 = bitcast i8 %__U to <8 x i1>
6461 tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, ptr %__P, <8 x i1> %0)
6465 define void @test_mm_mask_compressstoreu_epi32(ptr %__P, i8 zeroext %__U, <2 x i64> %__A) {
6466 ; X86-LABEL: test_mm_mask_compressstoreu_epi32:
6467 ; X86: # %bb.0: # %entry
6468 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6469 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6470 ; X86-NEXT: kmovw %eax, %k1
6471 ; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1}
6474 ; X64-LABEL: test_mm_mask_compressstoreu_epi32:
6475 ; X64: # %bb.0: # %entry
6476 ; X64-NEXT: kmovw %esi, %k1
6477 ; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
6480 %0 = bitcast <2 x i64> %__A to <4 x i32>
6481 %1 = bitcast i8 %__U to <8 x i1>
6482 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6483 tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, ptr %__P, <4 x i1> %extract.i)
6487 define void @test_mm256_mask_compressstoreu_epi32(ptr %__P, i8 zeroext %__U, <4 x i64> %__A) {
6488 ; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
6489 ; X86: # %bb.0: # %entry
6490 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6491 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6492 ; X86-NEXT: kmovw %eax, %k1
6493 ; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1}
6494 ; X86-NEXT: vzeroupper
6497 ; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
6498 ; X64: # %bb.0: # %entry
6499 ; X64-NEXT: kmovw %esi, %k1
6500 ; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1}
6501 ; X64-NEXT: vzeroupper
6504 %0 = bitcast <4 x i64> %__A to <8 x i32>
6505 %1 = bitcast i8 %__U to <8 x i1>
6506 tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, ptr %__P, <8 x i1> %1) #10
6511 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
6512 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
6513 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
6514 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
6516 define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
6517 ; X86-LABEL: test_mm_mask_sqrt_pd:
6518 ; X86: # %bb.0: # %entry
6519 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6520 ; X86-NEXT: kmovw %eax, %k1
6521 ; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
6524 ; X64-LABEL: test_mm_mask_sqrt_pd:
6525 ; X64: # %bb.0: # %entry
6526 ; X64-NEXT: kmovw %edi, %k1
6527 ; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
6530 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6531 %1 = bitcast i8 %__U to <8 x i1>
6532 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6533 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
6537 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
6539 define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
6540 ; X86-LABEL: test_mm_maskz_sqrt_pd:
6541 ; X86: # %bb.0: # %entry
6542 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6543 ; X86-NEXT: kmovw %eax, %k1
6544 ; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
6547 ; X64-LABEL: test_mm_maskz_sqrt_pd:
6548 ; X64: # %bb.0: # %entry
6549 ; X64-NEXT: kmovw %edi, %k1
6550 ; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
6553 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
6554 %1 = bitcast i8 %__U to <8 x i1>
6555 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6556 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
6560 define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
6561 ; X86-LABEL: test_mm256_mask_sqrt_pd:
6562 ; X86: # %bb.0: # %entry
6563 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6564 ; X86-NEXT: kmovw %eax, %k1
6565 ; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
6568 ; X64-LABEL: test_mm256_mask_sqrt_pd:
6569 ; X64: # %bb.0: # %entry
6570 ; X64-NEXT: kmovw %edi, %k1
6571 ; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
6574 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6575 %1 = bitcast i8 %__U to <8 x i1>
6576 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6577 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
6581 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
6583 define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
6584 ; X86-LABEL: test_mm256_maskz_sqrt_pd:
6585 ; X86: # %bb.0: # %entry
6586 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6587 ; X86-NEXT: kmovw %eax, %k1
6588 ; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
6591 ; X64-LABEL: test_mm256_maskz_sqrt_pd:
6592 ; X64: # %bb.0: # %entry
6593 ; X64-NEXT: kmovw %edi, %k1
6594 ; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
6597 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
6598 %1 = bitcast i8 %__U to <8 x i1>
6599 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6600 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
6604 define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
6605 ; X86-LABEL: test_mm_mask_sqrt_ps:
6606 ; X86: # %bb.0: # %entry
6607 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6608 ; X86-NEXT: kmovw %eax, %k1
6609 ; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
6612 ; X64-LABEL: test_mm_mask_sqrt_ps:
6613 ; X64: # %bb.0: # %entry
6614 ; X64-NEXT: kmovw %edi, %k1
6615 ; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
6618 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6619 %1 = bitcast i8 %__U to <8 x i1>
6620 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6621 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
6625 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
6627 define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
6628 ; X86-LABEL: test_mm_maskz_sqrt_ps:
6629 ; X86: # %bb.0: # %entry
6630 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6631 ; X86-NEXT: kmovw %eax, %k1
6632 ; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
6635 ; X64-LABEL: test_mm_maskz_sqrt_ps:
6636 ; X64: # %bb.0: # %entry
6637 ; X64-NEXT: kmovw %edi, %k1
6638 ; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
6641 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
6642 %1 = bitcast i8 %__U to <8 x i1>
6643 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6644 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
6648 define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
6649 ; X86-LABEL: test_mm256_mask_sqrt_ps:
6650 ; X86: # %bb.0: # %entry
6651 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6652 ; X86-NEXT: kmovw %eax, %k1
6653 ; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
6656 ; X64-LABEL: test_mm256_mask_sqrt_ps:
6657 ; X64: # %bb.0: # %entry
6658 ; X64-NEXT: kmovw %edi, %k1
6659 ; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
6662 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6663 %1 = bitcast i8 %__U to <8 x i1>
6664 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
6668 define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
6669 ; X86-LABEL: test_mm256_maskz_sqrt_ps:
6670 ; X86: # %bb.0: # %entry
6671 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6672 ; X86-NEXT: kmovw %eax, %k1
6673 ; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
6676 ; X64-LABEL: test_mm256_maskz_sqrt_ps:
6677 ; X64: # %bb.0: # %entry
6678 ; X64-NEXT: kmovw %edi, %k1
6679 ; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
6682 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
6683 %1 = bitcast i8 %__U to <8 x i1>
6684 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
6688 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
6690 define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) {
6691 ; CHECK-LABEL: test_mm_rol_epi32:
6692 ; CHECK: # %bb.0: # %entry
6693 ; CHECK-NEXT: vprold $5, %xmm0, %xmm0
6694 ; CHECK-NEXT: ret{{[l|q]}}
6696 %0 = bitcast <2 x i64> %__A to <4 x i32>
6697 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6698 %2 = bitcast <4 x i32> %1 to <2 x i64>
6702 define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6703 ; X86-LABEL: test_mm_mask_rol_epi32:
6704 ; X86: # %bb.0: # %entry
6705 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6706 ; X86-NEXT: kmovw %eax, %k1
6707 ; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
6710 ; X64-LABEL: test_mm_mask_rol_epi32:
6711 ; X64: # %bb.0: # %entry
6712 ; X64-NEXT: kmovw %edi, %k1
6713 ; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1}
6716 %0 = bitcast <2 x i64> %__A to <4 x i32>
6717 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6718 %2 = bitcast <2 x i64> %__W to <4 x i32>
6719 %3 = bitcast i8 %__U to <8 x i1>
6720 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6721 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
6722 %5 = bitcast <4 x i32> %4 to <2 x i64>
6726 define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) {
6727 ; X86-LABEL: test_mm_maskz_rol_epi32:
6728 ; X86: # %bb.0: # %entry
6729 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6730 ; X86-NEXT: kmovw %eax, %k1
6731 ; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
6734 ; X64-LABEL: test_mm_maskz_rol_epi32:
6735 ; X64: # %bb.0: # %entry
6736 ; X64-NEXT: kmovw %edi, %k1
6737 ; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z}
6740 %0 = bitcast <2 x i64> %__A to <4 x i32>
6741 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
6742 %2 = bitcast i8 %__U to <8 x i1>
6743 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6744 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
6745 %4 = bitcast <4 x i32> %3 to <2 x i64>
6749 define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) {
6750 ; CHECK-LABEL: test_mm256_rol_epi32:
6751 ; CHECK: # %bb.0: # %entry
6752 ; CHECK-NEXT: vprold $5, %ymm0, %ymm0
6753 ; CHECK-NEXT: ret{{[l|q]}}
6755 %0 = bitcast <4 x i64> %__A to <8 x i32>
6756 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6757 %2 = bitcast <8 x i32> %1 to <4 x i64>
6761 define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6762 ; X86-LABEL: test_mm256_mask_rol_epi32:
6763 ; X86: # %bb.0: # %entry
6764 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6765 ; X86-NEXT: kmovw %eax, %k1
6766 ; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
6769 ; X64-LABEL: test_mm256_mask_rol_epi32:
6770 ; X64: # %bb.0: # %entry
6771 ; X64-NEXT: kmovw %edi, %k1
6772 ; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1}
6775 %0 = bitcast <4 x i64> %__A to <8 x i32>
6776 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6777 %2 = bitcast <4 x i64> %__W to <8 x i32>
6778 %3 = bitcast i8 %__U to <8 x i1>
6779 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
6780 %5 = bitcast <8 x i32> %4 to <4 x i64>
6784 define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) {
6785 ; X86-LABEL: test_mm256_maskz_rol_epi32:
6786 ; X86: # %bb.0: # %entry
6787 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6788 ; X86-NEXT: kmovw %eax, %k1
6789 ; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
6792 ; X64-LABEL: test_mm256_maskz_rol_epi32:
6793 ; X64: # %bb.0: # %entry
6794 ; X64-NEXT: kmovw %edi, %k1
6795 ; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z}
6798 %0 = bitcast <4 x i64> %__A to <8 x i32>
6799 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
6800 %2 = bitcast i8 %__U to <8 x i1>
6801 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
6802 %4 = bitcast <8 x i32> %3 to <4 x i64>
6806 define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) {
6807 ; CHECK-LABEL: test_mm_rol_epi64:
6808 ; CHECK: # %bb.0: # %entry
6809 ; CHECK-NEXT: vprolq $5, %xmm0, %xmm0
6810 ; CHECK-NEXT: ret{{[l|q]}}
6812 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6816 define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
6817 ; X86-LABEL: test_mm_mask_rol_epi64:
6818 ; X86: # %bb.0: # %entry
6819 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6820 ; X86-NEXT: kmovw %eax, %k1
6821 ; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
6824 ; X64-LABEL: test_mm_mask_rol_epi64:
6825 ; X64: # %bb.0: # %entry
6826 ; X64-NEXT: kmovw %edi, %k1
6827 ; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1}
6830 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6831 %1 = bitcast i8 %__U to <8 x i1>
6832 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6833 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
6837 define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) {
6838 ; X86-LABEL: test_mm_maskz_rol_epi64:
6839 ; X86: # %bb.0: # %entry
6840 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6841 ; X86-NEXT: kmovw %eax, %k1
6842 ; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
6845 ; X64-LABEL: test_mm_maskz_rol_epi64:
6846 ; X64: # %bb.0: # %entry
6847 ; X64-NEXT: kmovw %edi, %k1
6848 ; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z}
6851 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
6852 %1 = bitcast i8 %__U to <8 x i1>
6853 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
6854 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
6858 define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) {
6859 ; CHECK-LABEL: test_mm256_rol_epi64:
6860 ; CHECK: # %bb.0: # %entry
6861 ; CHECK-NEXT: vprolq $5, %ymm0, %ymm0
6862 ; CHECK-NEXT: ret{{[l|q]}}
6864 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6868 define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
6869 ; X86-LABEL: test_mm256_mask_rol_epi64:
6870 ; X86: # %bb.0: # %entry
6871 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6872 ; X86-NEXT: kmovw %eax, %k1
6873 ; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
6876 ; X64-LABEL: test_mm256_mask_rol_epi64:
6877 ; X64: # %bb.0: # %entry
6878 ; X64-NEXT: kmovw %edi, %k1
6879 ; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1}
6882 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6883 %1 = bitcast i8 %__U to <8 x i1>
6884 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6885 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
6889 define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) {
6890 ; X86-LABEL: test_mm256_maskz_rol_epi64:
6891 ; X86: # %bb.0: # %entry
6892 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6893 ; X86-NEXT: kmovw %eax, %k1
6894 ; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
6897 ; X64-LABEL: test_mm256_maskz_rol_epi64:
6898 ; X64: # %bb.0: # %entry
6899 ; X64-NEXT: kmovw %edi, %k1
6900 ; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z}
6903 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>)
6904 %1 = bitcast i8 %__U to <8 x i1>
6905 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6906 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
6910 define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
6911 ; CHECK-LABEL: test_mm_rolv_epi32:
6912 ; CHECK: # %bb.0: # %entry
6913 ; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
6914 ; CHECK-NEXT: ret{{[l|q]}}
6916 %0 = bitcast <2 x i64> %__A to <4 x i32>
6917 %1 = bitcast <2 x i64> %__B to <4 x i32>
6918 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6919 %3 = bitcast <4 x i32> %2 to <2 x i64>
6923 define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6924 ; X86-LABEL: test_mm_mask_rolv_epi32:
6925 ; X86: # %bb.0: # %entry
6926 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6927 ; X86-NEXT: kmovw %eax, %k1
6928 ; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6931 ; X64-LABEL: test_mm_mask_rolv_epi32:
6932 ; X64: # %bb.0: # %entry
6933 ; X64-NEXT: kmovw %edi, %k1
6934 ; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1}
6937 %0 = bitcast <2 x i64> %__A to <4 x i32>
6938 %1 = bitcast <2 x i64> %__B to <4 x i32>
6939 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6940 %3 = bitcast <2 x i64> %__W to <4 x i32>
6941 %4 = bitcast i8 %__U to <8 x i1>
6942 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6943 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
6944 %6 = bitcast <4 x i32> %5 to <2 x i64>
6948 define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
6949 ; X86-LABEL: test_mm_maskz_rolv_epi32:
6950 ; X86: # %bb.0: # %entry
6951 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6952 ; X86-NEXT: kmovw %eax, %k1
6953 ; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6956 ; X64-LABEL: test_mm_maskz_rolv_epi32:
6957 ; X64: # %bb.0: # %entry
6958 ; X64-NEXT: kmovw %edi, %k1
6959 ; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z}
6962 %0 = bitcast <2 x i64> %__A to <4 x i32>
6963 %1 = bitcast <2 x i64> %__B to <4 x i32>
6964 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
6965 %3 = bitcast i8 %__U to <8 x i1>
6966 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6967 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
6968 %5 = bitcast <4 x i32> %4 to <2 x i64>
6972 define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
6973 ; CHECK-LABEL: test_mm256_rolv_epi32:
6974 ; CHECK: # %bb.0: # %entry
6975 ; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
6976 ; CHECK-NEXT: ret{{[l|q]}}
6978 %0 = bitcast <4 x i64> %__A to <8 x i32>
6979 %1 = bitcast <4 x i64> %__B to <8 x i32>
6980 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
6981 %3 = bitcast <8 x i32> %2 to <4 x i64>
6985 define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
6986 ; X86-LABEL: test_mm256_mask_rolv_epi32:
6987 ; X86: # %bb.0: # %entry
6988 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6989 ; X86-NEXT: kmovw %eax, %k1
6990 ; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
6993 ; X64-LABEL: test_mm256_mask_rolv_epi32:
6994 ; X64: # %bb.0: # %entry
6995 ; X64-NEXT: kmovw %edi, %k1
6996 ; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1}
6999 %0 = bitcast <4 x i64> %__A to <8 x i32>
7000 %1 = bitcast <4 x i64> %__B to <8 x i32>
7001 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7002 %3 = bitcast <4 x i64> %__W to <8 x i32>
7003 %4 = bitcast i8 %__U to <8 x i1>
7004 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7005 %6 = bitcast <8 x i32> %5 to <4 x i64>
7009 define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7010 ; X86-LABEL: test_mm256_maskz_rolv_epi32:
7011 ; X86: # %bb.0: # %entry
7012 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7013 ; X86-NEXT: kmovw %eax, %k1
7014 ; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7017 ; X64-LABEL: test_mm256_maskz_rolv_epi32:
7018 ; X64: # %bb.0: # %entry
7019 ; X64-NEXT: kmovw %edi, %k1
7020 ; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7023 %0 = bitcast <4 x i64> %__A to <8 x i32>
7024 %1 = bitcast <4 x i64> %__B to <8 x i32>
7025 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7026 %3 = bitcast i8 %__U to <8 x i1>
7027 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
7028 %5 = bitcast <8 x i32> %4 to <4 x i64>
7032 define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
7033 ; CHECK-LABEL: test_mm_rolv_epi64:
7034 ; CHECK: # %bb.0: # %entry
7035 ; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
7036 ; CHECK-NEXT: ret{{[l|q]}}
7038 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7042 define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7043 ; X86-LABEL: test_mm_mask_rolv_epi64:
7044 ; X86: # %bb.0: # %entry
7045 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7046 ; X86-NEXT: kmovw %eax, %k1
7047 ; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
7050 ; X64-LABEL: test_mm_mask_rolv_epi64:
7051 ; X64: # %bb.0: # %entry
7052 ; X64-NEXT: kmovw %edi, %k1
7053 ; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1}
7056 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7057 %1 = bitcast i8 %__U to <8 x i1>
7058 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7059 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
7063 define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7064 ; X86-LABEL: test_mm_maskz_rolv_epi64:
7065 ; X86: # %bb.0: # %entry
7066 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7067 ; X86-NEXT: kmovw %eax, %k1
7068 ; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7071 ; X64-LABEL: test_mm_maskz_rolv_epi64:
7072 ; X64: # %bb.0: # %entry
7073 ; X64-NEXT: kmovw %edi, %k1
7074 ; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7077 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7078 %1 = bitcast i8 %__U to <8 x i1>
7079 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7080 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
7084 define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
7085 ; CHECK-LABEL: test_mm256_rolv_epi64:
7086 ; CHECK: # %bb.0: # %entry
7087 ; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
7088 ; CHECK-NEXT: ret{{[l|q]}}
7090 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7094 define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7095 ; X86-LABEL: test_mm256_mask_rolv_epi64:
7096 ; X86: # %bb.0: # %entry
7097 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7098 ; X86-NEXT: kmovw %eax, %k1
7099 ; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7102 ; X64-LABEL: test_mm256_mask_rolv_epi64:
7103 ; X64: # %bb.0: # %entry
7104 ; X64-NEXT: kmovw %edi, %k1
7105 ; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1}
7108 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7109 %1 = bitcast i8 %__U to <8 x i1>
7110 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7111 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
7115 define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7116 ; X86-LABEL: test_mm256_maskz_rolv_epi64:
7117 ; X86: # %bb.0: # %entry
7118 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7119 ; X86-NEXT: kmovw %eax, %k1
7120 ; X86-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7123 ; X64-LABEL: test_mm256_maskz_rolv_epi64:
7124 ; X64: # %bb.0: # %entry
7125 ; X64-NEXT: kmovw %edi, %k1
7126 ; X64-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7129 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7130 %1 = bitcast i8 %__U to <8 x i1>
7131 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7132 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
7136 define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) {
7137 ; CHECK-LABEL: test_mm_ror_epi32:
7138 ; CHECK: # %bb.0: # %entry
7139 ; CHECK-NEXT: vprord $5, %xmm0, %xmm0
7140 ; CHECK-NEXT: ret{{[l|q]}}
7142 %0 = bitcast <2 x i64> %__A to <4 x i32>
7143 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7144 %2 = bitcast <4 x i32> %1 to <2 x i64>
7148 define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7149 ; X86-LABEL: test_mm_mask_ror_epi32:
7150 ; X86: # %bb.0: # %entry
7151 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7152 ; X86-NEXT: kmovw %eax, %k1
7153 ; X86-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
7156 ; X64-LABEL: test_mm_mask_ror_epi32:
7157 ; X64: # %bb.0: # %entry
7158 ; X64-NEXT: kmovw %edi, %k1
7159 ; X64-NEXT: vprord $5, %xmm1, %xmm0 {%k1}
7162 %0 = bitcast <2 x i64> %__A to <4 x i32>
7163 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7164 %2 = bitcast <2 x i64> %__W to <4 x i32>
7165 %3 = bitcast i8 %__U to <8 x i1>
7166 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7167 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2
7168 %5 = bitcast <4 x i32> %4 to <2 x i64>
7172 define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) {
7173 ; X86-LABEL: test_mm_maskz_ror_epi32:
7174 ; X86: # %bb.0: # %entry
7175 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7176 ; X86-NEXT: kmovw %eax, %k1
7177 ; X86-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
7180 ; X64-LABEL: test_mm_maskz_ror_epi32:
7181 ; X64: # %bb.0: # %entry
7182 ; X64-NEXT: kmovw %edi, %k1
7183 ; X64-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z}
7186 %0 = bitcast <2 x i64> %__A to <4 x i32>
7187 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>)
7188 %2 = bitcast i8 %__U to <8 x i1>
7189 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7190 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
7191 %4 = bitcast <4 x i32> %3 to <2 x i64>
7195 define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) {
7196 ; CHECK-LABEL: test_mm256_ror_epi32:
7197 ; CHECK: # %bb.0: # %entry
7198 ; CHECK-NEXT: vprord $5, %ymm0, %ymm0
7199 ; CHECK-NEXT: ret{{[l|q]}}
7201 %0 = bitcast <4 x i64> %__A to <8 x i32>
7202 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7203 %2 = bitcast <8 x i32> %1 to <4 x i64>
7207 define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7208 ; X86-LABEL: test_mm256_mask_ror_epi32:
7209 ; X86: # %bb.0: # %entry
7210 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7211 ; X86-NEXT: kmovw %eax, %k1
7212 ; X86-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
7215 ; X64-LABEL: test_mm256_mask_ror_epi32:
7216 ; X64: # %bb.0: # %entry
7217 ; X64-NEXT: kmovw %edi, %k1
7218 ; X64-NEXT: vprord $5, %ymm1, %ymm0 {%k1}
7221 %0 = bitcast <4 x i64> %__A to <8 x i32>
7222 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7223 %2 = bitcast <4 x i64> %__W to <8 x i32>
7224 %3 = bitcast i8 %__U to <8 x i1>
7225 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2
7226 %5 = bitcast <8 x i32> %4 to <4 x i64>
7230 define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) {
7231 ; X86-LABEL: test_mm256_maskz_ror_epi32:
7232 ; X86: # %bb.0: # %entry
7233 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7234 ; X86-NEXT: kmovw %eax, %k1
7235 ; X86-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
7238 ; X64-LABEL: test_mm256_maskz_ror_epi32:
7239 ; X64: # %bb.0: # %entry
7240 ; X64-NEXT: kmovw %edi, %k1
7241 ; X64-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z}
7244 %0 = bitcast <4 x i64> %__A to <8 x i32>
7245 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
7246 %2 = bitcast i8 %__U to <8 x i1>
7247 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
7248 %4 = bitcast <8 x i32> %3 to <4 x i64>
7252 define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) {
7253 ; CHECK-LABEL: test_mm_ror_epi64:
7254 ; CHECK: # %bb.0: # %entry
7255 ; CHECK-NEXT: vprorq $5, %xmm0, %xmm0
7256 ; CHECK-NEXT: ret{{[l|q]}}
7258 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7262 define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) {
7263 ; X86-LABEL: test_mm_mask_ror_epi64:
7264 ; X86: # %bb.0: # %entry
7265 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7266 ; X86-NEXT: kmovw %eax, %k1
7267 ; X86-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
7270 ; X64-LABEL: test_mm_mask_ror_epi64:
7271 ; X64: # %bb.0: # %entry
7272 ; X64-NEXT: kmovw %edi, %k1
7273 ; X64-NEXT: vprorq $5, %xmm1, %xmm0 {%k1}
7276 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7277 %1 = bitcast i8 %__U to <8 x i1>
7278 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7279 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W
7283 define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) {
7284 ; X86-LABEL: test_mm_maskz_ror_epi64:
7285 ; X86: # %bb.0: # %entry
7286 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7287 ; X86-NEXT: kmovw %eax, %k1
7288 ; X86-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
7291 ; X64-LABEL: test_mm_maskz_ror_epi64:
7292 ; X64: # %bb.0: # %entry
7293 ; X64-NEXT: kmovw %edi, %k1
7294 ; X64-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z}
7297 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>)
7298 %1 = bitcast i8 %__U to <8 x i1>
7299 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7300 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
7304 define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) {
7305 ; CHECK-LABEL: test_mm256_ror_epi64:
7306 ; CHECK: # %bb.0: # %entry
7307 ; CHECK-NEXT: vprorq $5, %ymm0, %ymm0
7308 ; CHECK-NEXT: ret{{[l|q]}}
7310 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7314 define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) {
7315 ; X86-LABEL: test_mm256_mask_ror_epi64:
7316 ; X86: # %bb.0: # %entry
7317 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7318 ; X86-NEXT: kmovw %eax, %k1
7319 ; X86-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
7322 ; X64-LABEL: test_mm256_mask_ror_epi64:
7323 ; X64: # %bb.0: # %entry
7324 ; X64-NEXT: kmovw %edi, %k1
7325 ; X64-NEXT: vprorq $5, %ymm1, %ymm0 {%k1}
7328 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7329 %1 = bitcast i8 %__U to <8 x i1>
7330 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7331 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W
7335 define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) {
7336 ; X86-LABEL: test_mm256_maskz_ror_epi64:
7337 ; X86: # %bb.0: # %entry
7338 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7339 ; X86-NEXT: kmovw %eax, %k1
7340 ; X86-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
7343 ; X64-LABEL: test_mm256_maskz_ror_epi64:
7344 ; X64: # %bb.0: # %entry
7345 ; X64-NEXT: kmovw %edi, %k1
7346 ; X64-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z}
7349 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>)
7350 %1 = bitcast i8 %__U to <8 x i1>
7351 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7352 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
7356 define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) {
7357 ; CHECK-LABEL: test_mm_rorv_epi32:
7358 ; CHECK: # %bb.0: # %entry
7359 ; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
7360 ; CHECK-NEXT: ret{{[l|q]}}
7362 %0 = bitcast <2 x i64> %__A to <4 x i32>
7363 %1 = bitcast <2 x i64> %__B to <4 x i32>
7364 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7365 %3 = bitcast <4 x i32> %2 to <2 x i64>
7369 define <2 x i64> @test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7370 ; X86-LABEL: test_mm_mask_rorv_epi32:
7371 ; X86: # %bb.0: # %entry
7372 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7373 ; X86-NEXT: kmovw %eax, %k1
7374 ; X86-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7377 ; X64-LABEL: test_mm_mask_rorv_epi32:
7378 ; X64: # %bb.0: # %entry
7379 ; X64-NEXT: kmovw %edi, %k1
7380 ; X64-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1}
7383 %0 = bitcast <2 x i64> %__A to <4 x i32>
7384 %1 = bitcast <2 x i64> %__B to <4 x i32>
7385 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7386 %3 = bitcast <2 x i64> %__W to <4 x i32>
7387 %4 = bitcast i8 %__U to <8 x i1>
7388 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7389 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3
7390 %6 = bitcast <4 x i32> %5 to <2 x i64>
7394 define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7395 ; X86-LABEL: test_mm_maskz_rorv_epi32:
7396 ; X86: # %bb.0: # %entry
7397 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7398 ; X86-NEXT: kmovw %eax, %k1
7399 ; X86-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7402 ; X64-LABEL: test_mm_maskz_rorv_epi32:
7403 ; X64: # %bb.0: # %entry
7404 ; X64-NEXT: kmovw %edi, %k1
7405 ; X64-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z}
7408 %0 = bitcast <2 x i64> %__A to <4 x i32>
7409 %1 = bitcast <2 x i64> %__B to <4 x i32>
7410 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1)
7411 %3 = bitcast i8 %__U to <8 x i1>
7412 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7413 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer
7414 %5 = bitcast <4 x i32> %4 to <2 x i64>
7418 define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) {
7419 ; CHECK-LABEL: test_mm256_rorv_epi32:
7420 ; CHECK: # %bb.0: # %entry
7421 ; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
7422 ; CHECK-NEXT: ret{{[l|q]}}
7424 %0 = bitcast <4 x i64> %__A to <8 x i32>
7425 %1 = bitcast <4 x i64> %__B to <8 x i32>
7426 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7427 %3 = bitcast <8 x i32> %2 to <4 x i64>
7431 define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7432 ; X86-LABEL: test_mm256_mask_rorv_epi32:
7433 ; X86: # %bb.0: # %entry
7434 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7435 ; X86-NEXT: kmovw %eax, %k1
7436 ; X86-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7439 ; X64-LABEL: test_mm256_mask_rorv_epi32:
7440 ; X64: # %bb.0: # %entry
7441 ; X64-NEXT: kmovw %edi, %k1
7442 ; X64-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1}
7445 %0 = bitcast <4 x i64> %__A to <8 x i32>
7446 %1 = bitcast <4 x i64> %__B to <8 x i32>
7447 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7448 %3 = bitcast <4 x i64> %__W to <8 x i32>
7449 %4 = bitcast i8 %__U to <8 x i1>
7450 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
7451 %6 = bitcast <8 x i32> %5 to <4 x i64>
; Zero-masked variant: select against zeroinitializer must fold into the
; {%k1} {z} zeroing form of vprorvd instead of a separate blend.
7455 define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7456 ; X86-LABEL: test_mm256_maskz_rorv_epi32:
7457 ; X86: # %bb.0: # %entry
7458 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7459 ; X86-NEXT: kmovw %eax, %k1
7460 ; X86-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7463 ; X64-LABEL: test_mm256_maskz_rorv_epi32:
7464 ; X64: # %bb.0: # %entry
7465 ; X64-NEXT: kmovw %edi, %k1
7466 ; X64-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z}
7469 %0 = bitcast <4 x i64> %__A to <8 x i32>
7470 %1 = bitcast <4 x i64> %__B to <8 x i32>
; ror(x, n) == fshr(x, x, n); masked-off lanes are zeroed by the select.
7471 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1)
7472 %3 = bitcast i8 %__U to <8 x i1>
7473 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
7474 %5 = bitcast <8 x i32> %4 to <2 x i64>
; Unmasked 128-bit variable rotate-right of 64-bit lanes: fshr(x, x, n)
; on <2 x i64> must select vprorvq (no bitcasts needed at this width).
7478 define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) {
7479 ; CHECK-LABEL: test_mm_rorv_epi64:
7480 ; CHECK: # %bb.0: # %entry
7481 ; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
7482 ; CHECK-NEXT: ret{{[l|q]}}
7484 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
; Merge-masked 128-bit q-word rotate: only 2 lanes, so the low 2 bits of the
; i8 mask are extracted via shufflevector before the select. Expect vprorvq
; with {%k1} merging into %__W.
7488 define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7489 ; X86-LABEL: test_mm_mask_rorv_epi64:
7490 ; X86: # %bb.0: # %entry
7491 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7492 ; X86-NEXT: kmovw %eax, %k1
7493 ; X86-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
7496 ; X64-LABEL: test_mm_mask_rorv_epi64:
7497 ; X64: # %bb.0: # %entry
7498 ; X64-NEXT: kmovw %edi, %k1
7499 ; X64-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1}
7502 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7503 %1 = bitcast i8 %__U to <8 x i1>
7504 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7505 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
; Zero-masked 128-bit q-word rotate: low 2 mask bits extracted, select
; against zeroinitializer must fold into vprorvq {%k1} {z}.
7509 define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
7510 ; X86-LABEL: test_mm_maskz_rorv_epi64:
7511 ; X86: # %bb.0: # %entry
7512 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7513 ; X86-NEXT: kmovw %eax, %k1
7514 ; X86-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7517 ; X64-LABEL: test_mm_maskz_rorv_epi64:
7518 ; X64: # %bb.0: # %entry
7519 ; X64-NEXT: kmovw %edi, %k1
7520 ; X64-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z}
7523 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B)
7524 %1 = bitcast i8 %__U to <8 x i1>
7525 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
7526 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
; Unmasked 256-bit variable rotate-right of 64-bit lanes: fshr(x, x, n)
; on <4 x i64> must select vprorvq on ymm registers.
7530 define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) {
7531 ; CHECK-LABEL: test_mm256_rorv_epi64:
7532 ; CHECK: # %bb.0: # %entry
7533 ; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
7534 ; CHECK-NEXT: ret{{[l|q]}}
7536 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
; Merge-masked 256-bit q-word rotate: 4 lanes, so the low 4 bits of the i8
; mask are extracted before the select. Expect vprorvq {%k1} merging into
; the passthrough %__W.
7540 define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7541 ; X86-LABEL: test_mm256_mask_rorv_epi64:
7542 ; X86: # %bb.0: # %entry
7543 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7544 ; X86-NEXT: kmovw %eax, %k1
7545 ; X86-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
7548 ; X64-LABEL: test_mm256_mask_rorv_epi64:
7549 ; X64: # %bb.0: # %entry
7550 ; X64-NEXT: kmovw %edi, %k1
7551 ; X64-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1}
7554 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7555 %1 = bitcast i8 %__U to <8 x i1>
7556 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7557 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
; Zero-masked 256-bit q-word rotate: low 4 mask bits extracted, select
; against zeroinitializer must fold into vprorvq {%k1} {z}.
7561 define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
7562 ; X86-LABEL: test_mm256_maskz_rorv_epi64:
7563 ; X86: # %bb.0: # %entry
7564 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
7565 ; X86-NEXT: kmovw %eax, %k1
7566 ; X86-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7569 ; X64-LABEL: test_mm256_maskz_rorv_epi64:
7570 ; X64: # %bb.0: # %entry
7571 ; X64-NEXT: kmovw %edi, %k1
7572 ; X64-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z}
7575 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B)
7576 %1 = bitcast i8 %__U to <8 x i1>
7577 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7578 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
7582 declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
7583 declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
7584 declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
7585 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
7586 declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
7587 declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>)
7588 declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
7589 declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
7590 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
7591 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>)
7592 declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
7593 declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
7594 declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
7595 declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>)
7596 declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
7597 declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
7598 declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
7599 declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>)
7600 declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
7601 declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
7602 declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
7603 declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
7604 declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
7605 declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
7606 declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
7607 declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
7608 declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
7609 declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
7610 declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
7611 declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
7612 declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
7613 declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
7614 declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
7615 declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
7616 declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
7617 declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
7618 declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
7619 declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
7620 declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
7621 declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
7622 declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
7623 declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
7624 declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
7625 declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
7626 declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
7627 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
7628 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
7629 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
7630 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
7631 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
7632 declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
7633 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
7634 declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)