; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c

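; For reference, each function below mirrors the (already-optimized) IR that
; clang emits for the corresponding AVX512VL intrinsic. As an illustrative
; sketch only (the names here are not taken from the clang test), the first
; function corresponds to C along these lines, assuming <immintrin.h> and an
; -mavx512vl target:
;
;   __m256 shuffle_example(__m256 A, __m256 B) {
;     // imm 0x3 selects the upper 128-bit lane of both A and B, which clang
;     // expands to the shufflevector mask <4,5,6,7,12,13,14,15> seen below.
;     return _mm256_shuffle_f32x4(A, B, 3);
;   }
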
define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; X32-LABEL: test_mm256_shuffle_f32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_f32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X32-LABEL: test_mm256_mask_shuffle_f32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X32-LABEL: test_mm256_maskz_shuffle_f32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; X32-LABEL: test_mm256_shuffle_f64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_f64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X32-LABEL: test_mm256_mask_shuffle_f64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X32-LABEL: test_mm256_maskz_shuffle_f64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_shuffle_i32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_i32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_shuffle_i32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_maskz_shuffle_i32x4:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_shuffle_i64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_i64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_shuffle_i64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_maskz_shuffle_i64x2:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

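; The test/testn functions below cover _mm*_test_epi*_mask and
; _mm*_testn_epi*_mask. As a hypothetical C-level illustration (not taken
; from the clang test) of the first masked variant:
;
;   __mmask8 test_example(__mmask8 U, __m128i A, __m128i B) {
;     // Computes (A & B) != 0 per 32-bit element, pre-masked by U; clang
;     // models the i8 mask as <8 x i1> and extracts the low four lanes.
;     return _mm_mask_test_epi32_mask(U, A, B);
;   }
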
define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_test_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestmd %xmm0, %xmm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_test_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_mask_test_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp ne <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_test_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestmd %ymm0, %ymm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_test_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_test_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_test_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestmq %xmm0, %xmm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_test_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_mask_test_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_test_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestmq %ymm0, %ymm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_test_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_test_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_testn_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestnmd %xmm0, %xmm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testn_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_mask_testn_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_testn_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_testn_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestnmd %ymm0, %ymm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testn_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_testn_epi32_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_testn_epi32_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_testn_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestnmq %xmm0, %xmm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testn_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X32-LABEL: test_mm_mask_testn_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_testn_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_testn_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    vptestnmq %ymm0, %ymm1, %k0
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testn_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X32-LABEL: test_mm256_mask_testn_epi64_mask:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X32-NEXT:    kmovw %k0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_testn_epi64_mask:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

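; The set1 tests below broadcast a constant (5) or a scalar argument under a
; write mask. A hypothetical C-level illustration of the first one:
;
;   __m128i set1_example(__m128i O, __mmask8 M) {
;     // Lanes whose bit is set in the low 4 bits of M receive 5; the
;     // remaining lanes keep the corresponding element of O.
;     return _mm_mask_set1_epi32(O, M, 5);
;   }
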
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X32-LABEL: test_mm_mask_set1_epi32:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X32-LABEL: test_mm_maskz_set1_epi32:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
; X32-LABEL: test_mm256_mask_set1_epi32:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
; X32-LABEL: test_mm256_maskz_set1_epi32:
; X32:       # BB#0: # %entry
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm_mask_set1_epi64:
; X32:       # BB#0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X32-NEXT:    vmovd %eax, %xmm1
; X32-NEXT:    vpbroadcastb %xmm1, %xmm1
; X32-NEXT:    kmovw %ecx, %k1
; X32-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64:       # BB#0: # %entry
; X64-NEXT:    vmovd %esi, %xmm1
; X64-NEXT:    vpbroadcastb %xmm1, %xmm1
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i = trunc i64 %__A to i8
  %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0
  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__O
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm_maskz_set1_epi64:
; X32:       # BB#0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X32-NEXT:    vmovd %eax, %xmm0
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    kmovw %ecx, %k1
; X32-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64:       # BB#0: # %entry
; X64-NEXT:    vmovd %esi, %xmm0
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i = trunc i64 %__A to i8
  %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0
  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm256_mask_set1_epi64:
; X32:       # BB#0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
; X32-NEXT:    vmovd %ecx, %xmm1
; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
; X32-NEXT:    kmovw %edx, %k1
; X32-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm256_maskz_set1_epi64:
; X32:       # BB#0: # %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    kmovw %edx, %k1
; X32-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64:       # BB#0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

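; Unlike the clang-derived functions above, the broadcast/movddup/movshdup/
; movsldup tests below build their masks by truncating the i8 argument to i4
; or i2 and bitcasting to <4 x i1> or <2 x i1>; the sub-byte bitcast is what
; forces the 128-bit variants to round-trip the mask byte through the stack
; (andb/movb/movzbl) instead of using a plain kmovw. A hypothetical C-level
; illustration of the first masked variant:
;
;   __m128i broadcastd_example(__m128i W, __mmask8 U, __m128i A) {
;     // Broadcasts A[0] to every lane; masked-off lanes keep W's elements.
;     return _mm_mask_broadcastd_epi32(W, U, A);
;   }
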
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
  ret <2 x i64> %res1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm256_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm256_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm256_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

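; The *_permutex_* tests below cover immediate-controlled 64-bit element
; permutes. The imm8 encodes one 2-bit source index per destination element,
; so the mask <3,0,0,0> corresponds to imm8 == 3 (3 | 0<<2 | 0<<4 | 0<<6) and
; <1,0,0,0> to imm8 == 1; roughly _mm256_permutex_epi64(a, 3) at the C level
; (illustrative, not copied from the clang test). Note the unmasked integer
; form is free to lower to vpermpd, while the masked forms keep vpermq.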
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

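; test_mm256_permutex_pd and its masked variants below repeat the <3,0,0,0>
; and <1,0,0,0> patterns on <4 x double>, where every form lowers to vpermpd.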
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

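; For the *_shuffle_pd tests below, the imm8 holds one selector bit per
; destination element. The 128-bit mask <1,3> takes the high double of each
; source (imm8 == 3: bit 0 picks a[1], bit 1 picks b[1]), which is exactly
; the vunpckhpd pattern; roughly _mm_shuffle_pd(a, b, 3) at the C level
; (illustrative, not copied from the clang test). The 256-bit mask <1,5,2,6>
; decodes from the same imm8 == 3, whose clear bits 2-3 pick the low double
; of lane 1 from each source, hence vshufpd rather than vunpckhpd there.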
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

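; In the *_shuffle_ps tests below, the imm8 encodes four 2-bit indices. The
; mask <0,1,4,4> decodes from imm8 == 4: dst[0] = a[4 & 3] = a[0],
; dst[1] = a[(4 >> 2) & 3] = a[1], dst[2] = b[(4 >> 4) & 3] = b[0],
; dst[3] = b[(4 >> 6) & 3] = b[0]; roughly _mm_shuffle_ps(a, b, 4) at the C
; level (illustrative, not copied from the clang test).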
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

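; The 256-bit shuffle_ps tests below replay the same imm8 == 4 selection in
; both 128-bit lanes (mask <0,1,8,8,4,5,12,12>). Their masked forms consume
; the full 8-bit mask, so no trunc/andb is needed: the i8 bitcasts to
; <8 x i1> and moves into %k1 directly.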
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}