; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
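
; These tests pin down the x86 lowering of llvm.experimental.vector.reduce.and:
; the vector is repeatedly halved (subvector extracts and shuffles feeding
; pand/vpand) until one element remains, which is then moved to a scalar
; register. Wider-than-legal inputs are first combined pairwise in full
; registers, so an N-element reduction needs O(log N) AND steps.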

;
; vXi64
;

define i64 @test_v2i64(<2 x i64> %a0) {
; SSE-LABEL: test_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
  ret i64 %1
}

define i64 @test_v4i64(<4 x i64> %a0) {
; SSE-LABEL: test_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movq %xmm1, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
  ret i64 %1
}

define i64 @test_v8i64(<8 x i64> %a0) {
; SSE-LABEL: test_v8i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm3, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
  ret i64 %1
}

define i64 @test_v16i64(<16 x i64> %a0) {
; SSE-LABEL: test_v16i64:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm6, %xmm2
; SSE-NEXT:    pand %xmm7, %xmm3
; SSE-NEXT:    pand %xmm5, %xmm3
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovq %xmm0, %rax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
  ret i64 %1
}

;
; vXi32
;
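; The i32 reductions below add one more halving step than the i64 ones: after
; the pshufd/pand that folds the upper 64 bits, a second pshufd ([1,1,2,3])
; folds the remaining pair of lanes before the movd to %eax.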

define i32 @test_v2i32(<2 x i32> %a0) {
; SSE-LABEL: test_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
  ret i32 %1
}

define i32 @test_v4i32(<4 x i32> %a0) {
; SSE-LABEL: test_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
  ret i32 %1
}

define i32 @test_v8i32(<8 x i32> %a0) {
; SSE-LABEL: test_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
  ret i32 %1
}

define i32 @test_v16i32(<16 x i32> %a0) {
; SSE-LABEL: test_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm3, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
  ret i32 %1
}

define i32 @test_v32i32(<32 x i32> %a0) {
; SSE-LABEL: test_v32i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm6, %xmm2
; SSE-NEXT:    pand %xmm7, %xmm3
; SSE-NEXT:    pand %xmm5, %xmm3
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
  ret i32 %1
}

;
; vXi16
;
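; Below 32-bit element granularity pshufd can no longer split the data, so the
; i16 reductions finish with a psrld $16 to bring the odd word down before the
; movd; the result then lives in the $ax sub-register of $eax, which is what
; the '# kill' annotations record.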

define i16 @test_v2i16(<2 x i16> %a0) {
; SSE-LABEL: test_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
  ret i16 %1
}

define i16 @test_v4i16(<4 x i16> %a0) {
; SSE-LABEL: test_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
  ret i16 %1
}

define i16 @test_v8i16(<8 x i16> %a0) {
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
  ret i16 %1
}

define i16 @test_v16i16(<16 x i16> %a0) {
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
  ret i16 %1
}

define i16 @test_v32i16(<32 x i16> %a0) {
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm3, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
  ret i16 %1
}

define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm6, %xmm2
; SSE-NEXT:    pand %xmm7, %xmm3
; SSE-NEXT:    pand %xmm5, %xmm3
; SSE-NEXT:    pand %xmm1, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    pand %xmm3, %xmm2
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pand %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
  ret i16 %1
}

;
; vXi8
;
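; The i8 reductions append a final psrlw $8 byte step after the psrld $16 word
; step. These tests also split the SSE prefix: SSE4.1 reads the low byte with
; pextrb, while SSE2 has no byte extract and falls back to movd plus taking
; $al from $eax.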

define i8 @test_v2i8(<2 x i8> %a0) {
; SSE2-LABEL: test_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v2i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
  ret i8 %1
}

define i8 @test_v4i8(<4 x i8> %a0) {
; SSE2-LABEL: test_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
  ret i8 %1
}

define i8 @test_v8i8(<8 x i8> %a0) {
; SSE2-LABEL: test_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
  ret i8 %1
}

define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpextrb $0, %xmm0, %eax
; AVX-NEXT:    # kill: def $al killed $al killed $eax
; AVX-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
  ret i8 %1
}

define i8 @test_v32i8(<32 x i8> %a0) {
; SSE2-LABEL: test_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
  ret i8 %1
}

define i8 @test_v64i8(<64 x i8> %a0) {
; SSE2-LABEL: test_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v64i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
  ret i8 %1
}

define i8 @test_v128i8(<128 x i8> %a0) {
; SSE2-LABEL: test_v128i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm6, %xmm2
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v128i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand %xmm6, %xmm2
; SSE41-NEXT:    pand %xmm7, %xmm3
; SSE41-NEXT:    pand %xmm5, %xmm3
; SSE41-NEXT:    pand %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pextrb $0, %xmm1, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v128i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v128i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v128i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512-NEXT:    # kill: def $al killed $al killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
  ret i8 %1
}

declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)