1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
7 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
8 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
9 ; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; test_v2i64 - multiply-reduces a <2 x i64> argument to one i64 by calling
; @llvm.experimental.vector.reduce.mul.i64.v2i64.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the checks expect, per run configuration:
;  - SSE, AVX and AVX512BW prefixes: the upper element is moved down with
;    pshufd and the 64-bit product is assembled from three pmuludq partial
;    products combined with psrlq/psrldq, paddq and psllq.
;  - AVX512BWVL prefix: same shape, but the second shift is a vpsrlq of xmm1.
;  - AVX512DQ prefix: a single vpmullq on zmm registers followed by vzeroupper.
;  - AVX512DQVL prefix: a single vpmullq on xmm registers.
; NOTE(review): the ret/closing lines of this function are not visible in this
; view - presumably %1 is returned; confirm against the full file.
15 define i64 @test_v2i64(<2 x i64> %a0) {
16 ; SSE-LABEL: test_v2i64:
18 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
19 ; SSE-NEXT: movdqa %xmm0, %xmm2
20 ; SSE-NEXT: psrlq $32, %xmm2
21 ; SSE-NEXT: pmuludq %xmm1, %xmm2
22 ; SSE-NEXT: movdqa %xmm0, %xmm3
23 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
24 ; SSE-NEXT: pmuludq %xmm0, %xmm3
25 ; SSE-NEXT: paddq %xmm2, %xmm3
26 ; SSE-NEXT: psllq $32, %xmm3
27 ; SSE-NEXT: pmuludq %xmm1, %xmm0
28 ; SSE-NEXT: paddq %xmm3, %xmm0
29 ; SSE-NEXT: movq %xmm0, %rax
32 ; AVX-LABEL: test_v2i64:
34 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
35 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
36 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
37 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
38 ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
39 ; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
40 ; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
41 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
42 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
43 ; AVX-NEXT: vmovq %xmm0, %rax
46 ; AVX512BW-LABEL: test_v2i64:
48 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
49 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
50 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
51 ; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
52 ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
53 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
54 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
55 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
56 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
57 ; AVX512BW-NEXT: vmovq %xmm0, %rax
60 ; AVX512BWVL-LABEL: test_v2i64:
61 ; AVX512BWVL: # %bb.0:
62 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
63 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
64 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
65 ; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
66 ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
67 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
68 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
69 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
70 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
71 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
72 ; AVX512BWVL-NEXT: retq
74 ; AVX512DQ-LABEL: test_v2i64:
76 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
77 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
78 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
79 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
80 ; AVX512DQ-NEXT: vzeroupper
83 ; AVX512DQVL-LABEL: test_v2i64:
84 ; AVX512DQVL: # %bb.0:
85 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
86 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
87 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
88 ; AVX512DQVL-NEXT: retq
89 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
; test_v4i64 - multiply-reduces a <4 x i64> argument to one i64 by calling
; @llvm.experimental.vector.reduce.mul.i64.v4i64.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the checks expect, per run configuration:
;  - SSE prefix: xmm0 and xmm1 are multiplied with a pmuludq/shift/add
;    sequence, then the result is multiplied by its own pshufd-swapped copy.
;  - AVX1 prefix: vextractf128 splits off the upper half; all arithmetic is
;    done on xmm registers.
;  - AVX2, AVX512BW and AVX512BWVL prefixes: vextracti128 splits off the upper
;    half; the pmuludq sequences operate on ymm registers.
;  - AVX512DQ prefix: two vpmullq instructions on zmm registers.
;  - AVX512DQVL prefix: two vpmullq instructions on xmm registers.
; NOTE(review): the ret/closing lines of this function are not visible in this
; view - presumably %1 is returned; confirm against the full file.
93 define i64 @test_v4i64(<4 x i64> %a0) {
94 ; SSE-LABEL: test_v4i64:
96 ; SSE-NEXT: movdqa %xmm0, %xmm2
97 ; SSE-NEXT: psrlq $32, %xmm2
98 ; SSE-NEXT: pmuludq %xmm1, %xmm2
99 ; SSE-NEXT: movdqa %xmm1, %xmm3
100 ; SSE-NEXT: psrlq $32, %xmm3
101 ; SSE-NEXT: pmuludq %xmm0, %xmm3
102 ; SSE-NEXT: paddq %xmm2, %xmm3
103 ; SSE-NEXT: psllq $32, %xmm3
104 ; SSE-NEXT: pmuludq %xmm1, %xmm0
105 ; SSE-NEXT: paddq %xmm3, %xmm0
106 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; SSE-NEXT: movdqa %xmm0, %xmm2
108 ; SSE-NEXT: psrlq $32, %xmm2
109 ; SSE-NEXT: pmuludq %xmm1, %xmm2
110 ; SSE-NEXT: movdqa %xmm0, %xmm3
111 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
112 ; SSE-NEXT: pmuludq %xmm0, %xmm3
113 ; SSE-NEXT: paddq %xmm2, %xmm3
114 ; SSE-NEXT: psllq $32, %xmm3
115 ; SSE-NEXT: pmuludq %xmm1, %xmm0
116 ; SSE-NEXT: paddq %xmm3, %xmm0
117 ; SSE-NEXT: movq %xmm0, %rax
120 ; AVX1-LABEL: test_v4i64:
122 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
123 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
124 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
125 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
126 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
127 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
128 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
129 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
130 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
131 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
132 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
133 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
134 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
135 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
136 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
137 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
138 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
139 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
140 ; AVX1-NEXT: vmovq %xmm0, %rax
141 ; AVX1-NEXT: vzeroupper
144 ; AVX2-LABEL: test_v4i64:
146 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
147 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
148 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
149 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
150 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
151 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
152 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
153 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
154 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
155 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
156 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
157 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
158 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
159 ; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
160 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
161 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
162 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
163 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
164 ; AVX2-NEXT: vmovq %xmm0, %rax
165 ; AVX2-NEXT: vzeroupper
168 ; AVX512BW-LABEL: test_v4i64:
170 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
171 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
172 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
173 ; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
174 ; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
175 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
176 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
177 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
178 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
179 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
180 ; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
181 ; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
182 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
183 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
184 ; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
185 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
186 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
187 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
188 ; AVX512BW-NEXT: vmovq %xmm0, %rax
189 ; AVX512BW-NEXT: vzeroupper
190 ; AVX512BW-NEXT: retq
192 ; AVX512BWVL-LABEL: test_v4i64:
193 ; AVX512BWVL: # %bb.0:
194 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
195 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
196 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
197 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
198 ; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
199 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
200 ; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
201 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
202 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
203 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
204 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
205 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
206 ; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
207 ; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
208 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
209 ; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
210 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
211 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
212 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
213 ; AVX512BWVL-NEXT: vzeroupper
214 ; AVX512BWVL-NEXT: retq
216 ; AVX512DQ-LABEL: test_v4i64:
218 ; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
219 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
220 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
221 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
222 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
223 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
224 ; AVX512DQ-NEXT: vzeroupper
225 ; AVX512DQ-NEXT: retq
227 ; AVX512DQVL-LABEL: test_v4i64:
228 ; AVX512DQVL: # %bb.0:
229 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
230 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
231 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
232 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
233 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
234 ; AVX512DQVL-NEXT: vzeroupper
235 ; AVX512DQVL-NEXT: retq
236 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
; test_v8i64 - multiply-reduces an <8 x i64> argument to one i64 by calling
; @llvm.experimental.vector.reduce.mul.i64.v8i64.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the checks expect, per run configuration:
;  - SSE prefix: xmm1*xmm3 and xmm0*xmm2 are formed first, then multiplied
;    together, then the result is multiplied by its pshufd-swapped copy; every
;    multiply is a pmuludq/shift/add sequence.
;  - AVX1 prefix: vextractf128 splits ymm0 and ymm1; arithmetic stays on xmm.
;  - AVX2 prefix: ymm0*ymm1 first, then vextracti128 and two more ymm rounds.
;  - AVX512BW and AVX512BWVL prefixes: vextracti64x4, vextracti128 and vpshufd
;    halving steps, each followed by a zmm pmuludq sequence.
;  - AVX512DQ prefix: three vpmullq instructions, all on zmm registers.
;  - AVX512DQVL prefix: one zmm vpmullq followed by two xmm vpmullq.
; NOTE(review): the ret/closing lines of this function are not visible in this
; view - presumably %1 is returned; confirm against the full file.
240 define i64 @test_v8i64(<8 x i64> %a0) {
241 ; SSE-LABEL: test_v8i64:
243 ; SSE-NEXT: movdqa %xmm1, %xmm4
244 ; SSE-NEXT: psrlq $32, %xmm4
245 ; SSE-NEXT: pmuludq %xmm3, %xmm4
246 ; SSE-NEXT: movdqa %xmm3, %xmm5
247 ; SSE-NEXT: psrlq $32, %xmm5
248 ; SSE-NEXT: pmuludq %xmm1, %xmm5
249 ; SSE-NEXT: paddq %xmm4, %xmm5
250 ; SSE-NEXT: psllq $32, %xmm5
251 ; SSE-NEXT: pmuludq %xmm3, %xmm1
252 ; SSE-NEXT: paddq %xmm5, %xmm1
253 ; SSE-NEXT: movdqa %xmm0, %xmm3
254 ; SSE-NEXT: psrlq $32, %xmm3
255 ; SSE-NEXT: pmuludq %xmm2, %xmm3
256 ; SSE-NEXT: movdqa %xmm2, %xmm4
257 ; SSE-NEXT: psrlq $32, %xmm4
258 ; SSE-NEXT: pmuludq %xmm0, %xmm4
259 ; SSE-NEXT: paddq %xmm3, %xmm4
260 ; SSE-NEXT: psllq $32, %xmm4
261 ; SSE-NEXT: pmuludq %xmm2, %xmm0
262 ; SSE-NEXT: paddq %xmm4, %xmm0
263 ; SSE-NEXT: movdqa %xmm0, %xmm2
264 ; SSE-NEXT: psrlq $32, %xmm2
265 ; SSE-NEXT: pmuludq %xmm1, %xmm2
266 ; SSE-NEXT: movdqa %xmm1, %xmm3
267 ; SSE-NEXT: psrlq $32, %xmm3
268 ; SSE-NEXT: pmuludq %xmm0, %xmm3
269 ; SSE-NEXT: paddq %xmm2, %xmm3
270 ; SSE-NEXT: psllq $32, %xmm3
271 ; SSE-NEXT: pmuludq %xmm1, %xmm0
272 ; SSE-NEXT: paddq %xmm3, %xmm0
273 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
274 ; SSE-NEXT: movdqa %xmm0, %xmm2
275 ; SSE-NEXT: psrlq $32, %xmm2
276 ; SSE-NEXT: pmuludq %xmm1, %xmm2
277 ; SSE-NEXT: movdqa %xmm0, %xmm3
278 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
279 ; SSE-NEXT: pmuludq %xmm0, %xmm3
280 ; SSE-NEXT: paddq %xmm2, %xmm3
281 ; SSE-NEXT: psllq $32, %xmm3
282 ; SSE-NEXT: pmuludq %xmm1, %xmm0
283 ; SSE-NEXT: paddq %xmm3, %xmm0
284 ; SSE-NEXT: movq %xmm0, %rax
287 ; AVX1-LABEL: test_v8i64:
289 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
290 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
291 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
292 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
293 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
294 ; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
295 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
296 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
297 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
298 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
299 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
300 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
301 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
302 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
303 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
304 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
305 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
306 ; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
307 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
308 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
309 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3
310 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
311 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
312 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
313 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
314 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
315 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
316 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
317 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
318 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
319 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
320 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
321 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
322 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
323 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
324 ; AVX1-NEXT: vmovq %xmm0, %rax
325 ; AVX1-NEXT: vzeroupper
328 ; AVX2-LABEL: test_v8i64:
330 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
331 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
332 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
333 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
334 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
335 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
336 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
337 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
338 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
339 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
340 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
341 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
342 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
343 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
344 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
345 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
346 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
347 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
348 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
349 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
350 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
351 ; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
352 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
353 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
354 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
355 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
356 ; AVX2-NEXT: vmovq %xmm0, %rax
357 ; AVX2-NEXT: vzeroupper
360 ; AVX512BW-LABEL: test_v8i64:
362 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
363 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
364 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
365 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
366 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
367 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
368 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
369 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
370 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
371 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
372 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
373 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
374 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
375 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
376 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
377 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
378 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
379 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
380 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
381 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
382 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
383 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
384 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
385 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
386 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
387 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
388 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
389 ; AVX512BW-NEXT: vmovq %xmm0, %rax
390 ; AVX512BW-NEXT: vzeroupper
391 ; AVX512BW-NEXT: retq
393 ; AVX512BWVL-LABEL: test_v8i64:
394 ; AVX512BWVL: # %bb.0:
395 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
396 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
397 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
398 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
399 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
400 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
401 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
402 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
403 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
404 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
405 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
406 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
407 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
408 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
409 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
410 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
411 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
412 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
413 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
414 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
415 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
416 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
417 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
418 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
419 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
420 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
421 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
422 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
423 ; AVX512BWVL-NEXT: vzeroupper
424 ; AVX512BWVL-NEXT: retq
426 ; AVX512DQ-LABEL: test_v8i64:
428 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
429 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
430 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
431 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
432 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
433 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
434 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
435 ; AVX512DQ-NEXT: vzeroupper
436 ; AVX512DQ-NEXT: retq
438 ; AVX512DQVL-LABEL: test_v8i64:
439 ; AVX512DQVL: # %bb.0:
440 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
441 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
442 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
443 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
444 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
445 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
446 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
447 ; AVX512DQVL-NEXT: vzeroupper
448 ; AVX512DQVL-NEXT: retq
449 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
; test_v16i64 - multiply-reduces a <16 x i64> argument to one i64 by calling
; @llvm.experimental.vector.reduce.mul.i64.v16i64.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; What the checks expect, per run configuration:
;  - SSE prefix: the checked code reads xmm0 through xmm7, uses xmm8/xmm9 as
;    temporaries, and pairs registers up (2*6, 0*4, 3*7, 1*5) before folding
;    the partial products together; each multiply is a pmuludq/shift/add
;    sequence.
;  - AVX1 prefix: reads ymm0 through ymm3, splitting upper halves with
;    vextractf128 and doing all arithmetic on xmm registers.
;  - AVX2 prefix: ymm1*ymm3 and ymm0*ymm2 first, then ymm halving rounds.
;  - AVX512BW and AVX512BWVL prefixes: zmm0*zmm1 first, then vextracti64x4,
;    vextracti128 and vpshufd halving steps with zmm pmuludq sequences.
;  - AVX512DQ prefix: four vpmullq instructions, all on zmm registers.
;  - AVX512DQVL prefix: two zmm vpmullq followed by two xmm vpmullq.
; NOTE(review): the ret/closing lines of this function are not visible in this
; view - presumably %1 is returned; confirm against the full file.
453 define i64 @test_v16i64(<16 x i64> %a0) {
454 ; SSE-LABEL: test_v16i64:
456 ; SSE-NEXT: movdqa %xmm2, %xmm8
457 ; SSE-NEXT: psrlq $32, %xmm8
458 ; SSE-NEXT: pmuludq %xmm6, %xmm8
459 ; SSE-NEXT: movdqa %xmm6, %xmm9
460 ; SSE-NEXT: psrlq $32, %xmm9
461 ; SSE-NEXT: pmuludq %xmm2, %xmm9
462 ; SSE-NEXT: paddq %xmm8, %xmm9
463 ; SSE-NEXT: psllq $32, %xmm9
464 ; SSE-NEXT: pmuludq %xmm6, %xmm2
465 ; SSE-NEXT: paddq %xmm9, %xmm2
466 ; SSE-NEXT: movdqa %xmm0, %xmm8
467 ; SSE-NEXT: psrlq $32, %xmm8
468 ; SSE-NEXT: pmuludq %xmm4, %xmm8
469 ; SSE-NEXT: movdqa %xmm4, %xmm6
470 ; SSE-NEXT: psrlq $32, %xmm6
471 ; SSE-NEXT: pmuludq %xmm0, %xmm6
472 ; SSE-NEXT: paddq %xmm8, %xmm6
473 ; SSE-NEXT: psllq $32, %xmm6
474 ; SSE-NEXT: pmuludq %xmm4, %xmm0
475 ; SSE-NEXT: paddq %xmm6, %xmm0
476 ; SSE-NEXT: movdqa %xmm3, %xmm4
477 ; SSE-NEXT: psrlq $32, %xmm4
478 ; SSE-NEXT: pmuludq %xmm7, %xmm4
479 ; SSE-NEXT: movdqa %xmm7, %xmm6
480 ; SSE-NEXT: psrlq $32, %xmm6
481 ; SSE-NEXT: pmuludq %xmm3, %xmm6
482 ; SSE-NEXT: paddq %xmm4, %xmm6
483 ; SSE-NEXT: psllq $32, %xmm6
484 ; SSE-NEXT: pmuludq %xmm7, %xmm3
485 ; SSE-NEXT: paddq %xmm6, %xmm3
486 ; SSE-NEXT: movdqa %xmm1, %xmm4
487 ; SSE-NEXT: psrlq $32, %xmm4
488 ; SSE-NEXT: pmuludq %xmm5, %xmm4
489 ; SSE-NEXT: movdqa %xmm5, %xmm6
490 ; SSE-NEXT: psrlq $32, %xmm6
491 ; SSE-NEXT: pmuludq %xmm1, %xmm6
492 ; SSE-NEXT: paddq %xmm4, %xmm6
493 ; SSE-NEXT: psllq $32, %xmm6
494 ; SSE-NEXT: pmuludq %xmm5, %xmm1
495 ; SSE-NEXT: paddq %xmm6, %xmm1
496 ; SSE-NEXT: movdqa %xmm1, %xmm4
497 ; SSE-NEXT: psrlq $32, %xmm4
498 ; SSE-NEXT: pmuludq %xmm3, %xmm4
499 ; SSE-NEXT: movdqa %xmm3, %xmm5
500 ; SSE-NEXT: psrlq $32, %xmm5
501 ; SSE-NEXT: pmuludq %xmm1, %xmm5
502 ; SSE-NEXT: paddq %xmm4, %xmm5
503 ; SSE-NEXT: psllq $32, %xmm5
504 ; SSE-NEXT: pmuludq %xmm3, %xmm1
505 ; SSE-NEXT: paddq %xmm5, %xmm1
506 ; SSE-NEXT: movdqa %xmm0, %xmm3
507 ; SSE-NEXT: psrlq $32, %xmm3
508 ; SSE-NEXT: pmuludq %xmm2, %xmm3
509 ; SSE-NEXT: movdqa %xmm2, %xmm4
510 ; SSE-NEXT: psrlq $32, %xmm4
511 ; SSE-NEXT: pmuludq %xmm0, %xmm4
512 ; SSE-NEXT: paddq %xmm3, %xmm4
513 ; SSE-NEXT: psllq $32, %xmm4
514 ; SSE-NEXT: pmuludq %xmm2, %xmm0
515 ; SSE-NEXT: paddq %xmm4, %xmm0
516 ; SSE-NEXT: movdqa %xmm0, %xmm2
517 ; SSE-NEXT: psrlq $32, %xmm2
518 ; SSE-NEXT: pmuludq %xmm1, %xmm2
519 ; SSE-NEXT: movdqa %xmm1, %xmm3
520 ; SSE-NEXT: psrlq $32, %xmm3
521 ; SSE-NEXT: pmuludq %xmm0, %xmm3
522 ; SSE-NEXT: paddq %xmm2, %xmm3
523 ; SSE-NEXT: psllq $32, %xmm3
524 ; SSE-NEXT: pmuludq %xmm1, %xmm0
525 ; SSE-NEXT: paddq %xmm3, %xmm0
526 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
527 ; SSE-NEXT: movdqa %xmm0, %xmm2
528 ; SSE-NEXT: psrlq $32, %xmm2
529 ; SSE-NEXT: pmuludq %xmm1, %xmm2
530 ; SSE-NEXT: movdqa %xmm0, %xmm3
531 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
532 ; SSE-NEXT: pmuludq %xmm0, %xmm3
533 ; SSE-NEXT: paddq %xmm2, %xmm3
534 ; SSE-NEXT: psllq $32, %xmm3
535 ; SSE-NEXT: pmuludq %xmm1, %xmm0
536 ; SSE-NEXT: paddq %xmm3, %xmm0
537 ; SSE-NEXT: movq %xmm0, %rax
540 ; AVX1-LABEL: test_v16i64:
542 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
543 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
544 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
545 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
546 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
547 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
548 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
549 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
550 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
551 ; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
552 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
553 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
554 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
555 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
556 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6
557 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
558 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
559 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
560 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
561 ; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
562 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7
563 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7
564 ; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
565 ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
566 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
567 ; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1
568 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
569 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
570 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
571 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
572 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
573 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
574 ; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
575 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
576 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
577 ; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
578 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
579 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
580 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
581 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
582 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
583 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
584 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
585 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
586 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1
587 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
588 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2
589 ; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
590 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
591 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
592 ; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2
593 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
594 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
595 ; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
596 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
597 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
598 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
599 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
600 ; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
601 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
602 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
603 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
604 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
605 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
606 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
607 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
608 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
609 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
610 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
611 ; AVX1-NEXT: vmovq %xmm0, %rax
612 ; AVX1-NEXT: vzeroupper
615 ; AVX2-LABEL: test_v16i64:
617 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
618 ; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
619 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
620 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
621 ; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
622 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
623 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
624 ; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
625 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
626 ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
627 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
628 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
629 ; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
630 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
631 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
632 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
633 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
634 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
635 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
636 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
637 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
638 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
639 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
640 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
641 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
642 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
643 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
644 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
645 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
646 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
647 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
648 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
649 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
650 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
651 ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
652 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
653 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
654 ; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
655 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
656 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
657 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
658 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
659 ; AVX2-NEXT: vmovq %xmm0, %rax
660 ; AVX2-NEXT: vzeroupper
663 ; AVX512BW-LABEL: test_v16i64:
665 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
666 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
667 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
668 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
669 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
670 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
671 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
672 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
673 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
674 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
675 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
676 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
677 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
678 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
679 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
680 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
681 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
682 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
683 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
684 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
685 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
686 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
687 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
688 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
689 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
690 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
691 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
692 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
693 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
694 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
695 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
696 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
697 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
698 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
699 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
700 ; AVX512BW-NEXT: vmovq %xmm0, %rax
701 ; AVX512BW-NEXT: vzeroupper
702 ; AVX512BW-NEXT: retq
704 ; AVX512BWVL-LABEL: test_v16i64:
705 ; AVX512BWVL: # %bb.0:
706 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
707 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
708 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
709 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
710 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
711 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
712 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
713 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
714 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
715 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
716 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
717 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
718 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
719 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
720 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
721 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
722 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
723 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
724 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
725 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
726 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
727 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
728 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
729 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
730 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
731 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
732 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
733 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
734 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
735 ; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
736 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
737 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
738 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
739 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
740 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
741 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
742 ; AVX512BWVL-NEXT: vzeroupper
743 ; AVX512BWVL-NEXT: retq
745 ; AVX512DQ-LABEL: test_v16i64:
747 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
748 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
749 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
750 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
751 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
752 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
753 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
754 ; AVX512DQ-NEXT: vmovq %xmm0, %rax
755 ; AVX512DQ-NEXT: vzeroupper
756 ; AVX512DQ-NEXT: retq
758 ; AVX512DQVL-LABEL: test_v16i64:
759 ; AVX512DQVL: # %bb.0:
760 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
761 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
762 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
763 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
764 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
765 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
766 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
767 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
768 ; AVX512DQVL-NEXT: vzeroupper
769 ; AVX512DQVL-NEXT: retq
770 %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
; Multiply-reduction of a <2 x i32> vector to a scalar i32.
; Every expected sequence shuffles element 1 into lane 0 (pshufd [1,1,2,3]),
; multiplies, and moves lane 0 into %eax. The SSE2 expectations use pmuludq
; for the multiply; the SSE4.1, AVX and AVX512 expectations use pmulld/vpmulld.
778 define i32 @test_v2i32(<2 x i32> %a0) {
779 ; SSE2-LABEL: test_v2i32:
781 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
782 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
783 ; SSE2-NEXT: movd %xmm1, %eax
786 ; SSE41-LABEL: test_v2i32:
788 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
789 ; SSE41-NEXT: pmulld %xmm0, %xmm1
790 ; SSE41-NEXT: movd %xmm1, %eax
793 ; AVX-LABEL: test_v2i32:
795 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
796 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
797 ; AVX-NEXT: vmovd %xmm0, %eax
800 ; AVX512-LABEL: test_v2i32:
802 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
803 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
804 ; AVX512-NEXT: vmovd %xmm0, %eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
806 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> %a0)
; Multiply-reduction of a <4 x i32> vector to a scalar i32.
; The SSE4.1, AVX and AVX512 expectations are two shuffle+multiply steps
; (pshufd [2,3,0,1], then pshufd [1,1,2,3]) followed by a move of lane 0 into
; %eax. The SSE2 expectations use pmuludq for the multiplies and therefore
; carry additional pshufd/punpckldq shuffles around them.
810 define i32 @test_v4i32(<4 x i32> %a0) {
811 ; SSE2-LABEL: test_v4i32:
813 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
814 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
815 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
816 ; SSE2-NEXT: pmuludq %xmm2, %xmm3
817 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
818 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
819 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
820 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
821 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
822 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
823 ; SSE2-NEXT: movd %xmm1, %eax
826 ; SSE41-LABEL: test_v4i32:
828 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
829 ; SSE41-NEXT: pmulld %xmm0, %xmm1
830 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
831 ; SSE41-NEXT: pmulld %xmm1, %xmm0
832 ; SSE41-NEXT: movd %xmm0, %eax
835 ; AVX-LABEL: test_v4i32:
837 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
838 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
839 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
840 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
841 ; AVX-NEXT: vmovd %xmm0, %eax
844 ; AVX512-LABEL: test_v4i32:
846 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
847 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
848 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
849 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
850 ; AVX512-NEXT: vmovd %xmm0, %eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
852 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
; Multiply-reduction of an <8 x i32> vector to a scalar i32.
; The SSE expectations receive the input in xmm0/xmm1 and multiply the two
; registers together first. The AVX1, AVX2 and AVX512 expectations extract
; 128-bit lane 1 of ymm0 (vextractf128 on AVX1, vextracti128 otherwise) and
; multiply it with xmm0. After that every variant continues with the
; [2,3,0,1] and [1,1,2,3] shuffle+multiply steps. The SSE2 expectations use
; pmuludq and extra shuffles; all ymm-based variants end with vzeroupper.
856 define i32 @test_v8i32(<8 x i32> %a0) {
857 ; SSE2-LABEL: test_v8i32:
859 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
860 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
861 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
862 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
863 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
864 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
865 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
866 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
867 ; SSE2-NEXT: pmuludq %xmm0, %xmm2
868 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
869 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0]
870 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
871 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
872 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
873 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
874 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
875 ; SSE2-NEXT: movd %xmm1, %eax
878 ; SSE41-LABEL: test_v8i32:
880 ; SSE41-NEXT: pmulld %xmm1, %xmm0
881 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
882 ; SSE41-NEXT: pmulld %xmm0, %xmm1
883 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
884 ; SSE41-NEXT: pmulld %xmm1, %xmm0
885 ; SSE41-NEXT: movd %xmm0, %eax
888 ; AVX1-LABEL: test_v8i32:
890 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
891 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
892 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
893 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
894 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
895 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
896 ; AVX1-NEXT: vmovd %xmm0, %eax
897 ; AVX1-NEXT: vzeroupper
900 ; AVX2-LABEL: test_v8i32:
902 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
903 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
904 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
905 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
906 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
907 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
908 ; AVX2-NEXT: vmovd %xmm0, %eax
909 ; AVX2-NEXT: vzeroupper
912 ; AVX512-LABEL: test_v8i32:
914 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
915 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
916 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
917 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
918 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
919 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
920 ; AVX512-NEXT: vmovd %xmm0, %eax
921 ; AVX512-NEXT: vzeroupper
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
923 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
; Multiply-reduction of a <16 x i32> vector to a scalar i32.
; How each expected sequence first folds the wide input down to one xmm:
;  - SSE4.1 multiplies the four input registers xmm0..xmm3 pairwise (pmulld).
;  - AVX1 extracts lane 1 of ymm1 and ymm0 and uses only 128-bit vpmulld.
;  - AVX2 issues one 256-bit vpmulld on ymm0/ymm1, then extracts lane 1.
;  - AVX512 extracts the upper 256 bits with vextracti64x4, multiplies with a
;    zmm-register vpmulld, then extracts lane 1 of ymm0.
; Every variant then finishes with the [2,3,0,1] and [1,1,2,3] shuffle+multiply
; steps. The SSE2 expectations use pmuludq plus pshufd/punpckldq shuffles.
927 define i32 @test_v16i32(<16 x i32> %a0) {
928 ; SSE2-LABEL: test_v16i32:
930 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
931 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
932 ; SSE2-NEXT: pmuludq %xmm4, %xmm5
933 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
934 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
935 ; SSE2-NEXT: pmuludq %xmm4, %xmm6
936 ; SSE2-NEXT: pmuludq %xmm5, %xmm6
937 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
938 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
939 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
940 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
941 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
942 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
943 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
944 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
945 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
946 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,0,0]
947 ; SSE2-NEXT: pmuludq %xmm6, %xmm1
948 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
949 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
950 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
951 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
952 ; SSE2-NEXT: movd %xmm1, %eax
955 ; SSE41-LABEL: test_v16i32:
957 ; SSE41-NEXT: pmulld %xmm3, %xmm1
958 ; SSE41-NEXT: pmulld %xmm2, %xmm0
959 ; SSE41-NEXT: pmulld %xmm1, %xmm0
960 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
961 ; SSE41-NEXT: pmulld %xmm0, %xmm1
962 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
963 ; SSE41-NEXT: pmulld %xmm1, %xmm0
964 ; SSE41-NEXT: movd %xmm0, %eax
967 ; AVX1-LABEL: test_v16i32:
969 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
970 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
971 ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
972 ; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
973 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
974 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
975 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
976 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
977 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
978 ; AVX1-NEXT: vmovd %xmm0, %eax
979 ; AVX1-NEXT: vzeroupper
982 ; AVX2-LABEL: test_v16i32:
984 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
985 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
986 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
987 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
988 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
989 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
990 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
991 ; AVX2-NEXT: vmovd %xmm0, %eax
992 ; AVX2-NEXT: vzeroupper
995 ; AVX512-LABEL: test_v16i32:
997 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
998 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
999 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1000 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1001 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1002 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1003 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1004 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1005 ; AVX512-NEXT: vmovd %xmm0, %eax
1006 ; AVX512-NEXT: vzeroupper
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1008 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0)
; Multiply-reduction of a <32 x i32> vector to a scalar i32.
; How each expected sequence first folds the wide input down to one xmm:
;  - SSE2 / SSE4.1 combine the eight input registers xmm0..xmm7 (pmuludq with
;    extra [1,1,3,3] shuffles on SSE2, plain pmulld on SSE4.1).
;  - AVX1 works on ymm0..ymm3 but only with 128-bit vpmulld, extracting lane 1
;    of each ymm with vextractf128.
;  - AVX2 chains three 256-bit vpmulld, then extracts lane 1 of ymm0.
;  - AVX512 multiplies zmm0 by zmm1, then halves via vextracti64x4 and
;    vextracti128.
; Every variant finishes with the [2,3,0,1] and [1,1,2,3] shuffle+multiply steps.
1012 define i32 @test_v32i32(<32 x i32> %a0) {
1013 ; SSE2-LABEL: test_v32i32:
1015 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
1016 ; SSE2-NEXT: pmuludq %xmm6, %xmm2
1017 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
1018 ; SSE2-NEXT: pmuludq %xmm4, %xmm0
1019 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
1020 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1021 ; SSE2-NEXT: pmuludq %xmm7, %xmm3
1022 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
1023 ; SSE2-NEXT: pmuludq %xmm5, %xmm1
1024 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
1025 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
1026 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
1027 ; SSE2-NEXT: pmuludq %xmm8, %xmm0
1028 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
1029 ; SSE2-NEXT: pmuludq %xmm9, %xmm3
1030 ; SSE2-NEXT: pmuludq %xmm0, %xmm3
1031 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
1032 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
1033 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
1034 ; SSE2-NEXT: pmuludq %xmm10, %xmm2
1035 ; SSE2-NEXT: pmuludq %xmm0, %xmm2
1036 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1037 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1038 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1039 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1040 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1041 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
1042 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
1043 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
1044 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
1045 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1046 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1047 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1048 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
1049 ; SSE2-NEXT: movd %xmm1, %eax
1052 ; SSE41-LABEL: test_v32i32:
1054 ; SSE41-NEXT: pmulld %xmm6, %xmm2
1055 ; SSE41-NEXT: pmulld %xmm4, %xmm0
1056 ; SSE41-NEXT: pmulld %xmm2, %xmm0
1057 ; SSE41-NEXT: pmulld %xmm7, %xmm3
1058 ; SSE41-NEXT: pmulld %xmm5, %xmm1
1059 ; SSE41-NEXT: pmulld %xmm3, %xmm1
1060 ; SSE41-NEXT: pmulld %xmm0, %xmm1
1061 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1062 ; SSE41-NEXT: pmulld %xmm1, %xmm0
1063 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1064 ; SSE41-NEXT: pmulld %xmm0, %xmm1
1065 ; SSE41-NEXT: movd %xmm1, %eax
1068 ; AVX1-LABEL: test_v32i32:
1070 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4
1071 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1072 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1073 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
1074 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1075 ; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
1076 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1077 ; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
1078 ; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
1079 ; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
1080 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1081 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1082 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1083 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1084 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1085 ; AVX1-NEXT: vmovd %xmm0, %eax
1086 ; AVX1-NEXT: vzeroupper
1089 ; AVX2-LABEL: test_v32i32:
1091 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
1092 ; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
1093 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1094 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1095 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1096 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1097 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1098 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1099 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1100 ; AVX2-NEXT: vmovd %xmm0, %eax
1101 ; AVX2-NEXT: vzeroupper
1104 ; AVX512-LABEL: test_v32i32:
1106 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1107 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1108 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1109 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1110 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1111 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1112 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1113 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1114 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
1115 ; AVX512-NEXT: vmovd %xmm0, %eax
1116 ; AVX512-NEXT: vzeroupper
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1118 %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0)
; Multiply-reduction of a <2 x i16> vector to a scalar i16.
; Every expected sequence shifts each 32-bit lane right by 16 (psrld $16) so
; that element 1 lines up with element 0, multiplies with pmullw/vpmullw and
; moves the low 32 bits into %eax. The trailing '# kill' line is matched
; verbatim from the expected assembly output.
1126 define i16 @test_v2i16(<2 x i16> %a0) {
1127 ; SSE-LABEL: test_v2i16:
1129 ; SSE-NEXT: movdqa %xmm0, %xmm1
1130 ; SSE-NEXT: psrld $16, %xmm1
1131 ; SSE-NEXT: pmullw %xmm0, %xmm1
1132 ; SSE-NEXT: movd %xmm1, %eax
1133 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1136 ; AVX-LABEL: test_v2i16:
1138 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1139 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1140 ; AVX-NEXT: vmovd %xmm0, %eax
1141 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
1144 ; AVX512-LABEL: test_v2i16:
1146 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1147 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1148 ; AVX512-NEXT: vmovd %xmm0, %eax
1149 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1151 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> %a0)
; Multiply-reduction of a <4 x i16> vector to a scalar i16.
; Every expected sequence is two fold steps with pmullw/vpmullw. The first
; pairs the vector with a pshufd [1,1,2,3] copy of itself, the second with a
; psrld $16 copy of the intermediate result. The low 32 bits are then moved
; into %eax. The trailing '# kill' line is matched verbatim from the expected
; assembly output.
1155 define i16 @test_v4i16(<4 x i16> %a0) {
1156 ; SSE-LABEL: test_v4i16:
1158 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1159 ; SSE-NEXT: pmullw %xmm0, %xmm1
1160 ; SSE-NEXT: movdqa %xmm1, %xmm0
1161 ; SSE-NEXT: psrld $16, %xmm0
1162 ; SSE-NEXT: pmullw %xmm1, %xmm0
1163 ; SSE-NEXT: movd %xmm0, %eax
1164 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1167 ; AVX-LABEL: test_v4i16:
1169 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1170 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1171 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1172 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1173 ; AVX-NEXT: vmovd %xmm0, %eax
1174 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
1177 ; AVX512-LABEL: test_v4i16:
1179 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1180 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1181 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1182 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1183 ; AVX512-NEXT: vmovd %xmm0, %eax
1184 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1186 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> %a0)
; Multiply-reduction of an <8 x i16> vector to a scalar i16.
; Every expected sequence is three fold steps with pmullw/vpmullw. The vector
; is paired with a pshufd [2,3,0,1] copy, then a pshufd [1,1,2,3] copy, then a
; psrld $16 copy of the running result. The low 32 bits are then moved into
; %eax. The trailing '# kill' line is matched verbatim from the expected
; assembly output.
1190 define i16 @test_v8i16(<8 x i16> %a0) {
1191 ; SSE-LABEL: test_v8i16:
1193 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1194 ; SSE-NEXT: pmullw %xmm0, %xmm1
1195 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1196 ; SSE-NEXT: pmullw %xmm1, %xmm0
1197 ; SSE-NEXT: movdqa %xmm0, %xmm1
1198 ; SSE-NEXT: psrld $16, %xmm1
1199 ; SSE-NEXT: pmullw %xmm0, %xmm1
1200 ; SSE-NEXT: movd %xmm1, %eax
1201 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1204 ; AVX-LABEL: test_v8i16:
1206 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1207 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1208 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1209 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1210 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1211 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1212 ; AVX-NEXT: vmovd %xmm0, %eax
1213 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
1216 ; AVX512-LABEL: test_v8i16:
1218 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1219 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1220 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1221 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1222 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1223 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1224 ; AVX512-NEXT: vmovd %xmm0, %eax
1225 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1227 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0)
; Multiply-reduction of a <16 x i16> vector to a scalar i16.
; The SSE expectations first multiply the two input registers xmm0 and xmm1.
; The AVX1, AVX2 and AVX512 expectations first extract 128-bit lane 1 of ymm0
; (vextractf128 on AVX1, vextracti128 otherwise) and multiply it with xmm0.
; All variants then apply the pshufd [2,3,0,1], pshufd [1,1,2,3] and psrld $16
; fold steps with pmullw/vpmullw. The ymm-based variants end with vzeroupper.
1231 define i16 @test_v16i16(<16 x i16> %a0) {
1232 ; SSE-LABEL: test_v16i16:
1234 ; SSE-NEXT: pmullw %xmm1, %xmm0
1235 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1236 ; SSE-NEXT: pmullw %xmm0, %xmm1
1237 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1238 ; SSE-NEXT: pmullw %xmm1, %xmm0
1239 ; SSE-NEXT: movdqa %xmm0, %xmm1
1240 ; SSE-NEXT: psrld $16, %xmm1
1241 ; SSE-NEXT: pmullw %xmm0, %xmm1
1242 ; SSE-NEXT: movd %xmm1, %eax
1243 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1246 ; AVX1-LABEL: test_v16i16:
1248 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1249 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1250 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1251 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1252 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1253 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1254 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1255 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1256 ; AVX1-NEXT: vmovd %xmm0, %eax
1257 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
1258 ; AVX1-NEXT: vzeroupper
1261 ; AVX2-LABEL: test_v16i16:
1263 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1264 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1265 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1266 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1267 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1268 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1269 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1270 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1271 ; AVX2-NEXT: vmovd %xmm0, %eax
1272 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
1273 ; AVX2-NEXT: vzeroupper
1276 ; AVX512-LABEL: test_v16i16:
1278 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1279 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1280 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1281 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1282 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1283 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1284 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1285 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1286 ; AVX512-NEXT: vmovd %xmm0, %eax
1287 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
1288 ; AVX512-NEXT: vzeroupper
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1290 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0)
; Multiply-reduction of a <32 x i16> vector to a scalar i16.
; This test has separate AVX512 expectations per feature set because the
; first fold step differs:
;  - the AVX512BW and AVX512BWVL sequences take the input in zmm0, extract
;    the upper 256 bits with vextracti64x4 and use a zmm-register vpmullw;
;  - the AVX512DQ and AVX512DQVL sequences take the input in ymm0/ymm1 and
;    start with a 256-bit vpmullw, the same as the AVX2 sequence.
; SSE combines xmm0..xmm3 with pmullw; AVX1 only uses 128-bit vpmullw.
; All variants finish with the pshufd [2,3,0,1], pshufd [1,1,2,3] and
; psrld $16 fold steps.
1294 define i16 @test_v32i16(<32 x i16> %a0) {
1295 ; SSE-LABEL: test_v32i16:
1297 ; SSE-NEXT: pmullw %xmm3, %xmm1
1298 ; SSE-NEXT: pmullw %xmm2, %xmm0
1299 ; SSE-NEXT: pmullw %xmm1, %xmm0
1300 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1301 ; SSE-NEXT: pmullw %xmm0, %xmm1
1302 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
1303 ; SSE-NEXT: pmullw %xmm1, %xmm0
1304 ; SSE-NEXT: movdqa %xmm0, %xmm1
1305 ; SSE-NEXT: psrld $16, %xmm1
1306 ; SSE-NEXT: pmullw %xmm0, %xmm1
1307 ; SSE-NEXT: movd %xmm1, %eax
1308 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1311 ; AVX1-LABEL: test_v32i16:
1313 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1314 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1315 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
1316 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1317 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1318 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1319 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1320 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1321 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1322 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1323 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1324 ; AVX1-NEXT: vmovd %xmm0, %eax
1325 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
1326 ; AVX1-NEXT: vzeroupper
1329 ; AVX2-LABEL: test_v32i16:
1331 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1332 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1333 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1334 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1335 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1336 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1337 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1338 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1339 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1340 ; AVX2-NEXT: vmovd %xmm0, %eax
1341 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
1342 ; AVX2-NEXT: vzeroupper
1345 ; AVX512BW-LABEL: test_v32i16:
1346 ; AVX512BW: # %bb.0:
1347 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1348 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1349 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1350 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1351 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1352 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1353 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1354 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1355 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1356 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1357 ; AVX512BW-NEXT: vmovd %xmm0, %eax
1358 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
1359 ; AVX512BW-NEXT: vzeroupper
1360 ; AVX512BW-NEXT: retq
1362 ; AVX512BWVL-LABEL: test_v32i16:
1363 ; AVX512BWVL: # %bb.0:
1364 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1365 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1366 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1367 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1368 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1369 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1370 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1371 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1372 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
1373 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1374 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
1375 ; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
1376 ; AVX512BWVL-NEXT: vzeroupper
1377 ; AVX512BWVL-NEXT: retq
1379 ; AVX512DQ-LABEL: test_v32i16:
1380 ; AVX512DQ: # %bb.0:
1381 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1382 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
1383 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1384 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1385 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1386 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1387 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1388 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
1389 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1390 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
1391 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
1392 ; AVX512DQ-NEXT: vzeroupper
1393 ; AVX512DQ-NEXT: retq
1395 ; AVX512DQVL-LABEL: test_v32i16:
1396 ; AVX512DQVL: # %bb.0:
1397 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1398 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1399 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1400 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1401 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1402 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1403 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1404 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
1405 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1406 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
1407 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
1408 ; AVX512DQVL-NEXT: vzeroupper
1409 ; AVX512DQVL-NEXT: retq
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1410 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0)
; Multiply-reduction of a <64 x i16> vector to a scalar i16.
; This test has separate AVX512 expectations per feature set because the
; first fold steps differ:
;  - the AVX512BW and AVX512BWVL sequences multiply zmm0 by zmm1 with a
;    zmm-register vpmullw, then halve via vextracti64x4 and vextracti128;
;  - the AVX512DQ and AVX512DQVL sequences take the input in ymm0..ymm3 and
;    chain three 256-bit vpmullw, the same as the AVX2 sequence.
; SSE combines xmm0..xmm7 with pmullw; AVX1 only uses 128-bit vpmullw.
; All variants finish with the pshufd [2,3,0,1], pshufd [1,1,2,3] and
; psrld $16 fold steps.
1414 define i16 @test_v64i16(<64 x i16> %a0) {
1415 ; SSE-LABEL: test_v64i16:
1417 ; SSE-NEXT: pmullw %xmm6, %xmm2
1418 ; SSE-NEXT: pmullw %xmm4, %xmm0
1419 ; SSE-NEXT: pmullw %xmm2, %xmm0
1420 ; SSE-NEXT: pmullw %xmm7, %xmm3
1421 ; SSE-NEXT: pmullw %xmm5, %xmm1
1422 ; SSE-NEXT: pmullw %xmm3, %xmm1
1423 ; SSE-NEXT: pmullw %xmm0, %xmm1
1424 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
1425 ; SSE-NEXT: pmullw %xmm1, %xmm0
1426 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1427 ; SSE-NEXT: pmullw %xmm0, %xmm1
1428 ; SSE-NEXT: movdqa %xmm1, %xmm0
1429 ; SSE-NEXT: psrld $16, %xmm0
1430 ; SSE-NEXT: pmullw %xmm1, %xmm0
1431 ; SSE-NEXT: movd %xmm0, %eax
1432 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
1435 ; AVX1-LABEL: test_v64i16:
1437 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4
1438 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1439 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1440 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
1441 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1442 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
1443 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1444 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
1445 ; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
1446 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
1447 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1448 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1449 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1450 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1451 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1452 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1453 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1454 ; AVX1-NEXT: vmovd %xmm0, %eax
1455 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
1456 ; AVX1-NEXT: vzeroupper
1459 ; AVX2-LABEL: test_v64i16:
1461 ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
1462 ; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
1463 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1464 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1465 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1466 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1467 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1468 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1469 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1470 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1471 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1472 ; AVX2-NEXT: vmovd %xmm0, %eax
1473 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
1474 ; AVX2-NEXT: vzeroupper
1477 ; AVX512BW-LABEL: test_v64i16:
1478 ; AVX512BW: # %bb.0:
1479 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1480 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1481 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1482 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1483 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1484 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1485 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1486 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1487 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1488 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1489 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1490 ; AVX512BW-NEXT: vmovd %xmm0, %eax
1491 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
1492 ; AVX512BW-NEXT: vzeroupper
1493 ; AVX512BW-NEXT: retq
1495 ; AVX512BWVL-LABEL: test_v64i16:
1496 ; AVX512BWVL: # %bb.0:
1497 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1498 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1499 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1500 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1501 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1502 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1503 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1504 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1505 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1506 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
1507 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1508 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
1509 ; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
1510 ; AVX512BWVL-NEXT: vzeroupper
1511 ; AVX512BWVL-NEXT: retq
1513 ; AVX512DQ-LABEL: test_v64i16:
1514 ; AVX512DQ: # %bb.0:
1515 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
1516 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
1517 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1518 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
1519 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1520 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1521 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1522 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1523 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1524 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
1525 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1526 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
1527 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
1528 ; AVX512DQ-NEXT: vzeroupper
1529 ; AVX512DQ-NEXT: retq
1531 ; AVX512DQVL-LABEL: test_v64i16:
1532 ; AVX512DQVL: # %bb.0:
1533 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
1534 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
1535 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1536 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1537 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1538 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1539 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1540 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1541 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1542 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
1543 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1544 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
1545 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
1546 ; AVX512DQVL-NEXT: vzeroupper
1547 ; AVX512DQVL-NEXT: retq
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1548 %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0)
; Multiply-reduction of a <2 x i8> vector to a scalar i8.
; Every expected sequence performs the byte multiply with the 16-bit
; pmullw/vpmullw. One operand is a psrlw $8 copy of the input; the other is
; the input widened to 16-bit lanes (punpcklbw on SSE2, pmovzxbw on the SSE4.1,
; AVX and AVX512 variants). SSE2 reads the result with movd, the other variants
; with pextrb $0. The trailing '# kill' line is matched verbatim from the
; expected assembly output.
1556 define i8 @test_v2i8(<2 x i8> %a0) {
1557 ; SSE2-LABEL: test_v2i8:
1559 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1560 ; SSE2-NEXT: psrlw $8, %xmm1
1561 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1562 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1563 ; SSE2-NEXT: movd %xmm0, %eax
1564 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1567 ; SSE41-LABEL: test_v2i8:
1569 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1570 ; SSE41-NEXT: psrlw $8, %xmm0
1571 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1572 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1573 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1576 ; AVX-LABEL: test_v2i8:
1578 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1579 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1580 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1581 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1582 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1585 ; AVX512-LABEL: test_v2i8:
1587 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1588 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1589 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1590 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1591 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1593 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> %a0)
; Multiply-reduction of a <4 x i8> vector to a scalar i8.
; Every expected sequence is two pmullw/vpmullw fold steps on bytes widened
; to 16-bit lanes, with the intermediate product narrowed back to bytes in
; between:
;  - SSE2 masks the product with a RIP-relative constant (pand) and repacks
;    it with packuswb;
;  - the SSE4.1, AVX and AVX512 variants select the even bytes with
;    pshufb [0,2,4,6,...].
; The second step matches the two-element pattern (psrlw $8 against the
; widened value). SSE2 reads the result with movd, the others with pextrb $0.
1597 define i8 @test_v4i8(<4 x i8> %a0) {
1598 ; SSE2-LABEL: test_v4i8:
1600 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1601 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1602 ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1603 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1604 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1605 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1606 ; SSE2-NEXT: packuswb %xmm0, %xmm0
1607 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1608 ; SSE2-NEXT: psrlw $8, %xmm1
1609 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1610 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1611 ; SSE2-NEXT: movd %xmm0, %eax
1612 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1615 ; SSE41-LABEL: test_v4i8:
1617 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1618 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1619 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1620 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1621 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1622 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1623 ; SSE41-NEXT: psrlw $8, %xmm0
1624 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1625 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1626 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1629 ; AVX-LABEL: test_v4i8:
1631 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1632 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1633 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1634 ; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
1635 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1636 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1637 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1638 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1639 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1640 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1643 ; AVX512-LABEL: test_v4i8:
1645 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1646 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1647 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1648 ; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
1649 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1650 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1651 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1652 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1653 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1654 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
; Reduce every element of %a0 with the experimental reduce.mul intrinsic.
1656 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8> %a0)
; test_v8i8 - multiply-reduction of the <8 x i8> argument %a0 through the
; @llvm.experimental.vector.reduce.mul.i8.v8i8 intrinsic, yielding an i8.
; The expected assembly is pinned separately for each check prefix used below
; (SSE2, SSE41, AVX and AVX512).
; These expectations are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
1660 define i8 @test_v8i8(<8 x i8> %a0) {
1661 ; SSE2-LABEL: test_v8i8:
1663 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
1664 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1665 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1666 ; SSE2-NEXT: pmullw %xmm0, %xmm1
1667 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
1668 ; SSE2-NEXT: pand %xmm0, %xmm1
1669 ; SSE2-NEXT: packuswb %xmm1, %xmm1
1670 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1671 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1672 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1673 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1674 ; SSE2-NEXT: pmullw %xmm2, %xmm1
1675 ; SSE2-NEXT: pand %xmm0, %xmm1
1676 ; SSE2-NEXT: packuswb %xmm1, %xmm1
1677 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1678 ; SSE2-NEXT: psrlw $8, %xmm0
1679 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1680 ; SSE2-NEXT: pmullw %xmm0, %xmm1
1681 ; SSE2-NEXT: movd %xmm1, %eax
1682 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1685 ; SSE41-LABEL: test_v8i8:
1687 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1688 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1689 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1690 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1691 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1692 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1693 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1694 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1695 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1696 ; SSE41-NEXT: pmullw %xmm2, %xmm0
1697 ; SSE41-NEXT: pshufb %xmm1, %xmm0
1698 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1699 ; SSE41-NEXT: psrlw $8, %xmm0
1700 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1701 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1702 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1705 ; AVX-LABEL: test_v8i8:
1707 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1708 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1709 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1710 ; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
1711 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1712 ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1713 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1714 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1715 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1716 ; AVX-NEXT: vpmullw %xmm0, %xmm2, %xmm0
1717 ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1718 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1719 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1720 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1721 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1722 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1725 ; AVX512-LABEL: test_v8i8:
1727 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1728 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1729 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1730 ; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
1731 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1732 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1733 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1734 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1735 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1736 ; AVX512-NEXT: vpmullw %xmm0, %xmm2, %xmm0
1737 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1738 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1739 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1740 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1741 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1742 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1744 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> %a0)
; test_v16i8 - multiply-reduction of the <16 x i8> argument %a0 through the
; @llvm.experimental.vector.reduce.mul.i8.v16i8 intrinsic, yielding an i8.
; The expected assembly is pinned separately for each check prefix used below
; (SSE2, SSE41, AVX1, AVX2, AVX512BW, AVX512BWVL, AVX512DQ and AVX512DQVL).
; These expectations are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
1748 define i8 @test_v16i8(<16 x i8> %a0) {
1749 ; SSE2-LABEL: test_v16i8:
1751 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1752 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1753 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1754 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1755 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1756 ; SSE2-NEXT: pand %xmm1, %xmm0
1757 ; SSE2-NEXT: pxor %xmm2, %xmm2
1758 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1759 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
1760 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1761 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1762 ; SSE2-NEXT: pmullw %xmm0, %xmm3
1763 ; SSE2-NEXT: pand %xmm1, %xmm3
1764 ; SSE2-NEXT: packuswb %xmm2, %xmm3
1765 ; SSE2-NEXT: movdqa %xmm3, %xmm0
1766 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1767 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1768 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1769 ; SSE2-NEXT: pmullw %xmm0, %xmm3
1770 ; SSE2-NEXT: pand %xmm1, %xmm3
1771 ; SSE2-NEXT: packuswb %xmm2, %xmm3
1772 ; SSE2-NEXT: movdqa %xmm3, %xmm0
1773 ; SSE2-NEXT: psrlw $8, %xmm0
1774 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1775 ; SSE2-NEXT: pmullw %xmm0, %xmm3
1776 ; SSE2-NEXT: pand %xmm1, %xmm3
1777 ; SSE2-NEXT: packuswb %xmm2, %xmm3
1778 ; SSE2-NEXT: movd %xmm3, %eax
1779 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
1782 ; SSE41-LABEL: test_v16i8:
1784 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1785 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1786 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1787 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1788 ; SSE41-NEXT: pand %xmm1, %xmm0
1789 ; SSE41-NEXT: pxor %xmm2, %xmm2
1790 ; SSE41-NEXT: packuswb %xmm2, %xmm0
1791 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1792 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1793 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1794 ; SSE41-NEXT: pmullw %xmm3, %xmm0
1795 ; SSE41-NEXT: pand %xmm1, %xmm0
1796 ; SSE41-NEXT: packuswb %xmm2, %xmm0
1797 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1798 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1799 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1800 ; SSE41-NEXT: pmullw %xmm3, %xmm0
1801 ; SSE41-NEXT: pand %xmm1, %xmm0
1802 ; SSE41-NEXT: packuswb %xmm2, %xmm0
1803 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1804 ; SSE41-NEXT: psrlw $8, %xmm0
1805 ; SSE41-NEXT: pmullw %xmm1, %xmm0
1806 ; SSE41-NEXT: pextrb $0, %xmm0, %eax
1807 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
1810 ; AVX1-LABEL: test_v16i8:
1812 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1813 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1814 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1815 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1816 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1817 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1818 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1819 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1820 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1821 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1822 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
1823 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1824 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1825 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1826 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1827 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1828 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
1829 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1830 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1831 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1832 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1833 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1834 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1835 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1838 ; AVX2-LABEL: test_v16i8:
1840 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1841 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1842 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1843 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1844 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1845 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1846 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1847 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1848 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
1849 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1850 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1851 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
1852 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1853 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1854 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1855 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2
1856 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1857 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1858 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
1859 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1860 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1861 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
1862 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1863 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1864 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1865 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1866 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1867 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1868 ; AVX2-NEXT: vzeroupper
1871 ; AVX512BW-LABEL: test_v16i8:
1872 ; AVX512BW: # %bb.0:
1873 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1874 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1875 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1876 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1877 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
1878 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
1879 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1880 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1881 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
1882 ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
1883 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1884 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1885 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
1886 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
1887 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1888 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1889 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1890 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
1891 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
1892 ; AVX512BW-NEXT: vzeroupper
1893 ; AVX512BW-NEXT: retq
1895 ; AVX512BWVL-LABEL: test_v16i8:
1896 ; AVX512BWVL: # %bb.0:
1897 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1898 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1899 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1900 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1901 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
1902 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
1903 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1904 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1905 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
1906 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
1907 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1908 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1909 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
1910 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
1911 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1912 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1913 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1914 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
1915 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
1916 ; AVX512BWVL-NEXT: vzeroupper
1917 ; AVX512BWVL-NEXT: retq
1919 ; AVX512DQ-LABEL: test_v16i8:
1920 ; AVX512DQ: # %bb.0:
1921 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1922 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1923 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1924 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1925 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1926 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
1927 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
1928 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1929 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1930 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1931 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
1932 ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
1933 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1934 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1935 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1936 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
1937 ; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
1938 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1939 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1940 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1941 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1942 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
1943 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
1944 ; AVX512DQ-NEXT: vzeroupper
1945 ; AVX512DQ-NEXT: retq
1947 ; AVX512DQVL-LABEL: test_v16i8:
1948 ; AVX512DQVL: # %bb.0:
1949 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1950 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1951 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1952 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1953 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1954 ; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
1955 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
1956 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1957 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1958 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1959 ; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
1960 ; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
1961 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1962 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1963 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1964 ; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
1965 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
1966 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1967 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1968 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1969 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1970 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
1971 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
1972 ; AVX512DQVL-NEXT: vzeroupper
1973 ; AVX512DQVL-NEXT: retq
1974 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0)
1978 define i8 @test_v32i8(<32 x i8> %a0) {
1979 ; SSE2-LABEL: test_v32i8:
1981 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1982 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1983 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1984 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1985 ; SSE2-NEXT: pmullw %xmm2, %xmm3
1986 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1987 ; SSE2-NEXT: pand %xmm2, %xmm3
1988 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1989 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1990 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1991 ; SSE2-NEXT: pand %xmm2, %xmm0
1992 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1993 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1994 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1995 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1996 ; SSE2-NEXT: pmullw %xmm1, %xmm0
1997 ; SSE2-NEXT: pand %xmm2, %xmm0
1998 ; SSE2-NEXT: pxor %xmm1, %xmm1
1999 ; SSE2-NEXT: packuswb %xmm1, %xmm0
2000 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,2,3,3]
2001 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2002 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2003 ; SSE2-NEXT: pmullw %xmm0, %xmm3
2004 ; SSE2-NEXT: pand %xmm2, %xmm3
2005 ; SSE2-NEXT: packuswb %xmm1, %xmm3
2006 ; SSE2-NEXT: movdqa %xmm3, %xmm0
2007 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2008 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2009 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2010 ; SSE2-NEXT: pmullw %xmm0, %xmm3
2011 ; SSE2-NEXT: pand %xmm2, %xmm3
2012 ; SSE2-NEXT: packuswb %xmm1, %xmm3
2013 ; SSE2-NEXT: movdqa %xmm3, %xmm0
2014 ; SSE2-NEXT: psrlw $8, %xmm0
2015 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2016 ; SSE2-NEXT: pmullw %xmm0, %xmm3
2017 ; SSE2-NEXT: pand %xmm2, %xmm3
2018 ; SSE2-NEXT: packuswb %xmm1, %xmm3
2019 ; SSE2-NEXT: movd %xmm3, %eax
2020 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
2023 ; SSE41-LABEL: test_v32i8:
2025 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2026 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2027 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2028 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2029 ; SSE41-NEXT: pmullw %xmm1, %xmm0
2030 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2031 ; SSE41-NEXT: pand %xmm1, %xmm0
2032 ; SSE41-NEXT: pmullw %xmm2, %xmm3
2033 ; SSE41-NEXT: pand %xmm1, %xmm3
2034 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2035 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2036 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2037 ; SSE41-NEXT: pmullw %xmm0, %xmm3
2038 ; SSE41-NEXT: pand %xmm1, %xmm3
2039 ; SSE41-NEXT: pxor %xmm0, %xmm0
2040 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2041 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2042 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2043 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2044 ; SSE41-NEXT: pmullw %xmm2, %xmm3
2045 ; SSE41-NEXT: pand %xmm1, %xmm3
2046 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2047 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2048 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2049 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
2050 ; SSE41-NEXT: pmullw %xmm2, %xmm3
2051 ; SSE41-NEXT: pand %xmm1, %xmm3
2052 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2053 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2054 ; SSE41-NEXT: psrlw $8, %xmm3
2055 ; SSE41-NEXT: pmullw %xmm0, %xmm3
2056 ; SSE41-NEXT: pextrb $0, %xmm3, %eax
2057 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
2060 ; AVX1-LABEL: test_v32i8:
2062 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2063 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2064 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2065 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
2066 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2067 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2068 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2069 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2070 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2071 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2072 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2073 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2074 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2075 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2076 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2077 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2078 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2079 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2080 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2081 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2082 ; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
2083 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2084 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2085 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2086 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2087 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
2088 ; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
2089 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2090 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2091 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
2092 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2093 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2094 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2095 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2096 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
2097 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
2098 ; AVX1-NEXT: vzeroupper
2101 ; AVX2-LABEL: test_v32i8:
2103 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2104 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2105 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2106 ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm3
2107 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2108 ; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm3
2109 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2110 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2111 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2112 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2113 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2114 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2115 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2116 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2117 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2118 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2119 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2120 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
2121 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2122 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2123 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2124 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2125 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2126 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm3
2127 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2128 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2129 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2130 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2131 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2132 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm3
2133 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2134 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2135 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2136 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
2137 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2138 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
2139 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
2140 ; AVX2-NEXT: vzeroupper
2143 ; AVX512BW-LABEL: test_v32i8:
2144 ; AVX512BW: # %bb.0:
2145 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
2146 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2147 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2148 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2149 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
2150 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2151 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2152 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2153 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
2154 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
2155 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2156 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2157 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
2158 ; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
2159 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2160 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2161 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
2162 ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
2163 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2164 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2165 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2166 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
2167 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
2168 ; AVX512BW-NEXT: vzeroupper
2169 ; AVX512BW-NEXT: retq
2171 ; AVX512BWVL-LABEL: test_v32i8:
2172 ; AVX512BWVL: # %bb.0:
2173 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
2174 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2175 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2176 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2177 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
2178 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2179 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2180 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2181 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
2182 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
2183 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2184 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2185 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
2186 ; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
2187 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2188 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2189 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
2190 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
2191 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2192 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2193 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
2194 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
2195 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
2196 ; AVX512BWVL-NEXT: vzeroupper
2197 ; AVX512BWVL-NEXT: retq
2199 ; AVX512DQ-LABEL: test_v32i8:
2200 ; AVX512DQ: # %bb.0:
2201 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
2202 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2203 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
2204 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm3
2205 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2206 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3
2207 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2208 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2209 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2210 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2211 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2212 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2213 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2214 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2215 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2216 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
2217 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2218 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
2219 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2220 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2221 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2222 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2223 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2224 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm3
2225 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2226 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2227 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2228 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2229 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2230 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm3
2231 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2232 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2233 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2234 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
2235 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2236 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
2237 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
2238 ; AVX512DQ-NEXT: vzeroupper
2239 ; AVX512DQ-NEXT: retq
2241 ; AVX512DQVL-LABEL: test_v32i8:
2242 ; AVX512DQVL: # %bb.0:
2243 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
2244 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2245 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2246 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
2247 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2248 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
2249 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2250 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2251 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2252 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2253 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2254 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2255 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2256 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2257 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2258 ; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
2259 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2260 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
2261 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2262 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2263 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2264 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2265 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2266 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm2
2267 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2268 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2269 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2270 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2271 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2272 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm2
2273 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2274 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2275 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2276 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2277 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2278 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
2279 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
2280 ; AVX512DQVL-NEXT: vzeroupper
2281 ; AVX512DQVL-NEXT: retq
2282 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0)
2286 define i8 @test_v64i8(<64 x i8> %a0) {
2287 ; SSE2-LABEL: test_v64i8:
2289 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2290 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2291 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2292 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2293 ; SSE2-NEXT: pmullw %xmm4, %xmm5
2294 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2295 ; SSE2-NEXT: pand %xmm4, %xmm5
2296 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2297 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2298 ; SSE2-NEXT: pmullw %xmm2, %xmm0
2299 ; SSE2-NEXT: pand %xmm4, %xmm0
2300 ; SSE2-NEXT: packuswb %xmm5, %xmm0
2301 ; SSE2-NEXT: movdqa %xmm3, %xmm2
2302 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2303 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2304 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2305 ; SSE2-NEXT: pmullw %xmm2, %xmm5
2306 ; SSE2-NEXT: pand %xmm4, %xmm5
2307 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2308 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2309 ; SSE2-NEXT: pmullw %xmm3, %xmm1
2310 ; SSE2-NEXT: pand %xmm4, %xmm1
2311 ; SSE2-NEXT: packuswb %xmm5, %xmm1
2312 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2313 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2314 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2315 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2316 ; SSE2-NEXT: pmullw %xmm2, %xmm3
2317 ; SSE2-NEXT: pand %xmm4, %xmm3
2318 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2319 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2320 ; SSE2-NEXT: pmullw %xmm1, %xmm0
2321 ; SSE2-NEXT: pand %xmm4, %xmm0
2322 ; SSE2-NEXT: packuswb %xmm3, %xmm0
2323 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2324 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2325 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2326 ; SSE2-NEXT: pmullw %xmm1, %xmm0
2327 ; SSE2-NEXT: pand %xmm4, %xmm0
2328 ; SSE2-NEXT: pxor %xmm1, %xmm1
2329 ; SSE2-NEXT: packuswb %xmm1, %xmm0
2330 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
2331 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2332 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2333 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2334 ; SSE2-NEXT: pand %xmm4, %xmm2
2335 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2336 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2337 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2338 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2339 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2340 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2341 ; SSE2-NEXT: pand %xmm4, %xmm2
2342 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2343 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2344 ; SSE2-NEXT: psrlw $8, %xmm0
2345 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2346 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2347 ; SSE2-NEXT: pand %xmm4, %xmm2
2348 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2349 ; SSE2-NEXT: movd %xmm2, %eax
2350 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
2353 ; SSE41-LABEL: test_v64i8:
2355 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2356 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2357 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2358 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2359 ; SSE41-NEXT: pmullw %xmm2, %xmm0
2360 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2361 ; SSE41-NEXT: pand %xmm2, %xmm0
2362 ; SSE41-NEXT: pmullw %xmm5, %xmm4
2363 ; SSE41-NEXT: pand %xmm2, %xmm4
2364 ; SSE41-NEXT: packuswb %xmm0, %xmm4
2365 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2366 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2367 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2368 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2369 ; SSE41-NEXT: pmullw %xmm3, %xmm1
2370 ; SSE41-NEXT: pand %xmm2, %xmm1
2371 ; SSE41-NEXT: pmullw %xmm0, %xmm5
2372 ; SSE41-NEXT: pand %xmm2, %xmm5
2373 ; SSE41-NEXT: packuswb %xmm1, %xmm5
2374 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2375 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2376 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2377 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2378 ; SSE41-NEXT: pmullw %xmm5, %xmm4
2379 ; SSE41-NEXT: pand %xmm2, %xmm4
2380 ; SSE41-NEXT: pmullw %xmm0, %xmm1
2381 ; SSE41-NEXT: pand %xmm2, %xmm1
2382 ; SSE41-NEXT: packuswb %xmm4, %xmm1
2383 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2384 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2385 ; SSE41-NEXT: pmullw %xmm0, %xmm1
2386 ; SSE41-NEXT: pand %xmm2, %xmm1
2387 ; SSE41-NEXT: pxor %xmm0, %xmm0
2388 ; SSE41-NEXT: packuswb %xmm0, %xmm1
2389 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2390 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2391 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2392 ; SSE41-NEXT: pmullw %xmm3, %xmm1
2393 ; SSE41-NEXT: pand %xmm2, %xmm1
2394 ; SSE41-NEXT: packuswb %xmm0, %xmm1
2395 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2396 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2397 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
2398 ; SSE41-NEXT: pmullw %xmm3, %xmm1
2399 ; SSE41-NEXT: pand %xmm2, %xmm1
2400 ; SSE41-NEXT: packuswb %xmm0, %xmm1
2401 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2402 ; SSE41-NEXT: psrlw $8, %xmm1
2403 ; SSE41-NEXT: pmullw %xmm0, %xmm1
2404 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
2405 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
2408 ; AVX1-LABEL: test_v64i8:
2410 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2411 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2412 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
2413 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2414 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
2415 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2416 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2417 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
2418 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
2419 ; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
2420 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2421 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2422 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2423 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2424 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
2425 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
2426 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2427 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2428 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2429 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2430 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
2431 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2432 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2433 ; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
2434 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
2435 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2436 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2437 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
2438 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2439 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2440 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2441 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2442 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2443 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2444 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2445 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2446 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2447 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2448 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2449 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
2450 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2451 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2452 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2453 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2454 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
2455 ; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
2456 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2457 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2458 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
2459 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2460 ; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
2461 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
2462 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
2463 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
2464 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
2465 ; AVX1-NEXT: vzeroupper
2468 ; AVX2-LABEL: test_v64i8:
2470 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2471 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2472 ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm3
2473 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2474 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
2475 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2476 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2477 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2478 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2479 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2480 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2481 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2482 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2483 ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3
2484 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
2485 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2486 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2487 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2488 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2489 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2490 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2491 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2492 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2493 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2494 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
2495 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2496 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
2497 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2498 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2499 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2500 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2501 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2502 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm3
2503 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2504 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2505 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2506 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2507 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2508 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm3
2509 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2510 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2511 ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2512 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
2513 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2514 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
2515 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
2516 ; AVX2-NEXT: vzeroupper
2519 ; AVX512BW-LABEL: test_v64i8:
2520 ; AVX512BW: # %bb.0:
2521 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2522 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2523 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2524 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
2525 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2526 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
2527 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2528 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2529 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2530 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2531 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2532 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
2533 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2534 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2535 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
2536 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
2537 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2538 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2539 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2540 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2541 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2542 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2543 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2544 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2545 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2546 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
2547 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2548 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
2549 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2550 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2551 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2552 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2553 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2554 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
2555 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2556 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2557 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2558 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2559 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2560 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm2
2561 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2562 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2563 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2564 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
2565 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2566 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
2567 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
2568 ; AVX512BW-NEXT: vzeroupper
2569 ; AVX512BW-NEXT: retq
2571 ; AVX512BWVL-LABEL: test_v64i8:
2572 ; AVX512BWVL: # %bb.0:
2573 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2574 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2575 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2576 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2
2577 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2578 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
2579 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2580 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2581 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2582 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2583 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2584 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
2585 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2586 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
2587 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
2588 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
2589 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2590 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
2591 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2592 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2593 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
2594 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
2595 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2596 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
2597 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2598 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
2599 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2600 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
2601 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2602 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2603 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2604 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2605 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2606 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2
2607 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2608 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2609 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2610 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2611 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2612 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm2
2613 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
2614 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
2615 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
2616 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
2617 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
2618 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
2619 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
2620 ; AVX512BWVL-NEXT: vzeroupper
2621 ; AVX512BWVL-NEXT: retq
2623 ; AVX512DQ-LABEL: test_v64i8:
2624 ; AVX512DQ: # %bb.0:
2625 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2626 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2627 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm3
2628 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2629 ; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm3
2630 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2631 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2632 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2633 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2634 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2635 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
2636 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2637 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2638 ; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
2639 ; AVX512DQ-NEXT: vpand %ymm2, %ymm3, %ymm3
2640 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2641 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2642 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2643 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2644 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
2645 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2646 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2647 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2648 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2649 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
2650 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2651 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
2652 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2653 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2654 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2655 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2656 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2657 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm3
2658 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2659 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2660 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2661 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2662 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2663 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm3
2664 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2665 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
2666 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0
2667 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
2668 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2669 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
2670 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
2671 ; AVX512DQ-NEXT: vzeroupper
2672 ; AVX512DQ-NEXT: retq
2674 ; AVX512DQVL-LABEL: test_v64i8:
2675 ; AVX512DQVL: # %bb.0:
2676 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2677 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2678 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
2679 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2680 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
2681 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2682 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2683 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2684 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2685 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2686 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
2687 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2688 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
2689 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
2690 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
2691 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2692 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
2693 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2694 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2695 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
2696 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
2697 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2698 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
2699 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2700 ; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
2701 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2702 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
2703 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2704 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2705 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2706 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2707 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2708 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm2
2709 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2710 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2711 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2712 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2713 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2714 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm2
2715 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
2716 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
2717 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
2718 ; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
2719 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
2720 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
2721 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
2722 ; AVX512DQVL-NEXT: vzeroupper
2723 ; AVX512DQVL-NEXT: retq
2724 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0)
2728 define i8 @test_v128i8(<128 x i8> %a0) {
2729 ; SSE2-LABEL: test_v128i8:
2731 ; SSE2-NEXT: movdqa %xmm5, %xmm8
2732 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
2733 ; SSE2-NEXT: movdqa %xmm1, %xmm9
2734 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
2735 ; SSE2-NEXT: pmullw %xmm8, %xmm9
2736 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
2737 ; SSE2-NEXT: pand %xmm8, %xmm9
2738 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
2739 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2740 ; SSE2-NEXT: pmullw %xmm5, %xmm1
2741 ; SSE2-NEXT: pand %xmm8, %xmm1
2742 ; SSE2-NEXT: packuswb %xmm9, %xmm1
2743 ; SSE2-NEXT: movdqa %xmm7, %xmm9
2744 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
2745 ; SSE2-NEXT: movdqa %xmm3, %xmm5
2746 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2747 ; SSE2-NEXT: pmullw %xmm9, %xmm5
2748 ; SSE2-NEXT: pand %xmm8, %xmm5
2749 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
2750 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2751 ; SSE2-NEXT: pmullw %xmm7, %xmm3
2752 ; SSE2-NEXT: pand %xmm8, %xmm3
2753 ; SSE2-NEXT: packuswb %xmm5, %xmm3
2754 ; SSE2-NEXT: movdqa %xmm4, %xmm5
2755 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2756 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2757 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
2758 ; SSE2-NEXT: pmullw %xmm5, %xmm7
2759 ; SSE2-NEXT: pand %xmm8, %xmm7
2760 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2761 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2762 ; SSE2-NEXT: pmullw %xmm4, %xmm0
2763 ; SSE2-NEXT: pand %xmm8, %xmm0
2764 ; SSE2-NEXT: packuswb %xmm7, %xmm0
2765 ; SSE2-NEXT: movdqa %xmm6, %xmm4
2766 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2767 ; SSE2-NEXT: movdqa %xmm2, %xmm5
2768 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2769 ; SSE2-NEXT: pmullw %xmm4, %xmm5
2770 ; SSE2-NEXT: pand %xmm8, %xmm5
2771 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
2772 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2773 ; SSE2-NEXT: pmullw %xmm6, %xmm2
2774 ; SSE2-NEXT: pand %xmm8, %xmm2
2775 ; SSE2-NEXT: packuswb %xmm5, %xmm2
2776 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2777 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2778 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2779 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2780 ; SSE2-NEXT: pmullw %xmm4, %xmm5
2781 ; SSE2-NEXT: pand %xmm8, %xmm5
2782 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2783 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2784 ; SSE2-NEXT: pmullw %xmm2, %xmm0
2785 ; SSE2-NEXT: pand %xmm8, %xmm0
2786 ; SSE2-NEXT: packuswb %xmm5, %xmm0
2787 ; SSE2-NEXT: movdqa %xmm3, %xmm2
2788 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2789 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2790 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2791 ; SSE2-NEXT: pmullw %xmm2, %xmm4
2792 ; SSE2-NEXT: pand %xmm8, %xmm4
2793 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2794 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2795 ; SSE2-NEXT: pmullw %xmm3, %xmm1
2796 ; SSE2-NEXT: pand %xmm8, %xmm1
2797 ; SSE2-NEXT: packuswb %xmm4, %xmm1
2798 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2799 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2800 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2801 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2802 ; SSE2-NEXT: pmullw %xmm2, %xmm3
2803 ; SSE2-NEXT: pand %xmm8, %xmm3
2804 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2805 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2806 ; SSE2-NEXT: pmullw %xmm1, %xmm0
2807 ; SSE2-NEXT: pand %xmm8, %xmm0
2808 ; SSE2-NEXT: packuswb %xmm3, %xmm0
2809 ; SSE2-NEXT: movdqa %xmm0, %xmm1
2810 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2811 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2812 ; SSE2-NEXT: pmullw %xmm1, %xmm0
2813 ; SSE2-NEXT: pand %xmm8, %xmm0
2814 ; SSE2-NEXT: pxor %xmm1, %xmm1
2815 ; SSE2-NEXT: packuswb %xmm1, %xmm0
2816 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
2817 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2818 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2819 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2820 ; SSE2-NEXT: pand %xmm8, %xmm2
2821 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2822 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2823 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2824 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2825 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2826 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2827 ; SSE2-NEXT: pand %xmm8, %xmm2
2828 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2829 ; SSE2-NEXT: movdqa %xmm2, %xmm0
2830 ; SSE2-NEXT: psrlw $8, %xmm0
2831 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2832 ; SSE2-NEXT: pmullw %xmm0, %xmm2
2833 ; SSE2-NEXT: pand %xmm8, %xmm2
2834 ; SSE2-NEXT: packuswb %xmm1, %xmm2
2835 ; SSE2-NEXT: movd %xmm2, %eax
2836 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
2839 ; SSE41-LABEL: test_v128i8:
2841 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2842 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2843 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2844 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2845 ; SSE41-NEXT: pmullw %xmm5, %xmm1
2846 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2847 ; SSE41-NEXT: pand %xmm5, %xmm1
2848 ; SSE41-NEXT: pmullw %xmm9, %xmm8
2849 ; SSE41-NEXT: pand %xmm5, %xmm8
2850 ; SSE41-NEXT: packuswb %xmm1, %xmm8
2851 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
2852 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
2853 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2854 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2855 ; SSE41-NEXT: pmullw %xmm7, %xmm3
2856 ; SSE41-NEXT: pand %xmm5, %xmm3
2857 ; SSE41-NEXT: pmullw %xmm9, %xmm1
2858 ; SSE41-NEXT: pand %xmm5, %xmm1
2859 ; SSE41-NEXT: packuswb %xmm3, %xmm1
2860 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2861 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2862 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2863 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2864 ; SSE41-NEXT: pmullw %xmm4, %xmm0
2865 ; SSE41-NEXT: pand %xmm5, %xmm0
2866 ; SSE41-NEXT: pmullw %xmm7, %xmm3
2867 ; SSE41-NEXT: pand %xmm5, %xmm3
2868 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2869 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2870 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2871 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2872 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2873 ; SSE41-NEXT: pmullw %xmm6, %xmm2
2874 ; SSE41-NEXT: pand %xmm5, %xmm2
2875 ; SSE41-NEXT: pmullw %xmm0, %xmm4
2876 ; SSE41-NEXT: pand %xmm5, %xmm4
2877 ; SSE41-NEXT: packuswb %xmm2, %xmm4
2878 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
2879 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2880 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2881 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2882 ; SSE41-NEXT: pmullw %xmm4, %xmm3
2883 ; SSE41-NEXT: pand %xmm5, %xmm3
2884 ; SSE41-NEXT: pmullw %xmm2, %xmm0
2885 ; SSE41-NEXT: pand %xmm5, %xmm0
2886 ; SSE41-NEXT: packuswb %xmm3, %xmm0
2887 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2888 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2889 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
2890 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
2891 ; SSE41-NEXT: pmullw %xmm1, %xmm8
2892 ; SSE41-NEXT: pand %xmm5, %xmm8
2893 ; SSE41-NEXT: pmullw %xmm2, %xmm3
2894 ; SSE41-NEXT: pand %xmm5, %xmm3
2895 ; SSE41-NEXT: packuswb %xmm8, %xmm3
2896 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2897 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2898 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2899 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2900 ; SSE41-NEXT: pmullw %xmm3, %xmm0
2901 ; SSE41-NEXT: pand %xmm5, %xmm0
2902 ; SSE41-NEXT: pmullw %xmm1, %xmm2
2903 ; SSE41-NEXT: pand %xmm5, %xmm2
2904 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2905 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2906 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2907 ; SSE41-NEXT: pmullw %xmm0, %xmm2
2908 ; SSE41-NEXT: pand %xmm5, %xmm2
2909 ; SSE41-NEXT: pxor %xmm0, %xmm0
2910 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2911 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2912 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2913 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2914 ; SSE41-NEXT: pmullw %xmm1, %xmm2
2915 ; SSE41-NEXT: pand %xmm5, %xmm2
2916 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2917 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2918 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2919 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
2920 ; SSE41-NEXT: pmullw %xmm1, %xmm2
2921 ; SSE41-NEXT: pand %xmm5, %xmm2
2922 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2923 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2924 ; SSE41-NEXT: psrlw $8, %xmm2
2925 ; SSE41-NEXT: pmullw %xmm0, %xmm2
2926 ; SSE41-NEXT: pextrb $0, %xmm2, %eax
2927 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
2930 ; AVX1-LABEL: test_v128i8:
2932 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
2933 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2934 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
2935 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2936 ; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm7
2937 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2938 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2939 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2940 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2941 ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
2942 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
2943 ; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm8
2944 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
2945 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2946 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
2947 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
2948 ; AVX1-NEXT: vpmullw %xmm9, %xmm7, %xmm7
2949 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
2950 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2951 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
2952 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
2953 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
2954 ; AVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm6
2955 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2956 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2957 ; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
2958 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
2959 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2960 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2961 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2962 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2963 ; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
2964 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2965 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2966 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
2967 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2968 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2969 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2970 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
2971 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2972 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2973 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2974 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2975 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
2976 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2977 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2978 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2979 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2980 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2981 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2982 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2983 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
2984 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
2985 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2986 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
2987 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
2988 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
2989 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2990 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
2991 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2992 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2993 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
2994 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2995 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2996 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2997 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
2998 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2999 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3000 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3001 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3002 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
3003 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3004 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
3005 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3006 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3007 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3008 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3009 ; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
3010 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3011 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3012 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3013 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3014 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
3015 ; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
3016 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3017 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3018 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
3019 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
3020 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
3021 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
3022 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3023 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
3024 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
3025 ; AVX1-NEXT: vzeroupper
3028 ; AVX2-LABEL: test_v128i8:
3030 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
3031 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3032 ; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5
3033 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3034 ; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5
3035 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3036 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3037 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3038 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3039 ; AVX2-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
3040 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
3041 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3042 ; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2
3043 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3044 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
3045 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3046 ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
3047 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
3048 ; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
3049 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3050 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3051 ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
3052 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3053 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3054 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3055 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3056 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3057 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3058 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3059 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3060 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3061 ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
3062 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
3063 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3064 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3065 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3066 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3067 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3068 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3069 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3070 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3071 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3072 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
3073 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3074 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
3075 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3076 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3077 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3078 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3079 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3080 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2
3081 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3082 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3083 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3084 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3085 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3086 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
3087 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3088 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3089 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3090 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
3091 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3092 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
3093 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
3094 ; AVX2-NEXT: vzeroupper
3097 ; AVX512BW-LABEL: test_v128i8:
3098 ; AVX512BW: # %bb.0:
3099 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3100 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3101 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3102 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3103 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
3104 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3105 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3106 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3107 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3108 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3109 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3110 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3111 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3112 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3113 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
3114 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3115 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3116 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3117 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3118 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3119 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
3120 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3121 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3122 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3123 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
3124 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3125 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3126 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3127 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3128 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3129 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3130 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3131 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3132 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3133 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3134 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3135 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
3136 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3137 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3138 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3139 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3140 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3141 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
3142 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3143 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3144 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3145 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3146 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3147 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm2
3148 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3149 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3150 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3151 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3152 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3153 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
3154 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
3155 ; AVX512BW-NEXT: vzeroupper
3156 ; AVX512BW-NEXT: retq
3158 ; AVX512BWVL-LABEL: test_v128i8:
3159 ; AVX512BWVL: # %bb.0:
3160 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3161 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3162 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3163 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3164 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
3165 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3166 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3167 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3168 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3169 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3170 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3171 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3172 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3173 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3174 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
3175 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3176 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3177 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3178 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3179 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3180 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
3181 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3182 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3183 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3184 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
3185 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3186 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
3187 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3188 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3189 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
3190 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3191 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3192 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3193 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3194 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3195 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3196 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
3197 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3198 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3199 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3200 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3201 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3202 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2
3203 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3204 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3205 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3206 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3207 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3208 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm2
3209 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3210 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3211 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3212 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3213 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3214 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
3215 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
3216 ; AVX512BWVL-NEXT: vzeroupper
3217 ; AVX512BWVL-NEXT: retq
3219 ; AVX512DQ-LABEL: test_v128i8:
3220 ; AVX512DQ: # %bb.0:
3221 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
3222 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3223 ; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm5
3224 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3225 ; AVX512DQ-NEXT: vpand %ymm4, %ymm5, %ymm5
3226 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3227 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3228 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3229 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3230 ; AVX512DQ-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
3231 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
3232 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3233 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm5, %ymm2
3234 ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2
3235 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
3236 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3237 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
3238 ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
3239 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
3240 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3241 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3242 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
3243 ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2
3244 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3245 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3246 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3247 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3248 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3249 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
3250 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3251 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3252 ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
3253 ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2
3254 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3255 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3256 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3257 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3258 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3259 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3260 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3261 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3262 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3263 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
3264 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3265 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
3266 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3267 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3268 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3269 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3270 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3271 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2
3272 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3273 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3274 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3275 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3276 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3277 ; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm2
3278 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3279 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3280 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3281 ; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
3282 ; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3283 ; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
3284 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
3285 ; AVX512DQ-NEXT: vzeroupper
3286 ; AVX512DQ-NEXT: retq
3288 ; AVX512DQVL-LABEL: test_v128i8:
3289 ; AVX512DQVL: # %bb.0:
3290 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
3291 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3292 ; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
3293 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3294 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm4, %ymm4
3295 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3296 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3297 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3298 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3299 ; AVX512DQVL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
3300 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
3301 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3302 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2
3303 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm2, %ymm2
3304 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
3305 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3306 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
3307 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm1, %ymm1
3308 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
3309 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3310 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3311 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
3312 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm2, %ymm2
3313 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3314 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3315 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3316 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3317 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3318 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
3319 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3320 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
3321 ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
3322 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm2, %ymm2
3323 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3324 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
3325 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3326 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3327 ; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
3328 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
3329 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3330 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
3331 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3332 ; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3333 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3334 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
3335 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3336 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3337 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3338 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3339 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3340 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm2
3341 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3342 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3343 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3344 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3345 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3346 ; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm2
3347 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
3348 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
3349 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
3350 ; AVX512DQVL-NEXT: vpand %ymm5, %ymm0, %ymm0
3351 ; AVX512DQVL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
3352 ; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
3353 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
3354 ; AVX512DQVL-NEXT: vzeroupper
3355 ; AVX512DQVL-NEXT: retq
3356 %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0)
; Declarations of the llvm.experimental.vector.reduce.mul.* intrinsics, one per
; vector type. Each takes a single vector operand and returns one scalar whose
; type matches the vector's element type. The declarations are grouped by
; element type, narrowest vector first.
;
; i64 elements: <2 x i64> through <16 x i64>.
3360 declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
3361 declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
3362 declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
3363 declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
; i32 elements: <2 x i32> through <32 x i32>.
3365 declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
3366 declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
3367 declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
3368 declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
3369 declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
; i16 elements: <2 x i16> through <64 x i16>.
3371 declare i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16>)
3372 declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
3373 declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
3374 declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
3375 declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
3376 declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
; i8 elements: <2 x i8> through <128 x i8>.
3378 declare i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8>)
3379 declare i8 @llvm.experimental.vector.reduce.mul.i8.v4i8(<4 x i8>)
3380 declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
3381 declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
3382 declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
3383 declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
3384 declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)