; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1

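; Zero-extend <4 x i8> and multiply by a splat of 18778. The extended
; elements fit in 16 bits, so slow-pmulld targets can use pmaddwd here.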
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

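; <8 x i8> version: slow-pmulld SSE targets expand to pmullw/pmulhw plus
; unpacks, while SSE4.1 splits the multiply into two pmaddwd halves.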
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: pand {{.*}}(%rip), %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: pand {{.*}}(%rip), %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

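; <16 x i8> version: four 128-bit pieces on SSE, two 256-bit pieces on AVX2,
; and a single 512-bit multiply on AVX-512.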
define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i8:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa %xmm1, %xmm4
; SLM32-NEXT: movdqa %xmm3, %xmm5
; SLM32-NEXT: pmullw %xmm2, %xmm1
; SLM32-NEXT: pmullw %xmm2, %xmm3
; SLM32-NEXT: pmulhw %xmm2, %xmm4
; SLM32-NEXT: pmulhw %xmm2, %xmm5
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa %xmm1, %xmm4
; SLM64-NEXT: movdqa %xmm3, %xmm5
; SLM64-NEXT: pmullw %xmm2, %xmm1
; SLM64-NEXT: pmullw %xmm2, %xmm3
; SLM64-NEXT: pmulhw %xmm2, %xmm4
; SLM64-NEXT: pmulhw %xmm2, %xmm5
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm3
; SLOW32-NEXT: pmulhw %xmm2, %xmm3
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm4
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa %xmm3, %xmm0
; SLOW32-NEXT: pmulhw %xmm2, %xmm0
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW32-NEXT: movdqa %xmm4, %xmm0
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm3
; SLOW64-NEXT: pmulhw %xmm2, %xmm3
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm4
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa %xmm3, %xmm0
; SLOW64-NEXT: pmulhw %xmm2, %xmm0
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW64-NEXT: movdqa %xmm4, %xmm0
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

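; With i16 source elements the zero-extended values can use all 16 bits, so
; pmaddwd (which treats its inputs as signed) is not usable; slow-pmulld
; targets combine pmullw/pmulhuw instead.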
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SLM32-LABEL: test_mul_v4i32_v4i16:
; SLM32: # %bb.0:
; SLM32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLM32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLM32-NEXT: movdqa %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm1, %xmm0
; SLM32-NEXT: pmulhuw %xmm1, %xmm2
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v4i32_v4i16:
; SLM64: # %bb.0:
; SLM64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLM64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLM64-NEXT: movdqa %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm1, %xmm0
; SLM64-NEXT: pmulhuw %xmm1, %xmm2
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v4i32_v4i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLOW32-NEXT: movdqa %xmm0, %xmm2
; SLOW32-NEXT: pmulhuw %xmm1, %xmm2
; SLOW32-NEXT: pmullw %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v4i32_v4i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLOW64-NEXT: movdqa %xmm0, %xmm2
; SLOW64-NEXT: pmulhuw %xmm1, %xmm2
; SLOW64-NEXT: pmullw %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pxor %xmm1, %xmm1
; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pxor %xmm1, %xmm1
; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

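; <8 x i16> version of the multiply above, split across two xmm registers on
; SSE targets.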
define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhuw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhuw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm2, %xmm0
; SSE4-32-NEXT: pmulld %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm2, %xmm0
; SSE4-64-NEXT: pmulld %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

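; <16 x i16> version: four xmm pieces on SSE, two ymm pieces on AVX2, and one
; zmm multiply on AVX-512.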
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: movdqa %xmm3, %xmm4
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
; SLM32-NEXT: pmulhuw %xmm0, %xmm4
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: movdqa %xmm3, %xmm4
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
; SLM64-NEXT: pmulhuw %xmm0, %xmm4
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm0, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW32-NEXT: movdqa %xmm3, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm0, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW64-NEXT: movdqa %xmm3, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm0
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm4
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm0
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm4
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

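; The minsize variants below repeat the same multiplies to check that, when
; optimizing for size, the shorter pmaddwd/pmulld forms are preferred even on
; slow-pmulld targets.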
define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

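; <8 x i8> with minsize: SLM also splits into two pmaddwd halves instead of
; the longer pmullw/pmulhw sequence.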
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmaddwd %xmm2, %xmm0
; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: pand {{.*}}(%rip), %xmm0
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmaddwd %xmm2, %xmm0
; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pand {{.*}}(%rip), %xmm0
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

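; <16 x i8> with minsize: four pmaddwd pieces on all SSE targets.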
930 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
931 ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
933 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
934 ; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
935 ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
936 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
937 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
938 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
939 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
940 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
941 ; SLM32-NEXT: pmaddwd %xmm5, %xmm0
942 ; SLM32-NEXT: pmaddwd %xmm5, %xmm1
943 ; SLM32-NEXT: pmaddwd %xmm5, %xmm2
944 ; SLM32-NEXT: pmaddwd %xmm5, %xmm3
947 ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
949 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
950 ; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
951 ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
952 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
953 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
954 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
955 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
956 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
957 ; SLM64-NEXT: pmaddwd %xmm5, %xmm0
958 ; SLM64-NEXT: pmaddwd %xmm5, %xmm1
959 ; SLM64-NEXT: pmaddwd %xmm5, %xmm2
960 ; SLM64-NEXT: pmaddwd %xmm5, %xmm3
963 ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
965 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
966 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
967 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
968 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
969 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
970 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
971 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
972 ; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
973 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
974 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
975 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
976 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm3
979 ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
981 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
982 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
983 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
984 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
985 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
986 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
987 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
988 ; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
989 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
990 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
991 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
992 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm3
995 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
997 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
998 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
999 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1000 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1001 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1002 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1003 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1004 ; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
1005 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
1006 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
1007 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
1008 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
1009 ; SSE4-32-NEXT: retl
1011 ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
1013 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
1014 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1015 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1016 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1017 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1018 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1019 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1020 ; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
1021 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
1022 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
1023 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
1024 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
1025 ; SSE4-64-NEXT: retq
1027 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
1029 ; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1030 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
1031 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1032 ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
1033 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
1034 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
1035 ; AVX2-32-NEXT: retl
1037 ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
1039 ; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1040 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
1041 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1042 ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
1043 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
1044 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
1045 ; AVX2-64-NEXT: retq
1047 ; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
1048 ; AVX512DQ-32: # %bb.0:
1049 ; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1050 ; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
1051 ; AVX512DQ-32-NEXT: retl
1053 ; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
1054 ; AVX512DQ-64: # %bb.0:
1055 ; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1056 ; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
1057 ; AVX512DQ-64-NEXT: retq
1059 ; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
1060 ; AVX512BW-32: # %bb.0:
1061 ; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1062 ; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
1063 ; AVX512BW-32-NEXT: retl
1065 ; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
1066 ; AVX512BW-64: # %bb.0:
1067 ; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1068 ; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
1069 ; AVX512BW-64-NEXT: retq
1071 ; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
1073 ; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1074 ; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

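; The pmaddwd trick does not apply to i16 inputs: a zero-extended i16 can
; exceed the signed 16-bit range, so at minsize even the slow-pmulld targets
; settle for a single pmulld.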
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pxor %xmm1, %xmm1
; CHECK32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pxor %xmm1, %xmm1
; CHECK64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pxor %xmm1, %xmm1
; SSE4-32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pxor %xmm1, %xmm1
; SSE4-64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

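; Same lowering widened to <8 x i32>: SSE targets split the multiply into two
; 128-bit pmulld ops, while AVX targets use a single 256-bit vpmulld.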
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmulld %xmm2, %xmm0
; SLM32-NEXT: pmulld %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmulld %xmm2, %xmm0
; SLM64-NEXT: pmulld %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm2, %xmm0
; SLOW32-NEXT: pmulld %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm2, %xmm0
; SLOW64-NEXT: pmulld %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm2, %xmm0
; SSE4-32-NEXT: pmulld %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm2, %xmm0
; SSE4-64-NEXT: pmulld %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

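; Widened again to <16 x i32>: four 128-bit pmulld ops on SSE targets, two
; 256-bit vpmulld on AVX2, and one 512-bit vpmulld with a broadcast memory
; operand on AVX512.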
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM32-NEXT: pmulld %xmm1, %xmm4
; SLM32-NEXT: pmulld %xmm1, %xmm0
; SLM32-NEXT: pmulld %xmm1, %xmm2
; SLM32-NEXT: pmulld %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM64-NEXT: pmulld %xmm1, %xmm4
; SLM64-NEXT: pmulld %xmm1, %xmm0
; SLM64-NEXT: pmulld %xmm1, %xmm2
; SLM64-NEXT: pmulld %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm1, %xmm0
; SLOW32-NEXT: pmulld %xmm1, %xmm2
; SLOW32-NEXT: pmulld %xmm1, %xmm4
; SLOW32-NEXT: pmulld %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm4, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm1, %xmm0
; SLOW64-NEXT: pmulld %xmm1, %xmm2
; SLOW64-NEXT: pmulld %xmm1, %xmm4
; SLOW64-NEXT: pmulld %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm4, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm0
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm4
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm0
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm4
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>