; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1. This RUN
; line has no FileCheck invocation; it only verifies that llc does not crash.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
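
; The multiplier 18778 is a uniform constant that fits in 16 bits, so for a
; v4i8 source the slow-pmulld targets can lower the multiply of the
; zero-extended value to pmaddwd; the KNL prefixes still expect vpmulld.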
define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
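
; v8i8: SLM/SLOW expand to a pmullw/pmulhw pair plus word unpacks, SSE4.1
; uses two xmm pmaddwd ops, the AVX targets widen to ymm, and KNL keeps
; vpmulld.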
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
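
; v16i8: the SSE targets need four xmm pieces; AVX512DQ and KNL fold the
; splat constant into vpmulld as a {1to16} broadcast, while AVX512BW uses
; vpmaddwd on a single zmm.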
define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i8:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa %xmm1, %xmm4
; SLM32-NEXT: movdqa %xmm3, %xmm5
; SLM32-NEXT: pmullw %xmm2, %xmm1
; SLM32-NEXT: pmullw %xmm2, %xmm3
; SLM32-NEXT: pmulhw %xmm2, %xmm4
; SLM32-NEXT: pmulhw %xmm2, %xmm5
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa %xmm1, %xmm4
; SLM64-NEXT: movdqa %xmm3, %xmm5
; SLM64-NEXT: pmullw %xmm2, %xmm1
; SLM64-NEXT: pmullw %xmm2, %xmm3
; SLM64-NEXT: pmulhw %xmm2, %xmm4
; SLM64-NEXT: pmulhw %xmm2, %xmm5
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm3
; SLOW32-NEXT: pmulhw %xmm2, %xmm3
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm4
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa %xmm3, %xmm0
; SLOW32-NEXT: pmulhw %xmm2, %xmm0
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW32-NEXT: movdqa %xmm4, %xmm0
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm3
; SLOW64-NEXT: pmulhw %xmm2, %xmm3
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm4
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa %xmm3, %xmm0
; SLOW64-NEXT: pmulhw %xmm2, %xmm0
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SLOW64-NEXT: movdqa %xmm4, %xmm0
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
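
; v4i16: zero-extended i16 values can exceed pmaddwd's signed 16-bit operand
; range, so the slow-pmulld targets use a pmullw/pmulhuw pair instead, while
; SSE4.1 and AVX fall back to pmulld.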
define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i16:
; CHECK32: # %bb.0:
; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK32-NEXT: movdqa %xmm0, %xmm2
; CHECK32-NEXT: pmulhuw %xmm1, %xmm2
; CHECK32-NEXT: pmullw %xmm1, %xmm0
; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16:
; CHECK64: # %bb.0:
; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK64-NEXT: movdqa %xmm0, %xmm2
; CHECK64-NEXT: pmulhuw %xmm1, %xmm2
; CHECK64-NEXT: pmullw %xmm1, %xmm0
; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
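
; v8i16: same pattern as v4i16, doubled: pmullw/pmulhuw plus unpacks on the
; slow-pmulld targets, two xmm pmulld on SSE4.1, one ymm vpmulld on AVX.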
define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhuw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhuw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm2, %xmm0
; SSE4-32-NEXT: pmulld %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm2, %xmm0
; SSE4-64-NEXT: pmulld %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
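
; v16i16: four xmm pmulld pieces on SSE4.1, two ymm vpmulld on AVX2, and a
; single zmm vpmulld with a {1to16} broadcast operand on AVX512.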
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: movdqa %xmm3, %xmm4
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
; SLM32-NEXT: pmulhuw %xmm0, %xmm4
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: movdqa %xmm3, %xmm4
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
; SLM64-NEXT: pmulhuw %xmm0, %xmm4
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm0, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW32-NEXT: movdqa %xmm3, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm0, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW64-NEXT: movdqa %xmm3, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm0
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm4
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm0
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm4
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
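
; The _minsize variants check that optimizing for size does not reintroduce
; the long multiply expansions. For i8 sources the pmaddwd lowering is kept.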
define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
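
; v8i8 at minsize: SLM now also uses the compact two-pmaddwd expansion
; instead of its pmullw/pmulhw sequence.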
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT: pmaddwd %xmm2, %xmm0
; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pmaddwd %xmm2, %xmm0
; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
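
; v16i8 at minsize: every SSE target uses four pmaddwd pieces; AVX512DQ and
; KNL keep the {1to16} broadcast vpmulld.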
876 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
877 ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
879 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
880 ; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
881 ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
882 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
883 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
884 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
885 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
886 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
887 ; SLM32-NEXT: pmaddwd %xmm5, %xmm0
888 ; SLM32-NEXT: pmaddwd %xmm5, %xmm1
889 ; SLM32-NEXT: pmaddwd %xmm5, %xmm2
890 ; SLM32-NEXT: pmaddwd %xmm5, %xmm3
893 ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
895 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
896 ; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
897 ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
898 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
899 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
900 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
901 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
902 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
903 ; SLM64-NEXT: pmaddwd %xmm5, %xmm0
904 ; SLM64-NEXT: pmaddwd %xmm5, %xmm1
905 ; SLM64-NEXT: pmaddwd %xmm5, %xmm2
906 ; SLM64-NEXT: pmaddwd %xmm5, %xmm3
909 ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
911 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
912 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
913 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
914 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
915 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
916 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
917 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
918 ; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
919 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
920 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
921 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
922 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm3
925 ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
927 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
928 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
929 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
930 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
931 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
932 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
933 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
934 ; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
935 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
936 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
937 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
938 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm3
941 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
943 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
944 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
945 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
946 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
947 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
948 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
949 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
950 ; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
951 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
952 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
953 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
954 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
957 ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
959 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
960 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
961 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
962 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
963 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
964 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

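; The i16 sources below cannot use (v)pmaddwd the way the i8 tests above can:
; a zero-extended i16 may exceed the signed 16-bit range that pmaddwd
; multiplies in, so under minsize even the slow-pmulld targets are expected to
; emit a single (v)pmulld rather than a longer expansion.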
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

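; The same check widened to 256 bits: the SSE-only targets split the multiply
; into two xmm pmulld ops sharing one constant register, while the AVX targets
; broadcast the constant and use a single ymm vpmulld.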
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmulld %xmm2, %xmm0
; SLM32-NEXT: pmulld %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmulld %xmm2, %xmm0
; SLM64-NEXT: pmulld %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm2, %xmm0
; SLOW32-NEXT: pmulld %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm2, %xmm0
; SLOW64-NEXT: pmulld %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm2, %xmm0
; SSE4-32-NEXT: pmulld %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm2, %xmm0
; SSE4-64-NEXT: pmulld %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

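; And widened to 512 bits: the AVX512 targets fold the broadcast constant into
; vpmulld as a {1to16} memory operand, AVX2 splits into two ymm multiplies, and
; the SSE-only targets into four xmm multiplies.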
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM32-NEXT: pmulld %xmm1, %xmm4
; SLM32-NEXT: pmulld %xmm1, %xmm0
; SLM32-NEXT: pmulld %xmm1, %xmm2
; SLM32-NEXT: pmulld %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM64-NEXT: pmulld %xmm1, %xmm4
; SLM64-NEXT: pmulld %xmm1, %xmm0
; SLM64-NEXT: pmulld %xmm1, %xmm2
; SLM64-NEXT: pmulld %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm1, %xmm0
; SLOW32-NEXT: pmulld %xmm1, %xmm2
; SLOW32-NEXT: pmulld %xmm1, %xmm4
; SLOW32-NEXT: pmulld %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm4, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm1, %xmm0
; SLOW64-NEXT: pmulld %xmm1, %xmm2
; SLOW64-NEXT: pmulld %xmm1, %xmm4
; SLOW64-NEXT: pmulld %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm4, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm0
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm4
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm0
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm4
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}