; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
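
; Each test below zero-extends a small-element vector and multiplies it by a
; splat of 18778 (0x495A). Both the constant and the extended lanes fit in
; 16 bits, so targets where PMULLD is slow should be able to lower the 32-bit
; multiply to cheaper word multiplies (PMADDWD or PMULLW/PMULHW pairs), which
; is what the checks below expect.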

define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i8:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
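; The PMADDWD lowering above relies on the zero extension: each 32-bit lane
; holds the byte value in its low word and zero in its high word, so a
; multiply-add against dwords of 18778 (low word 18778, high word 0) produces
; the exact 32-bit product, e.g. for a lane x = 255: 255*18778 + 0*0 = 4788390.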

define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
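; On the slow-pmulld targets each widening multiply above is assembled from
; PMULLW (low 16 bits of each product) and PMULHW (high 16 bits), with
; PUNPCKLWD/PUNPCKHWD interleaving the two halves into 32-bit lanes.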

define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i8:
; SLM32: # %bb.0:
; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM32-NEXT: movdqa %xmm0, %xmm3
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: pxor %xmm4, %xmm4
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SLM32-NEXT: movdqa %xmm3, %xmm4
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
; SLM32-NEXT: pmulhw %xmm0, %xmm4
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8:
; SLM64: # %bb.0:
; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLM64-NEXT: movdqa %xmm0, %xmm3
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: pxor %xmm4, %xmm4
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SLM64-NEXT: movdqa %xmm3, %xmm4
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
; SLM64-NEXT: pmulhw %xmm0, %xmm4
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm3
; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm4
; SLOW32-NEXT: pmulhw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW32-NEXT: pxor %xmm4, %xmm4
; SLOW32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SLOW32-NEXT: movdqa %xmm3, %xmm4
; SLOW32-NEXT: pmulhw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm3
; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm4
; SLOW64-NEXT: pmulhw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW64-NEXT: pxor %xmm4, %xmm4
; SLOW64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
; SLOW64-NEXT: movdqa %xmm3, %xmm4
; SLOW64-NEXT: pmulhw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i8:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
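; VPMADDWD on 512-bit vectors requires AVX512BW, so above only the AVX512BW
; runs use it; the AVX512DQ-only and KNL runs fall back to VPMULLD with a
; broadcast memory operand ({1to16}).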

define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; CHECK32-LABEL: test_mul_v4i32_v4i16:
; CHECK32: # %bb.0:
; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK32-NEXT: movdqa %xmm0, %xmm2
; CHECK32-NEXT: pmulhuw %xmm1, %xmm2
; CHECK32-NEXT: pmullw %xmm1, %xmm0
; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16:
; CHECK64: # %bb.0:
; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK64-NEXT: movdqa %xmm0, %xmm2
; CHECK64-NEXT: pmulhuw %xmm1, %xmm2
; CHECK64-NEXT: pmullw %xmm1, %xmm0
; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
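; With i16 sources the zero-extended lanes can reach 65535, outside PMADDWD's
; signed 16-bit operand range, so the CHECK32/CHECK64 runs above pair PMULLW
; with the unsigned-high PMULHUW instead of the signed forms the i8 tests use.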

define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm1
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm1
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm1, %xmm2
; SLOW32-NEXT: pmulhuw %xmm0, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm1, %xmm2
; SLOW64-NEXT: pmulhuw %xmm0, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pxor %xmm1, %xmm1
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm0, %xmm1
; SSE4-32-NEXT: movdqa %xmm2, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pxor %xmm1, %xmm1
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm0, %xmm1
; SSE4-64-NEXT: movdqa %xmm2, %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm4
; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm4
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
; SLM32-NEXT: pmulhuw %xmm0, %xmm1
; SLM32-NEXT: movdqa %xmm4, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm4
; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm4
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
; SLM64-NEXT: pmulhuw %xmm0, %xmm1
; SLM64-NEXT: movdqa %xmm4, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm0, %xmm1
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm0, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW32-NEXT: movdqa %xmm3, %xmm4
; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
; SLOW32-NEXT: pmullw %xmm2, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm0, %xmm1
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm0, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SLOW64-NEXT: movdqa %xmm3, %xmm4
; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
; SLOW64-NEXT: pmullw %xmm2, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: movdqa %xmm0, %xmm4
; SSE4-32-NEXT: pxor %xmm3, %xmm3
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm3, %xmm0
; SSE4-32-NEXT: pmulld %xmm3, %xmm4
; SSE4-32-NEXT: pmulld %xmm3, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: movdqa %xmm0, %xmm4
; SSE4-64-NEXT: pxor %xmm3, %xmm3
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm3, %xmm0
; SSE4-64-NEXT: pmulld %xmm3, %xmm4
; SSE4-64-NEXT: pmulld %xmm3, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
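; The *_minsize variants below repeat the same multiplies with the minsize
; attribute; note that the SLM runs then pick the shorter PMADDWD sequence
; rather than the PMULLW/PMULHW expansion used above.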

define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT: pmaddwd %xmm2, %xmm0
; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pmaddwd %xmm2, %xmm0
; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
880 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
881 ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
883 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
884 ; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
885 ; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
886 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
887 ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
888 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
889 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
890 ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
891 ; SLM32-NEXT: pmaddwd %xmm5, %xmm0
892 ; SLM32-NEXT: pmaddwd %xmm5, %xmm1
893 ; SLM32-NEXT: pmaddwd %xmm5, %xmm2
894 ; SLM32-NEXT: pmaddwd %xmm5, %xmm3
897 ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
899 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
900 ; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
901 ; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
902 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
903 ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
904 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
905 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
906 ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
907 ; SLM64-NEXT: pmaddwd %xmm5, %xmm0
908 ; SLM64-NEXT: pmaddwd %xmm5, %xmm1
909 ; SLM64-NEXT: pmaddwd %xmm5, %xmm2
910 ; SLM64-NEXT: pmaddwd %xmm5, %xmm3
913 ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
915 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
916 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
917 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
918 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
919 ; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
920 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
921 ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
922 ; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
923 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
924 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
925 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
926 ; SLOW32-NEXT: pmaddwd %xmm4, %xmm3
929 ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
931 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
932 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
933 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
934 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
935 ; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
936 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
937 ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
938 ; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
939 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
940 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
941 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
942 ; SLOW64-NEXT: pmaddwd %xmm4, %xmm3
945 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
947 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
948 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
949 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
950 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
951 ; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
952 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
953 ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
954 ; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
955 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
956 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
957 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
958 ; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3
961 ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
963 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
964 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
965 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
966 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
967 ; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
968 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
969 ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
970 ; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
971 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
972 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
973 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
974 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
977 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
979 ; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
980 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
981 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
982 ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
983 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
984 ; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
987 ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
989 ; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
990 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
991 ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
992 ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
993 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
994 ; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
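
; With minsize, the v4i32 word-source case below multiplies with a single
; pmulld on every target, including the slow-pmulld ones, trading pmulld's
; extra latency on those CPUs for the shortest sequence.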
define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}
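
; The 8-lane version: the SSE blocks below split the input into a pmovzxwd
; low half and a punpckhwd-with-zero high half that share one splat-constant
; register, while AVX widens to a single ymm pmulld.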
define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; CHECK32-LABEL: test_mul_v8i32_v8i16_minsize:
; CHECK32: # %bb.0:
; CHECK32-NEXT: pxor %xmm1, %xmm1
; CHECK32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; CHECK32-NEXT: pmulld %xmm1, %xmm2
; CHECK32-NEXT: pmulld %xmm0, %xmm1
; CHECK32-NEXT: movdqa %xmm2, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: test_mul_v8i32_v8i16_minsize:
; CHECK64: # %bb.0:
; CHECK64-NEXT: pxor %xmm1, %xmm1
; CHECK64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; CHECK64-NEXT: pmulld %xmm1, %xmm2
; CHECK64-NEXT: pmulld %xmm0, %xmm1
; CHECK64-NEXT: movdqa %xmm2, %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pxor %xmm1, %xmm1
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm1, %xmm2
; SSE4-32-NEXT: pmulld %xmm0, %xmm1
; SSE4-32-NEXT: movdqa %xmm2, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pxor %xmm1, %xmm1
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm1, %xmm2
; SSE4-64-NEXT: pmulld %xmm0, %xmm1
; SSE4-64-NEXT: movdqa %xmm2, %xmm0
; SSE4-64-NEXT: retq
;
; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}
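
; The 16-lane version: SSE splits into four xmm multiplies reusing one
; constant register, AVX2 into two ymm halves, and AVX-512 uses a single
; zmm pmulld with the splat folded as a {1to16} broadcast operand.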
define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: movdqa %xmm0, %xmm4
; SLM32-NEXT: pxor %xmm3, %xmm3
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLM32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM32-NEXT: pmulld %xmm3, %xmm4
; SLM32-NEXT: pmulld %xmm3, %xmm0
; SLM32-NEXT: pmulld %xmm3, %xmm2
; SLM32-NEXT: pmulld %xmm1, %xmm3
; SLM32-NEXT: movdqa %xmm4, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: movdqa %xmm0, %xmm4
; SLM64-NEXT: pxor %xmm3, %xmm3
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLM64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM64-NEXT: pmulld %xmm3, %xmm4
; SLM64-NEXT: pmulld %xmm3, %xmm0
; SLM64-NEXT: pmulld %xmm3, %xmm2
; SLM64-NEXT: pmulld %xmm1, %xmm3
; SLM64-NEXT: movdqa %xmm4, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW32: # %bb.0:
; SLOW32-NEXT: movdqa %xmm0, %xmm4
; SLOW32-NEXT: pxor %xmm3, %xmm3
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLOW32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW32-NEXT: pmulld %xmm3, %xmm0
; SLOW32-NEXT: pmulld %xmm3, %xmm4
; SLOW32-NEXT: pmulld %xmm3, %xmm2
; SLOW32-NEXT: pmulld %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm4, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW64: # %bb.0:
; SLOW64-NEXT: movdqa %xmm0, %xmm4
; SLOW64-NEXT: pxor %xmm3, %xmm3
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLOW64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW64-NEXT: pmulld %xmm3, %xmm0
; SLOW64-NEXT: pmulld %xmm3, %xmm4
; SLOW64-NEXT: pmulld %xmm3, %xmm2
; SLOW64-NEXT: pmulld %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm4, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: movdqa %xmm0, %xmm4
; SSE4-32-NEXT: pxor %xmm3, %xmm3
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-32-NEXT: pmulld %xmm3, %xmm0
; SSE4-32-NEXT: pmulld %xmm3, %xmm4
; SSE4-32-NEXT: pmulld %xmm3, %xmm2
; SSE4-32-NEXT: pmulld %xmm1, %xmm3
; SSE4-32-NEXT: movdqa %xmm4, %xmm1
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: movdqa %xmm0, %xmm4
; SSE4-64-NEXT: pxor %xmm3, %xmm3
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-64-NEXT: pmulld %xmm3, %xmm0
; SSE4-64-NEXT: pmulld %xmm3, %xmm4
; SSE4-64-NEXT: pmulld %xmm3, %xmm2
; SSE4-64-NEXT: pmulld %xmm1, %xmm3
; SSE4-64-NEXT: movdqa %xmm4, %xmm1
; SSE4-64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}