; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-32,SLM,SLM-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=SSE-64,SLM,SLM-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-32,SLOW,SLOW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE-64,SLOW,SLOW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-32,SSE4,SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE-64,SSE4,SSE4-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,AVX2-SLOW64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX2-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX2-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512DQ-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,AVX512BW-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,AVX512BW-64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-32,AVX512-32,KNL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX2,AVX-64,AVX512-64,KNL-64

; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
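
; Editor's note: the constant multiplier 18778 and the zero-extended operands
; all fit in 16 bits, so the i32 vector multiply can also be implemented as a
; pmaddwd of the widened inputs; the checks below show which configurations
; pick pmaddwd and which keep a plain pmulld.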

define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
; SSE-32-LABEL: test_mul_v4i32_v4i8:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i8:
; SSE-64: # %bb.0:
; SSE-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-64-NEXT: retq
;
; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8:
; AVX2-SLOW32: # %bb.0:
; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-SLOW32-NEXT: retl
;
; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8:
; AVX2-SLOW64: # %bb.0:
; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM-LABEL: test_mul_v8i32_v8i8:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pmaddwd %xmm2, %xmm0
; SLM-NEXT: pmaddwd %xmm2, %xmm1
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i8:
; SLOW: # %bb.0:
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmaddwd %xmm2, %xmm0
; SLOW-NEXT: pmaddwd %xmm2, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8:
; AVX2-SLOW32: # %bb.0:
; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-SLOW32-NEXT: retl
;
; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8:
; AVX2-SLOW64: # %bb.0:
; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM-LABEL: test_mul_v16i32_v16i8:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM-NEXT: pmaddwd %xmm5, %xmm0
; SLM-NEXT: pmaddwd %xmm5, %xmm1
; SLM-NEXT: pmaddwd %xmm5, %xmm2
; SLM-NEXT: pmaddwd %xmm5, %xmm3
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i8:
; SLOW: # %bb.0:
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmaddwd %xmm4, %xmm0
; SLOW-NEXT: pmaddwd %xmm4, %xmm1
; SLOW-NEXT: pmaddwd %xmm4, %xmm2
; SLOW-NEXT: pmaddwd %xmm4, %xmm3
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i8:
; SSE4: # %bb.0:
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
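
; Editor's note: for 16-bit sources without a cheap pmulld, the 32-bit
; products below are assembled from pmullw (low halves) and pmulhuw (high
; halves) interleaved with punpcklwd/punpckhwd, as the SLM and SLOW check
; lines show.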

define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
; SLM-LABEL: test_mul_v4i32_v4i16:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLM-NEXT: movdqa %xmm0, %xmm2
; SLM-NEXT: pmulhuw %xmm1, %xmm2
; SLM-NEXT: pmullw %xmm1, %xmm0
; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v4i32_v4i16:
; SLOW: # %bb.0:
; SLOW-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; SLOW-NEXT: movdqa %xmm0, %xmm2
; SLOW-NEXT: pmulhuw %xmm1, %xmm2
; SLOW-NEXT: pmullw %xmm1, %xmm0
; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-32-LABEL: test_mul_v4i32_v4i16:
; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: test_mul_v4i32_v4i16:
; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-64-NEXT: retq
;
; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX-32-LABEL: test_mul_v4i32_v4i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v4i32_v4i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM-LABEL: test_mul_v8i32_v8i16:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM-NEXT: movdqa %xmm0, %xmm2
; SLM-NEXT: pmulhuw %xmm1, %xmm2
; SLM-NEXT: pmullw %xmm0, %xmm1
; SLM-NEXT: movdqa %xmm1, %xmm0
; SLM-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i16:
; SLOW: # %bb.0:
; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW-NEXT: movdqa %xmm0, %xmm2
; SLOW-NEXT: pmulhuw %xmm1, %xmm2
; SLOW-NEXT: pmullw %xmm0, %xmm1
; SLOW-NEXT: movdqa %xmm1, %xmm0
; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i16:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm1, %xmm2
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX-32-LABEL: test_mul_v8i32_v8i16:
; AVX-32: # %bb.0:
; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_mul_v8i32_v8i16:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: retq
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM-LABEL: test_mul_v16i32_v16i16:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM-NEXT: movdqa %xmm0, %xmm4
; SLM-NEXT: movdqa %xmm0, %xmm2
; SLM-NEXT: movdqa %xmm1, %xmm5
; SLM-NEXT: pmullw %xmm3, %xmm4
; SLM-NEXT: pmulhuw %xmm3, %xmm2
; SLM-NEXT: pmulhuw %xmm3, %xmm5
; SLM-NEXT: pmullw %xmm1, %xmm3
; SLM-NEXT: movdqa %xmm4, %xmm0
; SLM-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM-NEXT: movdqa %xmm3, %xmm2
; SLM-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLM-NEXT: movdqa %xmm4, %xmm1
; SLM-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i16:
; SLOW: # %bb.0:
; SLOW-NEXT: movdqa %xmm0, %xmm4
; SLOW-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW-NEXT: movdqa %xmm0, %xmm2
; SLOW-NEXT: pmulhuw %xmm3, %xmm2
; SLOW-NEXT: pmullw %xmm3, %xmm4
; SLOW-NEXT: movdqa %xmm4, %xmm0
; SLOW-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLOW-NEXT: movdqa %xmm1, %xmm5
; SLOW-NEXT: pmulhuw %xmm3, %xmm5
; SLOW-NEXT: pmullw %xmm1, %xmm3
; SLOW-NEXT: movdqa %xmm3, %xmm2
; SLOW-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLOW-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SLOW-NEXT: movdqa %xmm4, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i16:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: pxor %xmm3, %xmm3
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm3, %xmm0
; SSE4-NEXT: pmulld %xmm3, %xmm4
; SSE4-NEXT: pmulld %xmm3, %xmm2
; SSE4-NEXT: pmulld %xmm1, %xmm3
; SSE4-NEXT: movdqa %xmm4, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1]
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}
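
; Editor's note: the _minsize variants repeat the tests above with the minsize
; attribute, where multi-instruction expansions of the multiply are no longer
; worthwhile (e.g. the pmullw/pmulhuw sequence for 16-bit sources gives way to
; a single pmulld).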

define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
; SSE-32-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-32-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i8_minsize:
; SSE-64: # %bb.0:
; SSE-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-64-NEXT: retq
;
; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-SLOW32: # %bb.0:
; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-SLOW32-NEXT: retl
;
; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-SLOW64: # %bb.0:
; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; KNL-64-NEXT: retq
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pmaddwd %xmm2, %xmm0
; SLM-NEXT: pmaddwd %xmm2, %xmm1
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i8_minsize:
; SLOW: # %bb.0:
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmaddwd %xmm2, %xmm0
; SLOW-NEXT: pmaddwd %xmm2, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i8_minsize:
; SSE4: # %bb.0:
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-SLOW32: # %bb.0:
; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-SLOW32-NEXT: retl
;
; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-SLOW64: # %bb.0:
; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW64-NEXT: retq
;
; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; KNL-64-NEXT: retq
  %z = zext <8 x i8> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLM-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; SLM-NEXT: pmaddwd %xmm5, %xmm0
; SLM-NEXT: pmaddwd %xmm5, %xmm1
; SLM-NEXT: pmaddwd %xmm5, %xmm2
; SLM-NEXT: pmaddwd %xmm5, %xmm3
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
; SLOW: # %bb.0:
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLOW-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW-NEXT: pmaddwd %xmm4, %xmm0
; SLOW-NEXT: pmaddwd %xmm4, %xmm1
; SLOW-NEXT: pmaddwd %xmm4, %xmm2
; SLOW-NEXT: pmaddwd %xmm4, %xmm3
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i8_minsize:
; SSE4: # %bb.0:
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-NEXT: pmaddwd %xmm4, %xmm2
; SSE4-NEXT: pmaddwd %xmm4, %xmm3
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-64-NEXT: retq
;
; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-32: # %bb.0:
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; KNL-32-NEXT: retl
;
; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
; KNL-64: # %bb.0:
; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; KNL-64-NEXT: retq
  %z = zext <16 x i8> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
; SSE-32-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-32-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_mul_v4i32_v4i16_minsize:
; SSE-64: # %bb.0:
; SSE-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-64-NEXT: retq
;
; AVX2-LABEL: test_mul_v4i32_v4i16_minsize:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: ret{{[l|q]}}
  %z = zext <4 x i16> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
; SLM-LABEL: test_mul_v8i32_v8i16_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM-NEXT: pxor %xmm3, %xmm3
; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SLM-NEXT: pmulld %xmm1, %xmm2
; SLM-NEXT: pmulld %xmm0, %xmm1
; SLM-NEXT: movdqa %xmm2, %xmm0
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v8i32_v8i16_minsize:
; SLOW: # %bb.0:
; SLOW-NEXT: pxor %xmm1, %xmm1
; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SLOW-NEXT: pmulld %xmm1, %xmm2
; SLOW-NEXT: pmulld %xmm0, %xmm1
; SLOW-NEXT: movdqa %xmm2, %xmm0
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v8i32_v8i16_minsize:
; SSE4: # %bb.0:
; SSE4-NEXT: pxor %xmm1, %xmm1
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm1, %xmm2
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm0
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-LABEL: test_mul_v8i32_v8i16_minsize:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: ret{{[l|q]}}
  %z = zext <8 x i16> %A to <8 x i32>
  %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <8 x i32> %m
}

define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
; SLM-LABEL: test_mul_v16i32_v16i16_minsize:
; SLM: # %bb.0:
; SLM-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLM-NEXT: movdqa %xmm0, %xmm4
; SLM-NEXT: pxor %xmm5, %xmm5
; SLM-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLM-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SLM-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SLM-NEXT: pmulld %xmm3, %xmm4
; SLM-NEXT: pmulld %xmm3, %xmm0
; SLM-NEXT: pmulld %xmm3, %xmm2
; SLM-NEXT: pmulld %xmm1, %xmm3
; SLM-NEXT: movdqa %xmm4, %xmm1
; SLM-NEXT: ret{{[l|q]}}
;
; SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
; SLOW: # %bb.0:
; SLOW-NEXT: movdqa %xmm0, %xmm4
; SLOW-NEXT: pxor %xmm3, %xmm3
; SLOW-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SLOW-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SLOW-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SLOW-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SLOW-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SLOW-NEXT: pmulld %xmm3, %xmm0
; SLOW-NEXT: pmulld %xmm3, %xmm4
; SLOW-NEXT: pmulld %xmm3, %xmm2
; SLOW-NEXT: pmulld %xmm1, %xmm3
; SLOW-NEXT: movdqa %xmm4, %xmm1
; SLOW-NEXT: ret{{[l|q]}}
;
; SSE4-LABEL: test_mul_v16i32_v16i16_minsize:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: pxor %xmm3, %xmm3
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778]
; SSE4-NEXT: pmulld %xmm3, %xmm0
; SSE4-NEXT: pmulld %xmm3, %xmm4
; SSE4-NEXT: pmulld %xmm3, %xmm2
; SSE4-NEXT: pmulld %xmm1, %xmm3
; SSE4-NEXT: movdqa %xmm4, %xmm1
; SSE4-NEXT: ret{{[l|q]}}
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: retq
;
; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-32: # %bb.0:
; AVX512-32-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; AVX512-32-NEXT: retl
;
; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
; AVX512-64: # %bb.0:
; AVX512-64-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-64-NEXT: retq
  %z = zext <16 x i16> %A to <16 x i32>
  %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
  ret <16 x i32> %m
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: