1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
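;
; Check that "multiply high" patterns (zero/sign extension, mul, logical/arithmetic shift
; right by 16, then truncate or re-extend) are lowered to pmulhuw/pmulhw and their
; vpmulhuw/vpmulhw forms across the SSE2/SSE4.1/AVX2/AVX512 run lines above.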
8 define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
9 ; SSE-LABEL: zext_mulhuw_v4i16:
11 ; SSE-NEXT: pmulhuw %xmm1, %xmm0
14 ; AVX-LABEL: zext_mulhuw_v4i16:
16 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
18 %a1 = zext <4 x i16> %a to <4 x i32>
19 %b1 = zext <4 x i16> %b to <4 x i32>
20 %c = mul <4 x i32> %a1, %b1
21 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
22 %e = trunc <4 x i32> %d to <4 x i16>
23 ret <4 x i16> %e
24 }
26 define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
27 ; SSE2-LABEL: and_mulhuw_v4i16:
29 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0]
30 ; SSE2-NEXT: pand %xmm4, %xmm3
31 ; SSE2-NEXT: pand %xmm4, %xmm1
32 ; SSE2-NEXT: pmuludq %xmm3, %xmm1
33 ; SSE2-NEXT: pand %xmm4, %xmm2
34 ; SSE2-NEXT: pand %xmm4, %xmm0
35 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
36 ; SSE2-NEXT: psrlq $16, %xmm0
37 ; SSE2-NEXT: psrlq $16, %xmm1
38 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
39 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
40 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
41 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
42 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
45 ; SSE41-LABEL: and_mulhuw_v4i16:
47 ; SSE41-NEXT: pxor %xmm4, %xmm4
48 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7]
49 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7]
50 ; SSE41-NEXT: pmuldq %xmm2, %xmm0
51 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7]
52 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7]
53 ; SSE41-NEXT: pmuldq %xmm3, %xmm1
54 ; SSE41-NEXT: psrlq $16, %xmm1
55 ; SSE41-NEXT: psrlq $16, %xmm0
56 ; SSE41-NEXT: packusdw %xmm1, %xmm0
57 ; SSE41-NEXT: packusdw %xmm0, %xmm0
60 ; AVX2-LABEL: and_mulhuw_v4i16:
62 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
63 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
64 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
65 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
66 ; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0
67 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
68 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
69 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
70 ; AVX2-NEXT: vzeroupper
73 ; AVX512-LABEL: and_mulhuw_v4i16:
75 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
76 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
77 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
78 ; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
79 ; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm0
80 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
81 ; AVX512-NEXT: vzeroupper
83 %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
84 %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535>
85 %c = mul <4 x i64> %a1, %b1
86 %d = lshr <4 x i64> %c, <i64 16, i64 16, i64 16, i64 16>
87 %e = trunc <4 x i64> %d to <4 x i16>
88 ret <4 x i16> %e
89 }
91 define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
92 ; SSE-LABEL: sext_mulhw_v4i16:
94 ; SSE-NEXT: pmulhw %xmm1, %xmm0
97 ; AVX-LABEL: sext_mulhw_v4i16:
99 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
101 %a1 = sext <4 x i16> %a to <4 x i32>
102 %b1 = sext <4 x i16> %b to <4 x i32>
103 %c = mul <4 x i32> %a1, %b1
104 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
105 %e = trunc <4 x i32> %d to <4 x i16>
106 ret <4 x i16> %e
107 }
109 define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
110 ; SSE2-LABEL: ashr_mulhw_v4i16:
112 ; SSE2-NEXT: psrad $16, %xmm1
113 ; SSE2-NEXT: packssdw %xmm1, %xmm1
114 ; SSE2-NEXT: psrad $16, %xmm0
115 ; SSE2-NEXT: packssdw %xmm0, %xmm0
116 ; SSE2-NEXT: pmulhw %xmm1, %xmm0
119 ; SSE41-LABEL: ashr_mulhw_v4i16:
121 ; SSE41-NEXT: psrad $16, %xmm0
122 ; SSE41-NEXT: psrad $16, %xmm1
123 ; SSE41-NEXT: pmulld %xmm1, %xmm0
124 ; SSE41-NEXT: psrld $16, %xmm0
125 ; SSE41-NEXT: packusdw %xmm0, %xmm0
128 ; AVX-LABEL: ashr_mulhw_v4i16:
130 ; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
131 ; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
132 ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
133 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
134 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
136 %a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
137 %b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
138 %c = mul <4 x i32> %a1, %b1
139 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
140 %e = trunc <4 x i32> %d to <4 x i16>
141 ret <4 x i16> %e
142 }
144 define <8 x i16> @zext_mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
145 ; SSE-LABEL: zext_mulhuw_v8i16:
147 ; SSE-NEXT: pmulhuw %xmm1, %xmm0
150 ; AVX-LABEL: zext_mulhuw_v8i16:
152 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
154 %a1 = zext <8 x i16> %a to <8 x i32>
155 %b1 = zext <8 x i16> %b to <8 x i32>
156 %c = mul <8 x i32> %a1, %b1
157 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
158 %e = trunc <8 x i32> %d to <8 x i16>
159 ret <8 x i16> %e
160 }
162 define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
163 ; SSE2-LABEL: lshr_mulhuw_v8i16:
165 ; SSE2-NEXT: psrad $16, %xmm3
166 ; SSE2-NEXT: psrad $16, %xmm2
167 ; SSE2-NEXT: packssdw %xmm3, %xmm2
168 ; SSE2-NEXT: psrad $16, %xmm1
169 ; SSE2-NEXT: psrad $16, %xmm0
170 ; SSE2-NEXT: packssdw %xmm1, %xmm0
171 ; SSE2-NEXT: pmulhuw %xmm2, %xmm0
174 ; SSE41-LABEL: lshr_mulhuw_v8i16:
176 ; SSE41-NEXT: psrld $16, %xmm1
177 ; SSE41-NEXT: psrld $16, %xmm0
178 ; SSE41-NEXT: psrld $16, %xmm3
179 ; SSE41-NEXT: pmulld %xmm1, %xmm3
180 ; SSE41-NEXT: psrld $16, %xmm2
181 ; SSE41-NEXT: pmulld %xmm2, %xmm0
182 ; SSE41-NEXT: psrld $16, %xmm3
183 ; SSE41-NEXT: psrld $16, %xmm0
184 ; SSE41-NEXT: packusdw %xmm3, %xmm0
187 ; AVX2-LABEL: lshr_mulhuw_v8i16:
189 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
190 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
191 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
192 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
193 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
194 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
195 ; AVX2-NEXT: vzeroupper
198 ; AVX512-LABEL: lshr_mulhuw_v8i16:
200 ; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
201 ; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1
202 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
203 ; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
204 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
205 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
206 ; AVX512-NEXT: vzeroupper
208 %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
209 %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
210 %c = mul <8 x i32> %a1, %b1
211 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
212 %e = trunc <8 x i32> %d to <8 x i16>
213 ret <8 x i16> %e
214 }
216 define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
217 ; SSE-LABEL: sext_mulhw_v8i16:
219 ; SSE-NEXT: pmulhw %xmm1, %xmm0
222 ; AVX-LABEL: sext_mulhw_v8i16:
224 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
226 %a1 = sext <8 x i16> %a to <8 x i32>
227 %b1 = sext <8 x i16> %b to <8 x i32>
228 %c = mul <8 x i32> %a1, %b1
229 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
230 %e = trunc <8 x i32> %d to <8 x i16>
231 ret <8 x i16> %e
232 }
234 define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
235 ; SSE2-LABEL: sextinreg_mulhw_v8i16:
237 ; SSE2-NEXT: pslld $24, %xmm1
238 ; SSE2-NEXT: psrad $24, %xmm1
239 ; SSE2-NEXT: pslld $24, %xmm0
240 ; SSE2-NEXT: psrad $24, %xmm0
241 ; SSE2-NEXT: packssdw %xmm1, %xmm0
242 ; SSE2-NEXT: pslld $25, %xmm3
243 ; SSE2-NEXT: psrad $25, %xmm3
244 ; SSE2-NEXT: pslld $25, %xmm2
245 ; SSE2-NEXT: psrad $25, %xmm2
246 ; SSE2-NEXT: packssdw %xmm3, %xmm2
247 ; SSE2-NEXT: pmullw %xmm0, %xmm2
248 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
249 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
250 ; SSE2-NEXT: psrad $31, %xmm1
251 ; SSE2-NEXT: psrad $31, %xmm0
252 ; SSE2-NEXT: packssdw %xmm1, %xmm0
255 ; SSE41-LABEL: sextinreg_mulhw_v8i16:
257 ; SSE41-NEXT: pslld $24, %xmm1
258 ; SSE41-NEXT: psrad $24, %xmm1
259 ; SSE41-NEXT: pslld $24, %xmm0
260 ; SSE41-NEXT: psrad $24, %xmm0
261 ; SSE41-NEXT: pslld $25, %xmm3
262 ; SSE41-NEXT: psrad $25, %xmm3
263 ; SSE41-NEXT: pmulld %xmm1, %xmm3
264 ; SSE41-NEXT: pslld $25, %xmm2
265 ; SSE41-NEXT: psrad $25, %xmm2
266 ; SSE41-NEXT: pmulld %xmm2, %xmm0
267 ; SSE41-NEXT: psrld $16, %xmm3
268 ; SSE41-NEXT: psrld $16, %xmm0
269 ; SSE41-NEXT: packusdw %xmm3, %xmm0
272 ; AVX2-LABEL: sextinreg_mulhw_v8i16:
274 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
275 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
276 ; AVX2-NEXT: vpslld $25, %ymm1, %ymm1
277 ; AVX2-NEXT: vpsrad $25, %ymm1, %ymm1
278 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
279 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
280 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
281 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
282 ; AVX2-NEXT: vzeroupper
285 ; AVX512-LABEL: sextinreg_mulhw_v8i16:
287 ; AVX512-NEXT: vpslld $24, %ymm0, %ymm0
288 ; AVX512-NEXT: vpsrad $24, %ymm0, %ymm0
289 ; AVX512-NEXT: vpslld $25, %ymm1, %ymm1
290 ; AVX512-NEXT: vpsrad $25, %ymm1, %ymm1
291 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
292 ; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
293 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
294 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
295 ; AVX512-NEXT: vzeroupper
297 %a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
298 %b1 = shl <8 x i32> %b, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
299 %a2 = ashr <8 x i32> %a1, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
300 %b2 = ashr <8 x i32> %b1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
301 %c = mul <8 x i32> %a2, %b2
302 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
303 %e = trunc <8 x i32> %d to <8 x i16>
304 ret <8 x i16> %e
305 }
307 define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
308 ; SSE-LABEL: zext_mulhuw_v16i16:
310 ; SSE-NEXT: pmulhuw %xmm2, %xmm0
311 ; SSE-NEXT: pmulhuw %xmm3, %xmm1
314 ; AVX-LABEL: zext_mulhuw_v16i16:
316 ; AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
318 %a1 = zext <16 x i16> %a to <16 x i32>
319 %b1 = zext <16 x i16> %b to <16 x i32>
320 %c = mul <16 x i32> %a1, %b1
321 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
322 %e = trunc <16 x i32> %d to <16 x i16>
323 ret <16 x i16> %e
324 }
326 define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
327 ; SSE2-LABEL: and_mulhuw_v16i16:
329 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
330 ; SSE2-NEXT: pand %xmm8, %xmm3
331 ; SSE2-NEXT: pand %xmm8, %xmm2
332 ; SSE2-NEXT: packssdw %xmm3, %xmm2
333 ; SSE2-NEXT: pand %xmm8, %xmm1
334 ; SSE2-NEXT: pand %xmm8, %xmm0
335 ; SSE2-NEXT: packssdw %xmm1, %xmm0
336 ; SSE2-NEXT: pand %xmm8, %xmm7
337 ; SSE2-NEXT: pand %xmm8, %xmm6
338 ; SSE2-NEXT: packssdw %xmm7, %xmm6
339 ; SSE2-NEXT: pmulhw %xmm2, %xmm6
340 ; SSE2-NEXT: pand %xmm8, %xmm5
341 ; SSE2-NEXT: pand %xmm8, %xmm4
342 ; SSE2-NEXT: packssdw %xmm5, %xmm4
343 ; SSE2-NEXT: pmulhw %xmm0, %xmm4
344 ; SSE2-NEXT: pxor %xmm0, %xmm0
345 ; SSE2-NEXT: movdqa %xmm6, %xmm1
346 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
347 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
348 ; SSE2-NEXT: packssdw %xmm1, %xmm6
349 ; SSE2-NEXT: movdqa %xmm4, %xmm1
350 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
351 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
352 ; SSE2-NEXT: packssdw %xmm1, %xmm4
353 ; SSE2-NEXT: movdqa %xmm4, %xmm0
354 ; SSE2-NEXT: movdqa %xmm6, %xmm1
357 ; SSE41-LABEL: and_mulhuw_v16i16:
359 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
360 ; SSE41-NEXT: pand %xmm8, %xmm3
361 ; SSE41-NEXT: pand %xmm8, %xmm2
362 ; SSE41-NEXT: pand %xmm8, %xmm1
363 ; SSE41-NEXT: pand %xmm8, %xmm0
364 ; SSE41-NEXT: pand %xmm8, %xmm7
365 ; SSE41-NEXT: pmaddwd %xmm3, %xmm7
366 ; SSE41-NEXT: pand %xmm8, %xmm6
367 ; SSE41-NEXT: pmaddwd %xmm2, %xmm6
368 ; SSE41-NEXT: pand %xmm8, %xmm5
369 ; SSE41-NEXT: pmaddwd %xmm1, %xmm5
370 ; SSE41-NEXT: pand %xmm8, %xmm4
371 ; SSE41-NEXT: pmaddwd %xmm4, %xmm0
372 ; SSE41-NEXT: psrld $16, %xmm7
373 ; SSE41-NEXT: psrld $16, %xmm6
374 ; SSE41-NEXT: packusdw %xmm7, %xmm6
375 ; SSE41-NEXT: psrld $16, %xmm5
376 ; SSE41-NEXT: psrld $16, %xmm0
377 ; SSE41-NEXT: packusdw %xmm5, %xmm0
378 ; SSE41-NEXT: movdqa %xmm6, %xmm1
381 ; AVX2-LABEL: and_mulhuw_v16i16:
383 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
384 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
385 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
386 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
387 ; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
388 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
389 ; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
390 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
391 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
392 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
393 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
396 ; AVX512F-LABEL: and_mulhuw_v16i16:
398 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
399 ; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0
400 ; AVX512F-NEXT: vpandd %zmm2, %zmm1, %zmm1
401 ; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0
402 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
403 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
406 ; AVX512BW-LABEL: and_mulhuw_v16i16:
408 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
409 ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
410 ; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1
411 ; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
412 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
413 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
414 ; AVX512BW-NEXT: retq
415 %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
416 %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
417 %c = mul <16 x i32> %a1, %b1
418 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
419 %e = trunc <16 x i32> %d to <16 x i16>
420 ret <16 x i16> %e
421 }
423 define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
424 ; SSE-LABEL: sext_mulhuw_v16i16:
426 ; SSE-NEXT: pmulhw %xmm2, %xmm0
427 ; SSE-NEXT: pmulhw %xmm3, %xmm1
430 ; AVX-LABEL: sext_mulhuw_v16i16:
432 ; AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
434 %a1 = sext <16 x i16> %a to <16 x i32>
435 %b1 = sext <16 x i16> %b to <16 x i32>
436 %c = mul <16 x i32> %a1, %b1
437 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
438 %e = trunc <16 x i32> %d to <16 x i16>
439 ret <16 x i16> %e
440 }
442 define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
443 ; SSE2-LABEL: ashr_mulhuw_v16i16:
445 ; SSE2-NEXT: psrad $16, %xmm5
446 ; SSE2-NEXT: psrad $16, %xmm4
447 ; SSE2-NEXT: packssdw %xmm5, %xmm4
448 ; SSE2-NEXT: psrad $16, %xmm1
449 ; SSE2-NEXT: psrad $16, %xmm0
450 ; SSE2-NEXT: packssdw %xmm1, %xmm0
451 ; SSE2-NEXT: pmulhw %xmm4, %xmm0
452 ; SSE2-NEXT: psrad $16, %xmm7
453 ; SSE2-NEXT: psrad $16, %xmm6
454 ; SSE2-NEXT: packssdw %xmm7, %xmm6
455 ; SSE2-NEXT: psrad $16, %xmm3
456 ; SSE2-NEXT: psrad $16, %xmm2
457 ; SSE2-NEXT: packssdw %xmm3, %xmm2
458 ; SSE2-NEXT: pmulhw %xmm6, %xmm2
459 ; SSE2-NEXT: movdqa %xmm2, %xmm1
462 ; SSE41-LABEL: ashr_mulhuw_v16i16:
464 ; SSE41-NEXT: psrad $16, %xmm3
465 ; SSE41-NEXT: psrad $16, %xmm2
466 ; SSE41-NEXT: psrad $16, %xmm1
467 ; SSE41-NEXT: psrad $16, %xmm0
468 ; SSE41-NEXT: psrad $16, %xmm7
469 ; SSE41-NEXT: pmulld %xmm3, %xmm7
470 ; SSE41-NEXT: psrad $16, %xmm6
471 ; SSE41-NEXT: pmulld %xmm2, %xmm6
472 ; SSE41-NEXT: psrad $16, %xmm5
473 ; SSE41-NEXT: pmulld %xmm1, %xmm5
474 ; SSE41-NEXT: psrad $16, %xmm4
475 ; SSE41-NEXT: pmulld %xmm4, %xmm0
476 ; SSE41-NEXT: psrld $16, %xmm7
477 ; SSE41-NEXT: psrld $16, %xmm6
478 ; SSE41-NEXT: packusdw %xmm7, %xmm6
479 ; SSE41-NEXT: psrld $16, %xmm5
480 ; SSE41-NEXT: psrld $16, %xmm0
481 ; SSE41-NEXT: packusdw %xmm5, %xmm0
482 ; SSE41-NEXT: movdqa %xmm6, %xmm1
485 ; AVX2-LABEL: ashr_mulhuw_v16i16:
487 ; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
488 ; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
489 ; AVX2-NEXT: vpsrad $16, %ymm3, %ymm3
490 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
491 ; AVX2-NEXT: vpsrad $16, %ymm2, %ymm2
492 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
493 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
494 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
495 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
496 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
499 ; AVX512-LABEL: ashr_mulhuw_v16i16:
501 ; AVX512-NEXT: vpsrad $16, %zmm0, %zmm0
502 ; AVX512-NEXT: vpsrad $16, %zmm1, %zmm1
503 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
504 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
505 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
507 %a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
508 %b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
509 %c = mul <16 x i32> %a1, %b1
510 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
511 %e = trunc <16 x i32> %d to <16 x i16>
512 ret <16 x i16> %e
513 }
515 define <32 x i16> @zext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
516 ; SSE-LABEL: zext_mulhuw_v32i16:
518 ; SSE-NEXT: pmulhuw %xmm4, %xmm0
519 ; SSE-NEXT: pmulhuw %xmm5, %xmm1
520 ; SSE-NEXT: pmulhuw %xmm6, %xmm2
521 ; SSE-NEXT: pmulhuw %xmm7, %xmm3
524 ; AVX2-LABEL: zext_mulhuw_v32i16:
526 ; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
527 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
530 ; AVX512F-LABEL: zext_mulhuw_v32i16:
532 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
533 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
534 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm3, %ymm2
535 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
536 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
539 ; AVX512BW-LABEL: zext_mulhuw_v32i16:
541 ; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
542 ; AVX512BW-NEXT: retq
543 %a1 = zext <32 x i16> %a to <32 x i32>
544 %b1 = zext <32 x i16> %b to <32 x i32>
545 %c = mul <32 x i32> %a1, %b1
546 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
547 %e = trunc <32 x i32> %d to <32 x i16>
548 ret <32 x i16> %e
549 }
551 define <32 x i16> @sext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
552 ; SSE-LABEL: sext_mulhuw_v32i16:
554 ; SSE-NEXT: pmulhw %xmm4, %xmm0
555 ; SSE-NEXT: pmulhw %xmm5, %xmm1
556 ; SSE-NEXT: pmulhw %xmm6, %xmm2
557 ; SSE-NEXT: pmulhw %xmm7, %xmm3
560 ; AVX2-LABEL: sext_mulhuw_v32i16:
562 ; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
563 ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
566 ; AVX512F-LABEL: sext_mulhuw_v32i16:
568 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
569 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
570 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm3, %ymm2
571 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
572 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
575 ; AVX512BW-LABEL: sext_mulhuw_v32i16:
577 ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
578 ; AVX512BW-NEXT: retq
579 %a1 = sext <32 x i16> %a to <32 x i32>
580 %b1 = sext <32 x i16> %b to <32 x i32>
581 %c = mul <32 x i32> %a1, %b1
582 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
583 %e = trunc <32 x i32> %d to <32 x i16>
584 ret <32 x i16> %e
585 }
587 define <64 x i16> @zext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
588 ; SSE-LABEL: zext_mulhuw_v64i16:
590 ; SSE-NEXT: movq %rdi, %rax
591 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
592 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
593 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
594 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
595 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
596 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
597 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
598 ; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
599 ; SSE-NEXT: movdqa %xmm7, 112(%rdi)
600 ; SSE-NEXT: movdqa %xmm6, 96(%rdi)
601 ; SSE-NEXT: movdqa %xmm5, 80(%rdi)
602 ; SSE-NEXT: movdqa %xmm4, 64(%rdi)
603 ; SSE-NEXT: movdqa %xmm3, 48(%rdi)
604 ; SSE-NEXT: movdqa %xmm2, 32(%rdi)
605 ; SSE-NEXT: movdqa %xmm1, 16(%rdi)
606 ; SSE-NEXT: movdqa %xmm0, (%rdi)
609 ; AVX2-LABEL: zext_mulhuw_v64i16:
611 ; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
612 ; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
613 ; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
614 ; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
617 ; AVX512F-LABEL: zext_mulhuw_v64i16:
619 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
620 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
621 ; AVX512F-NEXT: vpmulhuw %ymm4, %ymm5, %ymm4
622 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
623 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
624 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
625 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
626 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm4, %ymm2
627 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
628 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
631 ; AVX512BW-LABEL: zext_mulhuw_v64i16:
633 ; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm0
634 ; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1
635 ; AVX512BW-NEXT: retq
636 %a1 = zext <64 x i16> %a to <64 x i32>
637 %b1 = zext <64 x i16> %b to <64 x i32>
638 %c = mul <64 x i32> %a1, %b1
639 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
640 %e = trunc <64 x i32> %d to <64 x i16>
641 ret <64 x i16> %e
642 }
644 define <64 x i16> @sext_mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
645 ; SSE-LABEL: sext_mulhuw_v64i16:
647 ; SSE-NEXT: movq %rdi, %rax
648 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
649 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
650 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
651 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
652 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
653 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
654 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
655 ; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
656 ; SSE-NEXT: movdqa %xmm7, 112(%rdi)
657 ; SSE-NEXT: movdqa %xmm6, 96(%rdi)
658 ; SSE-NEXT: movdqa %xmm5, 80(%rdi)
659 ; SSE-NEXT: movdqa %xmm4, 64(%rdi)
660 ; SSE-NEXT: movdqa %xmm3, 48(%rdi)
661 ; SSE-NEXT: movdqa %xmm2, 32(%rdi)
662 ; SSE-NEXT: movdqa %xmm1, 16(%rdi)
663 ; SSE-NEXT: movdqa %xmm0, (%rdi)
666 ; AVX2-LABEL: sext_mulhuw_v64i16:
668 ; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
669 ; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
670 ; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
671 ; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
674 ; AVX512F-LABEL: sext_mulhuw_v64i16:
676 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
677 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
678 ; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm4
679 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
680 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
681 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
682 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
683 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm4, %ymm2
684 ; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
685 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
688 ; AVX512BW-LABEL: sext_mulhuw_v64i16:
690 ; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm0
691 ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
692 ; AVX512BW-NEXT: retq
693 %a1 = sext <64 x i16> %a to <64 x i32>
694 %b1 = sext <64 x i16> %b to <64 x i32>
695 %c = mul <64 x i32> %a1, %b1
696 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
697 %e = trunc <64 x i32> %d to <64 x i16>
698 ret <64 x i16> %e
699 }
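;
; The next two tests widen through <8 x i64> rather than <8 x i32> intermediates.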
701 define <8 x i16> @zext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
702 ; SSE-LABEL: zext_mulhuw_v8i16_i64:
704 ; SSE-NEXT: pmulhuw %xmm1, %xmm0
707 ; AVX-LABEL: zext_mulhuw_v8i16_i64:
709 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
711 %a1 = zext <8 x i16> %a to <8 x i64>
712 %b1 = zext <8 x i16> %b to <8 x i64>
713 %c = mul <8 x i64> %a1, %b1
714 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
715 %e = trunc <8 x i64> %d to <8 x i16>
716 ret <8 x i16> %e
717 }
719 define <8 x i16> @sext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
720 ; SSE-LABEL: sext_mulhuw_v8i16_i64:
722 ; SSE-NEXT: pmulhw %xmm1, %xmm0
725 ; AVX-LABEL: sext_mulhuw_v8i16_i64:
727 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
729 %a1 = sext <8 x i16> %a to <8 x i64>
730 %b1 = sext <8 x i16> %b to <8 x i64>
731 %c = mul <8 x i64> %a1, %b1
732 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
733 %e = trunc <8 x i64> %d to <8 x i16>
734 ret <8 x i16> %e
735 }
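;
; The remaining tests return the widened vector type directly (no trunc back to i16), so the
; pmulh result is zero- or sign-extended afterwards.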
737 define <4 x i32> @zext_mulhuw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
738 ; SSE2-LABEL: zext_mulhuw_v4i16_lshr:
740 ; SSE2-NEXT: pmulhuw %xmm1, %xmm0
741 ; SSE2-NEXT: pxor %xmm1, %xmm1
742 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
745 ; SSE41-LABEL: zext_mulhuw_v4i16_lshr:
747 ; SSE41-NEXT: pmulhuw %xmm1, %xmm0
748 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
751 ; AVX-LABEL: zext_mulhuw_v4i16_lshr:
753 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
754 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
756 %a1 = zext <4 x i16> %a to <4 x i32>
757 %b1 = zext <4 x i16> %b to <4 x i32>
758 %c = mul <4 x i32> %a1, %b1
759 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
760 ret <4 x i32> %d
761 }
763 define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
764 ; SSE2-LABEL: mulhsw_v4i16_lshr:
766 ; SSE2-NEXT: pmulhw %xmm1, %xmm0
767 ; SSE2-NEXT: pxor %xmm1, %xmm1
768 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
771 ; SSE41-LABEL: mulhsw_v4i16_lshr:
773 ; SSE41-NEXT: pmulhw %xmm1, %xmm0
774 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
777 ; AVX-LABEL: mulhsw_v4i16_lshr:
779 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
780 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
782 %a1 = sext <4 x i16> %a to <4 x i32>
783 %b1 = sext <4 x i16> %b to <4 x i32>
784 %c = mul <4 x i32> %a1, %b1
785 %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
786 ret <4 x i32> %d
787 }
789 define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
790 ; SSE2-LABEL: mulhsw_v4i16_ashr:
792 ; SSE2-NEXT: pmulhw %xmm1, %xmm0
793 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
794 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
795 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
796 ; SSE2-NEXT: psrad $16, %xmm0
799 ; SSE41-LABEL: mulhsw_v4i16_ashr:
801 ; SSE41-NEXT: pmulhw %xmm1, %xmm0
802 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
805 ; AVX-LABEL: mulhsw_v4i16_ashr:
807 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
808 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
810 %a1 = sext <4 x i16> %a to <4 x i32>
811 %b1 = sext <4 x i16> %b to <4 x i32>
812 %c = mul <4 x i32> %a1, %b1
813 %d = ashr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
814 ret <4 x i32> %d
815 }
817 define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
818 ; SSE2-LABEL: zext_mulhuw_v8i16_lshr:
820 ; SSE2-NEXT: movdqa %xmm0, %xmm2
821 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2
822 ; SSE2-NEXT: pxor %xmm1, %xmm1
823 ; SSE2-NEXT: movdqa %xmm2, %xmm0
824 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
825 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
826 ; SSE2-NEXT: movdqa %xmm2, %xmm1
829 ; SSE41-LABEL: zext_mulhuw_v8i16_lshr:
831 ; SSE41-NEXT: movdqa %xmm0, %xmm2
832 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
833 ; SSE41-NEXT: pxor %xmm1, %xmm1
834 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
835 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
836 ; SSE41-NEXT: movdqa %xmm2, %xmm1
839 ; AVX-LABEL: zext_mulhuw_v8i16_lshr:
841 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
842 ; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
844 %a1 = zext <8 x i16> %a to <8 x i32>
845 %b1 = zext <8 x i16> %b to <8 x i32>
846 %c = mul <8 x i32> %a1, %b1
847 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
848 ret <8 x i32> %d
849 }
851 define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
852 ; SSE2-LABEL: mulhsw_v8i16_lshr:
854 ; SSE2-NEXT: movdqa %xmm0, %xmm2
855 ; SSE2-NEXT: pmulhw %xmm1, %xmm2
856 ; SSE2-NEXT: pxor %xmm1, %xmm1
857 ; SSE2-NEXT: movdqa %xmm2, %xmm0
858 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
859 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
860 ; SSE2-NEXT: movdqa %xmm2, %xmm1
863 ; SSE41-LABEL: mulhsw_v8i16_lshr:
865 ; SSE41-NEXT: movdqa %xmm0, %xmm2
866 ; SSE41-NEXT: pmulhw %xmm1, %xmm2
867 ; SSE41-NEXT: pxor %xmm1, %xmm1
868 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
869 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
870 ; SSE41-NEXT: movdqa %xmm2, %xmm1
873 ; AVX-LABEL: mulhsw_v8i16_lshr:
875 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
876 ; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
878 %a1 = sext <8 x i16> %a to <8 x i32>
879 %b1 = sext <8 x i16> %b to <8 x i32>
880 %c = mul <8 x i32> %a1, %b1
881 %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
882 ret <8 x i32> %d
883 }
885 define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
886 ; SSE2-LABEL: mulhsw_v8i16_ashr:
888 ; SSE2-NEXT: pmulhw %xmm1, %xmm0
889 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
890 ; SSE2-NEXT: psrad $16, %xmm2
891 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
892 ; SSE2-NEXT: psrad $16, %xmm1
893 ; SSE2-NEXT: movdqa %xmm2, %xmm0
896 ; SSE41-LABEL: mulhsw_v8i16_ashr:
898 ; SSE41-NEXT: pmulhw %xmm1, %xmm0
899 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
900 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
901 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
902 ; SSE41-NEXT: movdqa %xmm2, %xmm0
905 ; AVX-LABEL: mulhsw_v8i16_ashr:
907 ; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
908 ; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
910 %a1 = sext <8 x i16> %a to <8 x i32>
911 %b1 = sext <8 x i16> %b to <8 x i32>
912 %c = mul <8 x i32> %a1, %b1
913 %d = ashr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
914 ret <8 x i32> %d
915 }
917 define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
918 ; SSE2-LABEL: zext_mulhuw_v16i16_lshr:
920 ; SSE2-NEXT: movdqa %xmm1, %xmm4
921 ; SSE2-NEXT: movdqa %xmm0, %xmm1
922 ; SSE2-NEXT: pmulhuw %xmm3, %xmm4
923 ; SSE2-NEXT: pmulhuw %xmm2, %xmm1
924 ; SSE2-NEXT: pxor %xmm3, %xmm3
925 ; SSE2-NEXT: movdqa %xmm1, %xmm0
926 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
927 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
928 ; SSE2-NEXT: movdqa %xmm4, %xmm2
929 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
930 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
931 ; SSE2-NEXT: movdqa %xmm4, %xmm3
934 ; SSE41-LABEL: zext_mulhuw_v16i16_lshr:
936 ; SSE41-NEXT: movdqa %xmm1, %xmm4
937 ; SSE41-NEXT: movdqa %xmm0, %xmm1
938 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1
939 ; SSE41-NEXT: pxor %xmm5, %xmm5
940 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
941 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
942 ; SSE41-NEXT: pmulhuw %xmm3, %xmm4
943 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
944 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
945 ; SSE41-NEXT: movdqa %xmm4, %xmm3
948 ; AVX2-LABEL: zext_mulhuw_v16i16_lshr:
950 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1
951 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
952 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
953 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
956 ; AVX512-LABEL: zext_mulhuw_v16i16_lshr:
958 ; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
959 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
961 %a1 = zext <16 x i16> %a to <16 x i32>
962 %b1 = zext <16 x i16> %b to <16 x i32>
963 %c = mul <16 x i32> %a1, %b1
964 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
965 ret <16 x i32> %d
966 }
968 define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
969 ; SSE2-LABEL: mulhsw_v16i16_lshr:
971 ; SSE2-NEXT: movdqa %xmm1, %xmm4
972 ; SSE2-NEXT: movdqa %xmm0, %xmm1
973 ; SSE2-NEXT: pmulhw %xmm3, %xmm4
974 ; SSE2-NEXT: pmulhw %xmm2, %xmm1
975 ; SSE2-NEXT: pxor %xmm3, %xmm3
976 ; SSE2-NEXT: movdqa %xmm1, %xmm0
977 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
978 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
979 ; SSE2-NEXT: movdqa %xmm4, %xmm2
980 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
981 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
982 ; SSE2-NEXT: movdqa %xmm4, %xmm3
985 ; SSE41-LABEL: mulhsw_v16i16_lshr:
987 ; SSE41-NEXT: movdqa %xmm1, %xmm4
988 ; SSE41-NEXT: movdqa %xmm0, %xmm1
989 ; SSE41-NEXT: pmulhw %xmm2, %xmm1
990 ; SSE41-NEXT: pxor %xmm5, %xmm5
991 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
992 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
993 ; SSE41-NEXT: pmulhw %xmm3, %xmm4
994 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
995 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
996 ; SSE41-NEXT: movdqa %xmm4, %xmm3
999 ; AVX2-LABEL: mulhsw_v16i16_lshr:
1001 ; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm1
1002 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1003 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1004 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1007 ; AVX512-LABEL: mulhsw_v16i16_lshr:
1009 ; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
1010 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1012 %a1 = sext <16 x i16> %a to <16 x i32>
1013 %b1 = sext <16 x i16> %b to <16 x i32>
1014 %c = mul <16 x i32> %a1, %b1
1015 %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1016 ret <16 x i32> %d
1017 }
1019 define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) {
1020 ; SSE2-LABEL: mulhsw_v16i16_ashr:
1022 ; SSE2-NEXT: pmulhw %xmm3, %xmm1
1023 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1024 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1025 ; SSE2-NEXT: pmulhw %xmm2, %xmm0
1026 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1027 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1028 ; SSE2-NEXT: psrad $16, %xmm0
1029 ; SSE2-NEXT: psrad $16, %xmm1
1030 ; SSE2-NEXT: psrad $16, %xmm4
1031 ; SSE2-NEXT: psrad $16, %xmm3
1032 ; SSE2-NEXT: movdqa %xmm4, %xmm2
1035 ; SSE41-LABEL: mulhsw_v16i16_ashr:
1037 ; SSE41-NEXT: pmulhw %xmm2, %xmm0
1038 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm4
1039 ; SSE41-NEXT: pmulhw %xmm3, %xmm1
1040 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
1041 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1042 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
1043 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1044 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm3
1045 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1046 ; SSE41-NEXT: movdqa %xmm5, %xmm1
1049 ; AVX2-LABEL: mulhsw_v16i16_ashr:
1051 ; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm1
1052 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm0
1053 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1054 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
1057 ; AVX512-LABEL: mulhsw_v16i16_ashr:
1059 ; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
1060 ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
1062 %a1 = sext <16 x i16> %a to <16 x i32>
1063 %b1 = sext <16 x i16> %b to <16 x i32>
1064 %c = mul <16 x i32> %a1, %b1
1065 %d = ashr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1066 ret <16 x i32> %d
1067 }
1069 define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
1070 ; SSE2-LABEL: zext_mulhuw_v32i16_lshr:
1072 ; SSE2-NEXT: movq %rdi, %rax
1073 ; SSE2-NEXT: pmulhuw %xmm7, %xmm3
1074 ; SSE2-NEXT: pmulhuw %xmm6, %xmm2
1075 ; SSE2-NEXT: pmulhuw %xmm5, %xmm1
1076 ; SSE2-NEXT: pmulhuw %xmm4, %xmm0
1077 ; SSE2-NEXT: pxor %xmm4, %xmm4
1078 ; SSE2-NEXT: movdqa %xmm0, %xmm8
1079 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1080 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1081 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1082 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1083 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1084 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1085 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1086 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1087 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1088 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1089 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1090 ; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
1091 ; SSE2-NEXT: movdqa %xmm5, 96(%rdi)
1092 ; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
1093 ; SSE2-NEXT: movdqa %xmm7, 64(%rdi)
1094 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
1095 ; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
1096 ; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
1097 ; SSE2-NEXT: movdqa %xmm8, (%rdi)
1100 ; SSE41-LABEL: zext_mulhuw_v32i16_lshr:
1102 ; SSE41-NEXT: movq %rdi, %rax
1103 ; SSE41-NEXT: pmulhuw %xmm4, %xmm0
1104 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1105 ; SSE41-NEXT: pxor %xmm4, %xmm4
1106 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1107 ; SSE41-NEXT: pmulhuw %xmm5, %xmm1
1108 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1109 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1110 ; SSE41-NEXT: pmulhuw %xmm6, %xmm2
1111 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1112 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1113 ; SSE41-NEXT: pmulhuw %xmm7, %xmm3
1114 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1115 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1116 ; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
1117 ; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
1118 ; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
1119 ; SSE41-NEXT: movdqa %xmm6, 64(%rdi)
1120 ; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
1121 ; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
1122 ; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
1123 ; SSE41-NEXT: movdqa %xmm8, (%rdi)
1126 ; AVX2-LABEL: zext_mulhuw_v32i16_lshr:
1128 ; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
1129 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1130 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1131 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1132 ; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
1133 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1134 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1135 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1136 ; AVX2-NEXT: vmovdqa %ymm4, %ymm1
1139 ; AVX512F-LABEL: zext_mulhuw_v32i16_lshr:
1141 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1142 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1143 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1144 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1145 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
1146 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1147 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
1148 ; AVX512F-NEXT: retq
1150 ; AVX512BW-LABEL: zext_mulhuw_v32i16_lshr:
1151 ; AVX512BW: # %bb.0:
1152 ; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm1
1153 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1154 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1155 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1156 ; AVX512BW-NEXT: retq
1157 %a1 = zext <32 x i16> %a to <32 x i32>
1158 %b1 = zext <32 x i16> %b to <32 x i32>
1159 %c = mul <32 x i32> %a1, %b1
1160 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1161 ret <32 x i32> %d
1162 }
1164 define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
1165 ; SSE2-LABEL: mulhsw_v32i16_lshr:
1167 ; SSE2-NEXT: movq %rdi, %rax
1168 ; SSE2-NEXT: pmulhw %xmm7, %xmm3
1169 ; SSE2-NEXT: pmulhw %xmm6, %xmm2
1170 ; SSE2-NEXT: pmulhw %xmm5, %xmm1
1171 ; SSE2-NEXT: pmulhw %xmm4, %xmm0
1172 ; SSE2-NEXT: pxor %xmm4, %xmm4
1173 ; SSE2-NEXT: movdqa %xmm0, %xmm8
1174 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1175 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1176 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1177 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
1178 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1179 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1180 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1181 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1182 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1183 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1184 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1185 ; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
1186 ; SSE2-NEXT: movdqa %xmm5, 96(%rdi)
1187 ; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
1188 ; SSE2-NEXT: movdqa %xmm7, 64(%rdi)
1189 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
1190 ; SSE2-NEXT: movdqa %xmm6, 32(%rdi)
1191 ; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
1192 ; SSE2-NEXT: movdqa %xmm8, (%rdi)
1195 ; SSE41-LABEL: mulhsw_v32i16_lshr:
1197 ; SSE41-NEXT: movq %rdi, %rax
1198 ; SSE41-NEXT: pmulhw %xmm4, %xmm0
1199 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1200 ; SSE41-NEXT: pxor %xmm4, %xmm4
1201 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1202 ; SSE41-NEXT: pmulhw %xmm5, %xmm1
1203 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1204 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1205 ; SSE41-NEXT: pmulhw %xmm6, %xmm2
1206 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1207 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1208 ; SSE41-NEXT: pmulhw %xmm7, %xmm3
1209 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1210 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1211 ; SSE41-NEXT: movdqa %xmm3, 112(%rdi)
1212 ; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
1213 ; SSE41-NEXT: movdqa %xmm2, 80(%rdi)
1214 ; SSE41-NEXT: movdqa %xmm6, 64(%rdi)
1215 ; SSE41-NEXT: movdqa %xmm1, 48(%rdi)
1216 ; SSE41-NEXT: movdqa %xmm5, 32(%rdi)
1217 ; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
1218 ; SSE41-NEXT: movdqa %xmm8, (%rdi)
1221 ; AVX2-LABEL: mulhsw_v32i16_lshr:
1223 ; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
1224 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1225 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1226 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1227 ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
1228 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1229 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1230 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1231 ; AVX2-NEXT: vmovdqa %ymm4, %ymm1
1234 ; AVX512F-LABEL: mulhsw_v32i16_lshr:
1236 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
1237 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1238 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1239 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1240 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
1241 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1242 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
1243 ; AVX512F-NEXT: retq
1245 ; AVX512BW-LABEL: mulhsw_v32i16_lshr:
1246 ; AVX512BW: # %bb.0:
1247 ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1
1248 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1249 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1250 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1251 ; AVX512BW-NEXT: retq
1252 %a1 = sext <32 x i16> %a to <32 x i32>
1253 %b1 = sext <32 x i16> %b to <32 x i32>
1254 %c = mul <32 x i32> %a1, %b1
1255 %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1256 ret <32 x i32> %d
1257 }
1259 define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
1260 ; SSE2-LABEL: mulhsw_v32i16_ashr:
1262 ; SSE2-NEXT: movq %rdi, %rax
1263 ; SSE2-NEXT: pmulhw %xmm7, %xmm3
1264 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
1265 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1266 ; SSE2-NEXT: pmulhw %xmm6, %xmm2
1267 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
1268 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1269 ; SSE2-NEXT: pmulhw %xmm5, %xmm1
1270 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
1271 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1272 ; SSE2-NEXT: pmulhw %xmm4, %xmm0
1273 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1274 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1275 ; SSE2-NEXT: psrad $16, %xmm0
1276 ; SSE2-NEXT: psrad $16, %xmm4
1277 ; SSE2-NEXT: psrad $16, %xmm1
1278 ; SSE2-NEXT: psrad $16, %xmm5
1279 ; SSE2-NEXT: psrad $16, %xmm2
1280 ; SSE2-NEXT: psrad $16, %xmm6
1281 ; SSE2-NEXT: psrad $16, %xmm3
1282 ; SSE2-NEXT: psrad $16, %xmm7
1283 ; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
1284 ; SSE2-NEXT: movdqa %xmm3, 96(%rdi)
1285 ; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
1286 ; SSE2-NEXT: movdqa %xmm2, 64(%rdi)
1287 ; SSE2-NEXT: movdqa %xmm5, 48(%rdi)
1288 ; SSE2-NEXT: movdqa %xmm1, 32(%rdi)
1289 ; SSE2-NEXT: movdqa %xmm4, 16(%rdi)
1290 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
1293 ; SSE41-LABEL: mulhsw_v32i16_ashr:
1295 ; SSE41-NEXT: movq %rdi, %rax
1296 ; SSE41-NEXT: pmulhw %xmm4, %xmm0
1297 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
1298 ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
1299 ; SSE41-NEXT: pmulhw %xmm5, %xmm1
1300 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
1301 ; SSE41-NEXT: pmovsxwd %xmm5, %xmm5
1302 ; SSE41-NEXT: pmulhw %xmm6, %xmm2
1303 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
1304 ; SSE41-NEXT: pmovsxwd %xmm6, %xmm6
1305 ; SSE41-NEXT: pmulhw %xmm7, %xmm3
1306 ; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
1307 ; SSE41-NEXT: pmovsxwd %xmm7, %xmm7
1308 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
1309 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
1310 ; SSE41-NEXT: pmovsxwd %xmm2, %xmm2
1311 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
1312 ; SSE41-NEXT: movdqa %xmm3, 96(%rdi)
1313 ; SSE41-NEXT: movdqa %xmm2, 64(%rdi)
1314 ; SSE41-NEXT: movdqa %xmm1, 32(%rdi)
1315 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
1316 ; SSE41-NEXT: movdqa %xmm7, 112(%rdi)
1317 ; SSE41-NEXT: movdqa %xmm6, 80(%rdi)
1318 ; SSE41-NEXT: movdqa %xmm5, 48(%rdi)
1319 ; SSE41-NEXT: movdqa %xmm4, 16(%rdi)
1322 ; AVX2-LABEL: mulhsw_v32i16_ashr:
1324 ; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
1325 ; AVX2-NEXT: vpmovsxwd %xmm2, %ymm0
1326 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1327 ; AVX2-NEXT: vpmovsxwd %xmm2, %ymm4
1328 ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
1329 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm2
1330 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1331 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm3
1332 ; AVX2-NEXT: vmovdqa %ymm4, %ymm1
1335 ; AVX512F-LABEL: mulhsw_v32i16_ashr:
1337 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
1338 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
1339 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1340 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1341 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
1342 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm1
1343 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
1344 ; AVX512F-NEXT: retq
1346 ; AVX512BW-LABEL: mulhsw_v32i16_ashr:
1347 ; AVX512BW: # %bb.0:
1348 ; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm1
1349 ; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm0
1350 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1351 ; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm1
1352 ; AVX512BW-NEXT: retq
1353 %a1 = sext <32 x i16> %a to <32 x i32>
1354 %b1 = sext <32 x i16> %b to <32 x i32>
1355 %c = mul <32 x i32> %a1, %b1
1356 %d = ashr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1357 ret <32 x i32> %d
1358 }
1360 define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
1361 ; SSE2-LABEL: zext_mulhuw_v64i16_lshr:
1363 ; SSE2-NEXT: movdqa %xmm7, %xmm8
1364 ; SSE2-NEXT: movq %rdi, %rax
1365 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8
1366 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
1367 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
1368 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
1369 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
1370 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
1371 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
1372 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
1373 ; SSE2-NEXT: pxor %xmm11, %xmm11
1374 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1375 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1376 ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1377 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1378 ; SSE2-NEXT: movdqa %xmm1, %xmm9
1379 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
1380 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
1381 ; SSE2-NEXT: movdqa %xmm2, %xmm10
1382 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1383 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
1384 ; SSE2-NEXT: movdqa %xmm3, %xmm12
1385 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1386 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
1387 ; SSE2-NEXT: movdqa %xmm4, %xmm13
1388 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
1389 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
1390 ; SSE2-NEXT: movdqa %xmm5, %xmm14
1391 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
1392 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
1393 ; SSE2-NEXT: movdqa %xmm6, %xmm15
1394 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
1395 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
1396 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1397 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1398 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1399 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1400 ; SSE2-NEXT: movdqa %xmm7, 224(%rdi)
1401 ; SSE2-NEXT: movdqa %xmm6, 208(%rdi)
1402 ; SSE2-NEXT: movdqa %xmm15, 192(%rdi)
1403 ; SSE2-NEXT: movdqa %xmm5, 176(%rdi)
1404 ; SSE2-NEXT: movdqa %xmm14, 160(%rdi)
1405 ; SSE2-NEXT: movdqa %xmm4, 144(%rdi)
1406 ; SSE2-NEXT: movdqa %xmm13, 128(%rdi)
1407 ; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
1408 ; SSE2-NEXT: movdqa %xmm12, 96(%rdi)
1409 ; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
1410 ; SSE2-NEXT: movdqa %xmm10, 64(%rdi)
1411 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
1412 ; SSE2-NEXT: movdqa %xmm9, 32(%rdi)
1413 ; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
1414 ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1415 ; SSE2-NEXT: movaps %xmm0, (%rdi)
1416 ; SSE2-NEXT: retq
1417 ;
1418 ; SSE41-LABEL: zext_mulhuw_v64i16_lshr:
1419 ; SSE41: # %bb.0:
1420 ; SSE41-NEXT: movdqa %xmm0, %xmm8
1421 ; SSE41-NEXT: movq %rdi, %rax
1422 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8
1423 ; SSE41-NEXT: pxor %xmm11, %xmm11
1424 ; SSE41-NEXT: movdqa %xmm8, %xmm0
1425 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1426 ; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1427 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
1428 ; SSE41-NEXT: movdqa %xmm1, %xmm9
1429 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1430 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
1431 ; SSE41-NEXT: movdqa %xmm2, %xmm10
1432 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1433 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
1434 ; SSE41-NEXT: movdqa %xmm3, %xmm12
1435 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1436 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
1437 ; SSE41-NEXT: movdqa %xmm4, %xmm13
1438 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1439 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
1440 ; SSE41-NEXT: movdqa %xmm5, %xmm14
1441 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1442 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
1443 ; SSE41-NEXT: movdqa %xmm6, %xmm15
1444 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1445 ; SSE41-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
1446 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1447 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1448 ; SSE41-NEXT: movdqa %xmm7, 240(%rdi)
1449 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi)
1450 ; SSE41-NEXT: movdqa %xmm15, 208(%rdi)
1451 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1452 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi)
1453 ; SSE41-NEXT: movdqa %xmm14, 176(%rdi)
1454 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1455 ; SSE41-NEXT: movdqa %xmm0, 160(%rdi)
1456 ; SSE41-NEXT: movdqa %xmm13, 144(%rdi)
1457 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1458 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi)
1459 ; SSE41-NEXT: movdqa %xmm12, 112(%rdi)
1460 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1461 ; SSE41-NEXT: movdqa %xmm0, 96(%rdi)
1462 ; SSE41-NEXT: movdqa %xmm10, 80(%rdi)
1463 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1464 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi)
1465 ; SSE41-NEXT: movdqa %xmm9, 48(%rdi)
1466 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1467 ; SSE41-NEXT: movdqa %xmm0, 32(%rdi)
1468 ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1469 ; SSE41-NEXT: movaps %xmm0, 16(%rdi)
1470 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1471 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
1472 ; SSE41-NEXT: retq
1473 ;
1474 ; AVX2-LABEL: zext_mulhuw_v64i16_lshr:
1475 ; AVX2: # %bb.0:
1476 ; AVX2-NEXT: movq %rdi, %rax
1477 ; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
1478 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1479 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1480 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1481 ; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
1482 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1483 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1484 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1485 ; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
1486 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1487 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1488 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1489 ; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
1490 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1491 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
1492 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1493 ; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
1494 ; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi)
1495 ; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi)
1496 ; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi)
1497 ; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi)
1498 ; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
1499 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
1500 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1501 ; AVX2-NEXT: vzeroupper
1502 ; AVX2-NEXT: retq
1503 ;
1504 ; AVX512F-LABEL: zext_mulhuw_v64i16_lshr:
1505 ; AVX512F: # %bb.0:
1506 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4
1507 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1508 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1509 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1510 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
1511 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1512 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm0
1513 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1514 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1515 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1516 ; AVX512F-NEXT: vpmulhuw %ymm0, %ymm1, %ymm0
1517 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1518 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
1519 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
1520 ; AVX512F-NEXT: retq
1521 ;
1522 ; AVX512BW-LABEL: zext_mulhuw_v64i16_lshr:
1523 ; AVX512BW: # %bb.0:
1524 ; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm2
1525 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1526 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1527 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1528 ; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1
1529 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1530 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1531 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1532 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
1533 ; AVX512BW-NEXT: retq
1534 %a1 = zext <64 x i16> %a to <64 x i32>
1535 %b1 = zext <64 x i16> %b to <64 x i32>
1536 %c = mul <64 x i32> %a1, %b1
1537 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1538 ret <64 x i32> %d
1539 }
1540
1541 define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
1542 ; SSE2-LABEL: mulhsw_v64i16_lshr:
1543 ; SSE2: # %bb.0:
1544 ; SSE2-NEXT: movdqa %xmm7, %xmm8
1545 ; SSE2-NEXT: movq %rdi, %rax
1546 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8
1547 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
1548 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
1549 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
1550 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
1551 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
1552 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
1553 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
1554 ; SSE2-NEXT: pxor %xmm11, %xmm11
1555 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1556 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1557 ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1558 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1559 ; SSE2-NEXT: movdqa %xmm1, %xmm9
1560 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
1561 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
1562 ; SSE2-NEXT: movdqa %xmm2, %xmm10
1563 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1564 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
1565 ; SSE2-NEXT: movdqa %xmm3, %xmm12
1566 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1567 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
1568 ; SSE2-NEXT: movdqa %xmm4, %xmm13
1569 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
1570 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
1571 ; SSE2-NEXT: movdqa %xmm5, %xmm14
1572 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
1573 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
1574 ; SSE2-NEXT: movdqa %xmm6, %xmm15
1575 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
1576 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
1577 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1578 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1579 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1580 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1581 ; SSE2-NEXT: movdqa %xmm7, 224(%rdi)
1582 ; SSE2-NEXT: movdqa %xmm6, 208(%rdi)
1583 ; SSE2-NEXT: movdqa %xmm15, 192(%rdi)
1584 ; SSE2-NEXT: movdqa %xmm5, 176(%rdi)
1585 ; SSE2-NEXT: movdqa %xmm14, 160(%rdi)
1586 ; SSE2-NEXT: movdqa %xmm4, 144(%rdi)
1587 ; SSE2-NEXT: movdqa %xmm13, 128(%rdi)
1588 ; SSE2-NEXT: movdqa %xmm3, 112(%rdi)
1589 ; SSE2-NEXT: movdqa %xmm12, 96(%rdi)
1590 ; SSE2-NEXT: movdqa %xmm2, 80(%rdi)
1591 ; SSE2-NEXT: movdqa %xmm10, 64(%rdi)
1592 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi)
1593 ; SSE2-NEXT: movdqa %xmm9, 32(%rdi)
1594 ; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
1595 ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1596 ; SSE2-NEXT: movaps %xmm0, (%rdi)
1597 ; SSE2-NEXT: retq
1598 ;
1599 ; SSE41-LABEL: mulhsw_v64i16_lshr:
1600 ; SSE41: # %bb.0:
1601 ; SSE41-NEXT: movdqa %xmm0, %xmm8
1602 ; SSE41-NEXT: movq %rdi, %rax
1603 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8
1604 ; SSE41-NEXT: pxor %xmm11, %xmm11
1605 ; SSE41-NEXT: movdqa %xmm8, %xmm0
1606 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1607 ; SSE41-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1608 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
1609 ; SSE41-NEXT: movdqa %xmm1, %xmm9
1610 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1611 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
1612 ; SSE41-NEXT: movdqa %xmm2, %xmm10
1613 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1614 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
1615 ; SSE41-NEXT: movdqa %xmm3, %xmm12
1616 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1617 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
1618 ; SSE41-NEXT: movdqa %xmm4, %xmm13
1619 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1620 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
1621 ; SSE41-NEXT: movdqa %xmm5, %xmm14
1622 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1623 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
1624 ; SSE41-NEXT: movdqa %xmm6, %xmm15
1625 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1626 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
1627 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1628 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1629 ; SSE41-NEXT: movdqa %xmm7, 240(%rdi)
1630 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi)
1631 ; SSE41-NEXT: movdqa %xmm15, 208(%rdi)
1632 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1633 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi)
1634 ; SSE41-NEXT: movdqa %xmm14, 176(%rdi)
1635 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1636 ; SSE41-NEXT: movdqa %xmm0, 160(%rdi)
1637 ; SSE41-NEXT: movdqa %xmm13, 144(%rdi)
1638 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1639 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi)
1640 ; SSE41-NEXT: movdqa %xmm12, 112(%rdi)
1641 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1642 ; SSE41-NEXT: movdqa %xmm0, 96(%rdi)
1643 ; SSE41-NEXT: movdqa %xmm10, 80(%rdi)
1644 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1645 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi)
1646 ; SSE41-NEXT: movdqa %xmm9, 48(%rdi)
1647 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1648 ; SSE41-NEXT: movdqa %xmm0, 32(%rdi)
1649 ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1650 ; SSE41-NEXT: movaps %xmm0, 16(%rdi)
1651 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1652 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
1653 ; SSE41-NEXT: retq
1654 ;
1655 ; AVX2-LABEL: mulhsw_v64i16_lshr:
1656 ; AVX2: # %bb.0:
1657 ; AVX2-NEXT: movq %rdi, %rax
1658 ; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
1659 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1660 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1661 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1662 ; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
1663 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1664 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1665 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1666 ; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
1667 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1668 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1669 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1670 ; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
1671 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1672 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
1673 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1674 ; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
1675 ; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi)
1676 ; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi)
1677 ; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi)
1678 ; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi)
1679 ; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
1680 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
1681 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1682 ; AVX2-NEXT: vzeroupper
1683 ; AVX2-NEXT: retq
1684 ;
1685 ; AVX512F-LABEL: mulhsw_v64i16_lshr:
1686 ; AVX512F: # %bb.0:
1687 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4
1688 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1689 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1690 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1691 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
1692 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1693 ; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0
1694 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1695 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1696 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1697 ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0
1698 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1699 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
1700 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
1701 ; AVX512F-NEXT: retq
1702 ;
1703 ; AVX512BW-LABEL: mulhsw_v64i16_lshr:
1704 ; AVX512BW: # %bb.0:
1705 ; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2
1706 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1707 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1708 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1709 ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
1710 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1711 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1712 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1713 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
1714 ; AVX512BW-NEXT: retq
1715 %a1 = sext <64 x i16> %a to <64 x i32>
1716 %b1 = sext <64 x i16> %b to <64 x i32>
1717 %c = mul <64 x i32> %a1, %b1
1718 %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1719 ret <64 x i32> %d
1720 }
1721
1722 define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
1723 ; SSE2-LABEL: mulhsw_v64i16_ashr:
1724 ; SSE2: # %bb.0:
1725 ; SSE2-NEXT: movq %rdi, %rax
1726 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
1727 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1728 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
1729 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
1730 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
1731 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
1732 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
1733 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
1734 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
1735 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
1736 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
1737 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
1738 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
1739 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1740 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1741 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
1742 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
1743 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1744 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
1745 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1746 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1747 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
1748 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1749 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1750 ; SSE2-NEXT: psrad $16, %xmm0
1751 ; SSE2-NEXT: psrad $16, %xmm7
1752 ; SSE2-NEXT: psrad $16, %xmm1
1753 ; SSE2-NEXT: psrad $16, %xmm4
1754 ; SSE2-NEXT: psrad $16, %xmm2
1755 ; SSE2-NEXT: psrad $16, %xmm6
1756 ; SSE2-NEXT: psrad $16, %xmm3
1757 ; SSE2-NEXT: psrad $16, %xmm5
1758 ; SSE2-NEXT: psrad $16, %xmm14
1759 ; SSE2-NEXT: psrad $16, %xmm15
1760 ; SSE2-NEXT: psrad $16, %xmm12
1761 ; SSE2-NEXT: psrad $16, %xmm13
1762 ; SSE2-NEXT: psrad $16, %xmm10
1763 ; SSE2-NEXT: psrad $16, %xmm11
1764 ; SSE2-NEXT: psrad $16, %xmm9
1765 ; SSE2-NEXT: psrad $16, %xmm8
1766 ; SSE2-NEXT: movdqa %xmm8, 240(%rdi)
1767 ; SSE2-NEXT: movdqa %xmm9, 224(%rdi)
1768 ; SSE2-NEXT: movdqa %xmm11, 208(%rdi)
1769 ; SSE2-NEXT: movdqa %xmm10, 192(%rdi)
1770 ; SSE2-NEXT: movdqa %xmm13, 176(%rdi)
1771 ; SSE2-NEXT: movdqa %xmm12, 160(%rdi)
1772 ; SSE2-NEXT: movdqa %xmm15, 144(%rdi)
1773 ; SSE2-NEXT: movdqa %xmm14, 128(%rdi)
1774 ; SSE2-NEXT: movdqa %xmm5, 112(%rdi)
1775 ; SSE2-NEXT: movdqa %xmm3, 96(%rdi)
1776 ; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
1777 ; SSE2-NEXT: movdqa %xmm2, 64(%rdi)
1778 ; SSE2-NEXT: movdqa %xmm4, 48(%rdi)
1779 ; SSE2-NEXT: movdqa %xmm1, 32(%rdi)
1780 ; SSE2-NEXT: movdqa %xmm7, 16(%rdi)
1781 ; SSE2-NEXT: movdqa %xmm0, (%rdi)
1782 ; SSE2-NEXT: retq
1783 ;
1784 ; SSE41-LABEL: mulhsw_v64i16_ashr:
1785 ; SSE41: # %bb.0:
1786 ; SSE41-NEXT: movq %rdi, %rax
1787 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
1788 ; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
1789 ; SSE41-NEXT: pmovsxwd %xmm8, %xmm8
1790 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
1791 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
1792 ; SSE41-NEXT: pmovsxwd %xmm9, %xmm9
1793 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
1794 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
1795 ; SSE41-NEXT: pmovsxwd %xmm10, %xmm10
1796 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
1797 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
1798 ; SSE41-NEXT: pmovsxwd %xmm11, %xmm11
1799 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
1800 ; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3]
1801 ; SSE41-NEXT: pmovsxwd %xmm12, %xmm12
1802 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
1803 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
1804 ; SSE41-NEXT: pmovsxwd %xmm13, %xmm13
1805 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
1806 ; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
1807 ; SSE41-NEXT: pmovsxwd %xmm14, %xmm14
1808 ; SSE41-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
1809 ; SSE41-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
1810 ; SSE41-NEXT: pmovsxwd %xmm15, %xmm15
1811 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
1812 ; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
1813 ; SSE41-NEXT: pmovsxwd %xmm2, %xmm2
1814 ; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
1815 ; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
1816 ; SSE41-NEXT: pmovsxwd %xmm5, %xmm5
1817 ; SSE41-NEXT: pmovsxwd %xmm6, %xmm6
1818 ; SSE41-NEXT: pmovsxwd %xmm7, %xmm7
1819 ; SSE41-NEXT: movdqa %xmm7, 224(%rdi)
1820 ; SSE41-NEXT: movdqa %xmm6, 192(%rdi)
1821 ; SSE41-NEXT: movdqa %xmm5, 160(%rdi)
1822 ; SSE41-NEXT: movdqa %xmm4, 128(%rdi)
1823 ; SSE41-NEXT: movdqa %xmm3, 96(%rdi)
1824 ; SSE41-NEXT: movdqa %xmm2, 64(%rdi)
1825 ; SSE41-NEXT: movdqa %xmm1, 32(%rdi)
1826 ; SSE41-NEXT: movdqa %xmm0, (%rdi)
1827 ; SSE41-NEXT: movdqa %xmm15, 240(%rdi)
1828 ; SSE41-NEXT: movdqa %xmm14, 208(%rdi)
1829 ; SSE41-NEXT: movdqa %xmm13, 176(%rdi)
1830 ; SSE41-NEXT: movdqa %xmm12, 144(%rdi)
1831 ; SSE41-NEXT: movdqa %xmm11, 112(%rdi)
1832 ; SSE41-NEXT: movdqa %xmm10, 80(%rdi)
1833 ; SSE41-NEXT: movdqa %xmm9, 48(%rdi)
1834 ; SSE41-NEXT: movdqa %xmm8, 16(%rdi)
1835 ; SSE41-NEXT: retq
1836 ;
1837 ; AVX2-LABEL: mulhsw_v64i16_ashr:
1838 ; AVX2: # %bb.0:
1839 ; AVX2-NEXT: movq %rdi, %rax
1840 ; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
1841 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm4
1842 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1843 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1844 ; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
1845 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm5
1846 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1847 ; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
1848 ; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
1849 ; AVX2-NEXT: vpmovsxwd %xmm2, %ymm6
1850 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
1851 ; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
1852 ; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
1853 ; AVX2-NEXT: vpmovsxwd %xmm3, %ymm7
1854 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
1855 ; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3
1856 ; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi)
1857 ; AVX2-NEXT: vmovdqa %ymm7, 192(%rdi)
1858 ; AVX2-NEXT: vmovdqa %ymm2, 160(%rdi)
1859 ; AVX2-NEXT: vmovdqa %ymm6, 128(%rdi)
1860 ; AVX2-NEXT: vmovdqa %ymm1, 96(%rdi)
1861 ; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi)
1862 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
1863 ; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
1864 ; AVX2-NEXT: vzeroupper
1865 ; AVX2-NEXT: retq
1866 ;
1867 ; AVX512F-LABEL: mulhsw_v64i16_ashr:
1868 ; AVX512F: # %bb.0:
1869 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4
1870 ; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
1871 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1872 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1873 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
1874 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm5
1875 ; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm0
1876 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2
1877 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1878 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1879 ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0
1880 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm3
1881 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0
1882 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
1883 ; AVX512F-NEXT: retq
1884 ;
1885 ; AVX512BW-LABEL: mulhsw_v64i16_ashr:
1886 ; AVX512BW: # %bb.0:
1887 ; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm2
1888 ; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm0
1889 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm2
1890 ; AVX512BW-NEXT: vpmovsxwd %ymm2, %zmm4
1891 ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
1892 ; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm2
1893 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
1894 ; AVX512BW-NEXT: vpmovsxwd %ymm1, %zmm3
1895 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
1896 ; AVX512BW-NEXT: retq
1897 %a1 = sext <64 x i16> %a to <64 x i32>
1898 %b1 = sext <64 x i16> %b to <64 x i32>
1899 %c = mul <64 x i32> %a1, %b1
1900 %d = ashr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1901 ret <64 x i32> %d
1902 }
1903
1904 define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
1905 ; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64:
1906 ; SSE2: # %bb.0:
1907 ; SSE2-NEXT: pxor %xmm2, %xmm2
1908 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1909 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1910 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3]
1911 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
1912 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1913 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
1914 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3]
1915 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1916 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
1917 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3]
1918 ; SSE2-NEXT: pmuludq %xmm4, %xmm0
1919 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
1920 ; SSE2-NEXT: pmuludq %xmm3, %xmm4
1921 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1922 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
1923 ; SSE2-NEXT: pmuludq %xmm5, %xmm2
1924 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
1925 ; SSE2-NEXT: pmuludq %xmm6, %xmm3
1926 ; SSE2-NEXT: psrlq $16, %xmm0
1927 ; SSE2-NEXT: psrlq $16, %xmm4
1928 ; SSE2-NEXT: psrlq $16, %xmm2
1929 ; SSE2-NEXT: psrlq $16, %xmm3
1930 ; SSE2-NEXT: movdqa %xmm4, %xmm1
1931 ; SSE2-NEXT: retq
1932 ;
1933 ; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64:
1934 ; SSE41: # %bb.0:
1935 ; SSE41-NEXT: pmulhuw %xmm1, %xmm0
1936 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1937 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1938 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1939 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1940 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1941 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1942 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1943 ; SSE41-NEXT: movdqa %xmm4, %xmm0
1944 ; SSE41-NEXT: retq
1945 ;
1946 ; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64:
1947 ; AVX2: # %bb.0:
1948 ; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
1949 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1950 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1951 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1952 ; AVX2-NEXT: retq
1953 ;
1954 ; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64:
1955 ; AVX512: # %bb.0:
1956 ; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
1957 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1958 ; AVX512-NEXT: retq
1959 %a1 = zext <8 x i16> %a to <8 x i64>
1960 %b1 = zext <8 x i16> %b to <8 x i64>
1961 %c = mul <8 x i64> %a1, %b1
1962 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
1963 ret <8 x i64> %d
1964 }
1965
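; Same i64 pattern as above, but with sign-extended operands and a logical shift.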
1966 define <8 x i64> @sext_mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
1967 ; SSE2-LABEL: sext_mulhsw_v8i16_lshr_i64:
1968 ; SSE2: # %bb.0:
1969 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1970 ; SSE2-NEXT: psrad $16, %xmm6
1971 ; SSE2-NEXT: pxor %xmm13, %xmm13
1972 ; SSE2-NEXT: pxor %xmm10, %xmm10
1973 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm10
1974 ; SSE2-NEXT: movdqa %xmm6, %xmm8
1975 ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
1976 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3]
1977 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1978 ; SSE2-NEXT: psrad $16, %xmm4
1979 ; SSE2-NEXT: pxor %xmm5, %xmm5
1980 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
1981 ; SSE2-NEXT: movdqa %xmm4, %xmm11
1982 ; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
1983 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1984 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
1985 ; SSE2-NEXT: psrad $16, %xmm7
1986 ; SSE2-NEXT: pxor %xmm12, %xmm12
1987 ; SSE2-NEXT: pcmpgtd %xmm7, %xmm12
1988 ; SSE2-NEXT: movdqa %xmm7, %xmm9
1989 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
1990 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
1991 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1992 ; SSE2-NEXT: psrad $16, %xmm1
1993 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
1994 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1995 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
1996 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1997 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,3,3]
1998 ; SSE2-NEXT: pmuludq %xmm4, %xmm3
1999 ; SSE2-NEXT: pmuludq %xmm1, %xmm4
2000 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3]
2001 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
2002 ; SSE2-NEXT: paddq %xmm2, %xmm3
2003 ; SSE2-NEXT: psllq $32, %xmm3
2004 ; SSE2-NEXT: paddq %xmm4, %xmm3
2005 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3]
2006 ; SSE2-NEXT: pmuludq %xmm11, %xmm2
2007 ; SSE2-NEXT: pmuludq %xmm0, %xmm11
2008 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
2009 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
2010 ; SSE2-NEXT: paddq %xmm1, %xmm2
2011 ; SSE2-NEXT: psllq $32, %xmm2
2012 ; SSE2-NEXT: paddq %xmm11, %xmm2
2013 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
2014 ; SSE2-NEXT: pmuludq %xmm6, %xmm1
2015 ; SSE2-NEXT: pmuludq %xmm7, %xmm6
2016 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3]
2017 ; SSE2-NEXT: pmuludq %xmm7, %xmm0
2018 ; SSE2-NEXT: paddq %xmm0, %xmm1
2019 ; SSE2-NEXT: psllq $32, %xmm1
2020 ; SSE2-NEXT: paddq %xmm6, %xmm1
2021 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
2022 ; SSE2-NEXT: pmuludq %xmm8, %xmm0
2023 ; SSE2-NEXT: pmuludq %xmm9, %xmm8
2024 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
2025 ; SSE2-NEXT: pmuludq %xmm9, %xmm4
2026 ; SSE2-NEXT: paddq %xmm4, %xmm0
2027 ; SSE2-NEXT: psllq $32, %xmm0
2028 ; SSE2-NEXT: paddq %xmm8, %xmm0
2029 ; SSE2-NEXT: psrlq $16, %xmm0
2030 ; SSE2-NEXT: psrlq $16, %xmm1
2031 ; SSE2-NEXT: psrlq $16, %xmm2
2032 ; SSE2-NEXT: psrlq $16, %xmm3
2033 ; SSE2-NEXT: retq
2034 ;
2035 ; SSE41-LABEL: sext_mulhsw_v8i16_lshr_i64:
2036 ; SSE41: # %bb.0:
2037 ; SSE41-NEXT: pmulhw %xmm1, %xmm0
2038 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2039 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2040 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
2041 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2042 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2043 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2044 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2045 ; SSE41-NEXT: movdqa %xmm4, %xmm0
2046 ; SSE41-NEXT: retq
2047 ;
2048 ; AVX2-LABEL: sext_mulhsw_v8i16_lshr_i64:
2049 ; AVX2: # %bb.0:
2050 ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1
2051 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2052 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2053 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
2054 ; AVX2-NEXT: retq
2055 ;
2056 ; AVX512-LABEL: sext_mulhsw_v8i16_lshr_i64:
2057 ; AVX512: # %bb.0:
2058 ; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
2059 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2060 ; AVX512-NEXT: retq
2061 %a1 = sext <8 x i16> %a to <8 x i64>
2062 %b1 = sext <8 x i16> %b to <8 x i64>
2063 %c = mul <8 x i64> %a1, %b1
2064 %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
2065 ret <8 x i64> %d
2066 }
2067
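; Same i64 pattern with sign-extended operands and an arithmetic shift.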
2068 define <8 x i64> @sext_mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
2069 ; SSE2-LABEL: sext_mulhsw_v8i16_ashr_i64:
2070 ; SSE2: # %bb.0:
2071 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2072 ; SSE2-NEXT: psrad $16, %xmm5
2073 ; SSE2-NEXT: pxor %xmm13, %xmm13
2074 ; SSE2-NEXT: pxor %xmm10, %xmm10
2075 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm10
2076 ; SSE2-NEXT: movdqa %xmm5, %xmm8
2077 ; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
2078 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
2079 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
2080 ; SSE2-NEXT: psrad $16, %xmm2
2081 ; SSE2-NEXT: pxor %xmm3, %xmm3
2082 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
2083 ; SSE2-NEXT: movdqa %xmm2, %xmm11
2084 ; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
2085 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2086 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2087 ; SSE2-NEXT: psrad $16, %xmm0
2088 ; SSE2-NEXT: pxor %xmm12, %xmm12
2089 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
2090 ; SSE2-NEXT: movdqa %xmm0, %xmm9
2091 ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
2092 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
2093 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2094 ; SSE2-NEXT: psrad $16, %xmm1
2095 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm13
2096 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2097 ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
2098 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
2099 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,3,3]
2100 ; SSE2-NEXT: pmuludq %xmm2, %xmm4
2101 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
2102 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,1,3,3]
2103 ; SSE2-NEXT: pmuludq %xmm1, %xmm7
2104 ; SSE2-NEXT: paddq %xmm7, %xmm4
2105 ; SSE2-NEXT: psllq $32, %xmm4
2106 ; SSE2-NEXT: paddq %xmm2, %xmm4
2107 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,1,1,3]
2108 ; SSE2-NEXT: pmuludq %xmm11, %xmm7
2109 ; SSE2-NEXT: pmuludq %xmm6, %xmm11
2110 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
2111 ; SSE2-NEXT: pmuludq %xmm6, %xmm1
2112 ; SSE2-NEXT: paddq %xmm1, %xmm7
2113 ; SSE2-NEXT: psllq $32, %xmm7
2114 ; SSE2-NEXT: paddq %xmm11, %xmm7
2115 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
2116 ; SSE2-NEXT: pmuludq %xmm5, %xmm1
2117 ; SSE2-NEXT: pmuludq %xmm0, %xmm5
2118 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,3,3]
2119 ; SSE2-NEXT: pmuludq %xmm0, %xmm2
2120 ; SSE2-NEXT: paddq %xmm2, %xmm1
2121 ; SSE2-NEXT: psllq $32, %xmm1
2122 ; SSE2-NEXT: paddq %xmm5, %xmm1
2123 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
2124 ; SSE2-NEXT: pmuludq %xmm8, %xmm0
2125 ; SSE2-NEXT: pmuludq %xmm9, %xmm8
2126 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
2127 ; SSE2-NEXT: pmuludq %xmm9, %xmm2
2128 ; SSE2-NEXT: paddq %xmm2, %xmm0
2129 ; SSE2-NEXT: psllq $32, %xmm0
2130 ; SSE2-NEXT: paddq %xmm8, %xmm0
2131 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2132 ; SSE2-NEXT: psrad $16, %xmm2
2133 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
2134 ; SSE2-NEXT: psrlq $16, %xmm0
2135 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2136 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2137 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2138 ; SSE2-NEXT: psrad $16, %xmm2
2139 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
2140 ; SSE2-NEXT: psrlq $16, %xmm1
2141 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2142 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2143 ; SSE2-NEXT: movdqa %xmm7, %xmm2
2144 ; SSE2-NEXT: psrad $16, %xmm2
2145 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
2146 ; SSE2-NEXT: psrlq $16, %xmm7
2147 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
2148 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2149 ; SSE2-NEXT: movdqa %xmm4, %xmm3
2150 ; SSE2-NEXT: psrad $16, %xmm3
2151 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
2152 ; SSE2-NEXT: psrlq $16, %xmm4
2153 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
2154 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
2155 ; SSE2-NEXT: retq
2156 ;
2157 ; SSE41-LABEL: sext_mulhsw_v8i16_ashr_i64:
2158 ; SSE41: # %bb.0:
2159 ; SSE41-NEXT: pmulhw %xmm1, %xmm0
2160 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
2161 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2162 ; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
2163 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2164 ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
2165 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
2166 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm3
2167 ; SSE41-NEXT: movdqa %xmm4, %xmm0
2168 ; SSE41-NEXT: retq
2169 ;
2170 ; AVX2-LABEL: sext_mulhsw_v8i16_ashr_i64:
2171 ; AVX2: # %bb.0:
2172 ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1
2173 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0
2174 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2175 ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1
2176 ; AVX2-NEXT: retq
2177 ;
2178 ; AVX512-LABEL: sext_mulhsw_v8i16_ashr_i64:
2179 ; AVX512: # %bb.0:
2180 ; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
2181 ; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
2182 ; AVX512-NEXT: retq
2183 %a1 = sext <8 x i16> %a to <8 x i64>
2184 %b1 = sext <8 x i16> %b to <8 x i64>
2185 %c = mul <8 x i64> %a1, %b1
2186 %d = ashr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
2187 ret <8 x i64> %d
2188 }
2189
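; Calls to the pmulh intrinsics with constant operands are expected to fold to a constant vector load (see the CHECK lines below).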
2190 define <8 x i16> @sse2_pmulh_w_const(<8 x i16> %a0, <8 x i16> %a1) {
2191 ; SSE-LABEL: sse2_pmulh_w_const:
2192 ; SSE: # %bb.0:
2193 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
2194 ; SSE-NEXT: retq
2195 ;
2196 ; AVX-LABEL: sse2_pmulh_w_const:
2197 ; AVX: # %bb.0:
2198 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
2199 ; AVX-NEXT: retq
2200 %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2201 ret <8 x i16> %res
2202 }
2203 declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>)
2204
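; Unsigned variant of the constant-folding test above.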
2205 define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) {
2206 ; SSE-LABEL: sse2_pmulhu_w_const:
2207 ; SSE: # %bb.0:
2208 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
2209 ; SSE-NEXT: retq
2210 ;
2211 ; AVX-LABEL: sse2_pmulhu_w_const:
2212 ; AVX: # %bb.0:
2213 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
2214 ; AVX-NEXT: retq
2215 %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2216 ret <8 x i16> %res
2217 }
2218 declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>)