; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
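
; Check that the (zext/sext + mul + lshr 16 + trunc) pattern for taking the
; high half of a 16-bit multiply is selected as PMULHUW (unsigned) or PMULHW
; (signed), including the VEX/EVEX forms, across the vector widths and
; legalization modes exercised by the RUN lines above.
;
; <4 x i16> is not a legal type: with the default (promote) legalization the
; elements are first extended to i32, whereas the experimental widening
; legalization keeps the elements as i16 and can use a single pmulh(u)w.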
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-PROMOTE-LABEL: mulhuw_v4i16:
; SSE2-PROMOTE: # %bb.0:
; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-PROMOTE-NEXT: pmulhuw %xmm1, %xmm0
; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-PROMOTE-NEXT: retq
;
; SSE2-WIDEN-LABEL: mulhuw_v4i16:
; SSE2-WIDEN: # %bb.0:
; SSE2-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
; SSE2-WIDEN-NEXT: retq
;
; SSE41-PROMOTE-LABEL: mulhuw_v4i16:
; SSE41-PROMOTE: # %bb.0:
; SSE41-PROMOTE-NEXT: pxor %xmm2, %xmm2
; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-PROMOTE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
; SSE41-PROMOTE-NEXT: retq
;
; SSE41-WIDEN-LABEL: mulhuw_v4i16:
; SSE41-WIDEN: # %bb.0:
; SSE41-WIDEN-NEXT: pmulhuw %xmm1, %xmm0
; SSE41-WIDEN-NEXT: retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-PROMOTE-LABEL: mulhw_v4i16:
; SSE2-PROMOTE: # %bb.0:
; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-PROMOTE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-PROMOTE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-PROMOTE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-PROMOTE-NEXT: pmulhw %xmm1, %xmm0
; SSE2-PROMOTE-NEXT: pxor %xmm1, %xmm1
; SSE2-PROMOTE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-PROMOTE-NEXT: retq
;
; SSE2-WIDEN-LABEL: mulhw_v4i16:
; SSE2-WIDEN: # %bb.0:
; SSE2-WIDEN-NEXT: pmulhw %xmm1, %xmm0
; SSE2-WIDEN-NEXT: retq
;
; SSE41-PROMOTE-LABEL: mulhw_v4i16:
; SSE41-PROMOTE: # %bb.0:
; SSE41-PROMOTE-NEXT: pslld $16, %xmm0
; SSE41-PROMOTE-NEXT: psrad $16, %xmm0
; SSE41-PROMOTE-NEXT: pslld $16, %xmm1
; SSE41-PROMOTE-NEXT: psrad $16, %xmm1
; SSE41-PROMOTE-NEXT: pmulld %xmm1, %xmm0
; SSE41-PROMOTE-NEXT: psrld $16, %xmm0
; SSE41-PROMOTE-NEXT: retq
;
; SSE41-WIDEN-LABEL: mulhw_v4i16:
; SSE41-WIDEN: # %bb.0:
; SSE41-WIDEN-NEXT: pmulhw %xmm1, %xmm0
; SSE41-WIDEN-NEXT: retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX-NEXT: vpslld $16, %xmm1, %xmm1
; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}
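
; <8 x i16> is already a legal 128-bit vector type, so both the unsigned and
; signed cases select a single pmulhuw/pmulhw on every target.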
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}
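
; <16 x i16> is split into two 128-bit halves on SSE targets and handled by a
; single 256-bit multiply on AVX targets.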
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm2, %xmm0
; SSE-NEXT: pmulhuw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm2, %xmm0
; SSE-NEXT: pmulhw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}
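
; <32 x i16> takes four xmm multiplies on SSE, two ymm multiplies on
; AVX2/AVX512F, and a single zmm multiply once AVX512BW makes v32i16 legal.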
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm4, %xmm0
; SSE-NEXT: pmulhuw %xmm5, %xmm1
; SSE-NEXT: pmulhuw %xmm6, %xmm2
; SSE-NEXT: pmulhuw %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm4, %xmm0
; SSE-NEXT: pmulhw %xmm5, %xmm1
; SSE-NEXT: pmulhw %xmm6, %xmm2
; SSE-NEXT: pmulhw %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}
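
; <64 x i16> no longer fits in registers on SSE targets: one operand arrives
; in xmm0-xmm7, the other is loaded from the stack, and the result is written
; through the returned pointer in %rdi. AVX512BW needs only two zmm multiplies.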
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa %xmm7, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm5, 80(%rdi)
; SSE-NEXT: movdqa %xmm4, 64(%rdi)
; SSE-NEXT: movdqa %xmm3, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm1, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa %xmm7, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm5, 80(%rdi)
; SSE-NEXT: movdqa %xmm4, 64(%rdi)
; SSE-NEXT: movdqa %xmm3, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm1, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}