; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

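; The (trunc (lshr (mul (zext a), (zext b)), 16)) pattern should select a single pmulhuw.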
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhuw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

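; The sign-extended form should select pmulhw. A logical shift is still correct here:
; the truncate keeps only bits 16-31 of each 32-bit product, so lshr and ashr agree.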
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}

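; The same unsigned pattern at the native 128-bit vector width.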
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

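; Signed variant of the 128-bit test.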
define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}

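; At 256 bits the operation splits into two xmm pmulhuws on SSE, but stays a single
; ymm vpmulhuw on AVX.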
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

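; Signed variant of the 256-bit test.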
define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}

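; At 512 bits: four xmm ops on SSE and two ymm ops on AVX2. AVX512F has no 512-bit
; word multiply, so the zmm operands are split into ymm halves and rejoined; only
; AVX512BW selects a single zmm vpmulhuw.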
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

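; Signed variant of the 512-bit test.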
define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}

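; At 1024 bits the SSE result no longer fits in return registers, so it is returned
; indirectly through the pointer passed in %rdi, and the second operand is loaded
; from the stack.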
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}

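; Signed variant of the 1024-bit test.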
define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}