; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; Unsigned high-half multiply: zext->mul->lshr 16->trunc should select PMULHUW.
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhuw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <4 x i16> %a to <4 x i32>
  %b1 = zext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}
; Signed high-half multiply: sext->mul->shift 16->trunc should select PMULHW.
; (lshr vs ashr is immaterial here: the trunc only keeps bits 16..31.)
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhw_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <4 x i16> %a to <4 x i32>
  %b1 = sext <4 x i16> %b to <4 x i32>
  %c = mul <4 x i32> %a1, %b1
  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %e = trunc <4 x i32> %d to <4 x i16>
  ret <4 x i16> %e
}
; Full-width v8i16 unsigned case: maps 1:1 onto a single PMULHUW.
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = zext <8 x i16> %a to <8 x i32>
  %b1 = zext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}
; Full-width v8i16 signed case: maps 1:1 onto a single PMULHW.
define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a1 = sext <8 x i16> %a to <8 x i32>
  %b1 = sext <8 x i16> %b to <8 x i32>
  %c = mul <8 x i32> %a1, %b1
  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <8 x i32> %d to <8 x i16>
  ret <8 x i16> %e
}
; v16i16 unsigned: two PMULHUWs on SSE (128-bit halves), one 256-bit op on AVX.
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm2, %xmm0
; SSE-NEXT:    pmulhuw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = zext <16 x i16> %a to <16 x i32>
  %b1 = zext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}
; v16i16 signed: two PMULHWs on SSE (128-bit halves), one 256-bit op on AVX.
define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm2, %xmm0
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a1 = sext <16 x i16> %a to <16 x i32>
  %b1 = sext <16 x i16> %b to <16 x i32>
  %c = mul <16 x i32> %a1, %b1
  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <16 x i32> %d to <16 x i16>
  ret <16 x i16> %e
}
; v32i16 unsigned: 4 ops on SSE, 2 on AVX2/AVX512F, and a single 512-bit
; VPMULHUW on AVX512BW (the only target with 512-bit i16 ops).
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhuw %xmm4, %xmm0
; SSE-NEXT:    pmulhuw %xmm5, %xmm1
; SSE-NEXT:    pmulhuw %xmm6, %xmm2
; SSE-NEXT:    pmulhuw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = zext <32 x i16> %a to <32 x i32>
  %b1 = zext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}
; v32i16 signed: 4 ops on SSE, 2 on AVX2/AVX512F, single 512-bit VPMULHW on
; AVX512BW.
define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw %xmm4, %xmm0
; SSE-NEXT:    pmulhw %xmm5, %xmm1
; SSE-NEXT:    pmulhw %xmm6, %xmm2
; SSE-NEXT:    pmulhw %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %a1 = sext <32 x i16> %a to <32 x i32>
  %b1 = sext <32 x i16> %b to <32 x i32>
  %c = mul <32 x i32> %a1, %b1
  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <32 x i32> %d to <32 x i16>
  ret <32 x i16> %e
}
; v64i16 unsigned: on SSE the result is returned via a hidden sret pointer
; (%rdi) with operands spilled to the stack; AVX2/AVX512F use 4 ymm ops,
; AVX512BW uses 2 zmm ops.
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = zext <64 x i16> %a to <64 x i32>
  %b1 = zext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}
; v64i16 signed: same lowering shape as the unsigned case, with PMULHW.
define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %a1 = sext <64 x i16> %a to <64 x i32>
  %b1 = sext <64 x i16> %b to <64 x i32>
  %c = mul <64 x i32> %a1, %b1
  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %e = trunc <64 x i32> %d to <64 x i16>
  ret <64 x i16> %e
}