; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=hexagon -mattr=+hvxv60,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V60 %s
; RUN: llc -march=hexagon -mattr=+hvxv65,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V65 %s
; RUN: llc -march=hexagon -mattr=+hvxv69,+hvx-length128b,-packets < %s | FileCheck --check-prefix=V69 %s
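
; These tests check lowering of the high half of a widening vector
; multiply (mulhs/mulhu) for 16-bit and 32-bit elements on HVX v60,
; v65, and v69.

; mulhs16: signed i16 high-multiply; all three targets expand it with a
; widening vmpy followed by vshuff/vpacko to pack the odd (high)
; halfwords of each 32-bit product.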
define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 {
; V60-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V60-NEXT: v1:0 = vshuff(v1,v0,r7)
; V60-NEXT: v0.h = vpacko(v1.w,v0.w)
; V65-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V65-NEXT: v1:0 = vshuff(v1,v0,r7)
; V65-NEXT: v0.h = vpacko(v1.w,v0.w)
; V69-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V69-NEXT: v1:0 = vshuff(v1,v0,r7)
; V69-NEXT: v0.h = vpacko(v1.w,v0.w)
  %v0 = sext <64 x i16> %a0 to <64 x i32>
  %v1 = sext <64 x i16> %a1 to <64 x i32>
  %v2 = mul <64 x i32> %v0, %v1
  %v3 = lshr <64 x i32> %v2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %v4 = trunc <64 x i32> %v3 to <64 x i16>
  ret <64 x i16> %v4
}
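
; mulhu16: unsigned i16 high-multiply. v60 and v65 use the same
; vmpy/vshuff/vpacko expansion; v69 selects a single
; vmpy(v0.uh,v1.uh):>>16.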
define <64 x i16> @mulhu16(<64 x i16> %a0, <64 x i16> %a1) #0 {
; V60-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh)
; V60-NEXT: v1:0 = vshuff(v1,v0,r7)
; V60-NEXT: v0.h = vpacko(v1.w,v0.w)
; V65-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh)
; V65-NEXT: v1:0 = vshuff(v1,v0,r7)
; V65-NEXT: v0.h = vpacko(v1.w,v0.w)
; V65-NEXT: jumpr r31
; V69-LABEL: mulhu16:
; V69-NEXT: v0.uh = vmpy(v0.uh,v1.uh):>>16
; V69-NEXT: jumpr r31
  %v0 = zext <64 x i16> %a0 to <64 x i32>
  %v1 = zext <64 x i16> %a1 to <64 x i32>
  %v2 = mul <64 x i32> %v0, %v1
  %v3 = lshr <64 x i32> %v2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %v4 = trunc <64 x i32> %v3 to <64 x i16>
  ret <64 x i16> %v4
}
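
; mulhs32: signed i32 high-multiply. v60 assembles the result from
; halfword partial products (vmpye/vasr/vmpy/vadd); v65 and v69 use the
; accumulating vmpye/vmpyo pair.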
define <32 x i32> @mulhs32(<32 x i32> %a0, <32 x i32> %a1) #0 {
; V60-LABEL: mulhs32:
; V60-NEXT: v2.w = vmpye(v1.w,v0.uh)
; V60-NEXT: v31.w = vasr(v0.w,r0)
; V60-NEXT: v3.w = vasr(v1.w,r0)
; V60-NEXT: v5:4.w = vmpy(v31.h,v1.uh)
; V60-NEXT: v31:30.w = vmpy(v31.h,v3.h)
; V60-NEXT: v7:6.w = vadd(v2.uh,v4.uh)
; V60-NEXT: v29:28.w = vadd(v2.h,v4.h)
; V60-NEXT: v29.w += vasr(v6.w,r0)
; V60-NEXT: v0.w = vadd(v29.w,v30.w)
; V60-NEXT: jumpr r31
; V65-LABEL: mulhs32:
; V65-NEXT: v3:2 = vmpye(v0.w,v1.uh)
; V65-NEXT: v3:2 += vmpyo(v0.w,v1.h)
; V65-NEXT: jumpr r31
; V69-LABEL: mulhs32:
; V69-NEXT: v3:2 = vmpye(v0.w,v1.uh)
; V69-NEXT: v3:2 += vmpyo(v0.w,v1.h)
; V69-NEXT: jumpr r31
  %v0 = sext <32 x i32> %a0 to <32 x i64>
  %v1 = sext <32 x i32> %a1 to <32 x i64>
  %v2 = mul <32 x i64> %v0, %v1
  %v3 = lshr <32 x i64> %v2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %v4 = trunc <32 x i64> %v3 to <32 x i32>
  ret <32 x i32> %v4
}
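
; mulhu32: unsigned i32 high-multiply. v60 expands it through unsigned
; halfword multiplies (vmpy/vdelta/vlsr/vadd); v65 and v69 form the
; signed high product with vmpye/vmpyo and then adjust for negative
; operands using the q0/q1 compare masks.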
define <32 x i32> @mulhu32(<32 x i32> %a0, <32 x i32> %a1) #0 {
; V60-LABEL: mulhu32:
; V60-NEXT: r0 = ##33686018
; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh)
; V60-NEXT: v4 = vsplat(r0)
; V60-NEXT: v2.uw = vlsr(v2.uw,r2)
; V60-NEXT: v31 = vdelta(v1,v4)
; V60-NEXT: v1:0.uw = vmpy(v0.uh,v31.uh)
; V60-NEXT: v1:0.w = vadd(v1.uh,v0.uh)
; V60-NEXT: v0.w = vadd(v0.w,v2.w)
; V60-NEXT: v1.w += vasr(v0.w,r2)
; V60-NEXT: v0.w = vadd(v3.w,v1.w)
; V60-NEXT: jumpr r31
; V65-LABEL: mulhu32:
; V65-NEXT: v2 = vxor(v2,v2)
; V65-NEXT: v5:4 = vmpye(v0.w,v1.uh)
; V65-NEXT: q0 = vcmp.gt(v2.w,v0.w)
; V65-NEXT: q1 = vcmp.gt(v2.w,v1.w)
; V65-NEXT: v5:4 += vmpyo(v0.w,v1.h)
; V65-NEXT: v31 = vand(q0,v1)
; V65-NEXT: if (q1) v31.w += v0.w
; V65-NEXT: v0.w = vadd(v5.w,v31.w)
; V65-NEXT: jumpr r31
; V69-LABEL: mulhu32:
; V69-NEXT: v2 = vxor(v2,v2)
; V69-NEXT: v5:4 = vmpye(v0.w,v1.uh)
; V69-NEXT: q0 = vcmp.gt(v2.w,v0.w)
; V69-NEXT: q1 = vcmp.gt(v2.w,v1.w)
; V69-NEXT: v5:4 += vmpyo(v0.w,v1.h)
; V69-NEXT: v31 = vand(q0,v1)
; V69-NEXT: if (q1) v31.w += v0.w
; V69-NEXT: v0.w = vadd(v5.w,v31.w)
; V69-NEXT: jumpr r31
  %v0 = zext <32 x i32> %a0 to <32 x i64>
  %v1 = zext <32 x i32> %a1 to <32 x i64>
  %v2 = mul <32 x i64> %v0, %v1
  %v3 = lshr <32 x i64> %v2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %v4 = trunc <32 x i64> %v3 to <32 x i32>
  ret <32 x i32> %v4
}

attributes #0 = { nounwind memory(none) }