1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
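; Note: the 2048-bit run shares the VBITS_GE_512 prefixes because, for the
; types exercised here, any minimum width of 512 bits or more is expected to
; produce the same code as the 512-bit configuration.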
6 ; This test only tests the legal types for a given vector width, as mulh nodes
7 ; do not get generated for non-legal types.
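; Each function builds the usual high-half multiply idiom: the operands are
; sign- or zero-extended to twice the element width, multiplied, shifted right
; by the original element width and truncated back, which is expected to be
; matched as a single SMULH/UMULH operation.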
9 target triple = "aarch64-unknown-linux-gnu"
11 ;
12 ; SMULH
13 ;
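; vscale_range(N,0) asserts that the SVE registers are at least N x 128 bits
; wide, allowing a fixed-length vector of up to N x 128 bits to be handled by
; a single predicated SVE operation with the appropriate vl.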
15 ; Don't use SVE for 64-bit vectors.
16 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
17 ; CHECK-LABEL: smulh_v8i8:
19 ; CHECK-NEXT: ptrue p0.b, vl8
20 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
21 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
22 ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
23 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
25 %insert = insertelement <8 x i16> undef, i16 8, i64 0
26 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
27 %1 = sext <8 x i8> %op1 to <8 x i16>
28 %2 = sext <8 x i8> %op2 to <8 x i16>
29 %mul = mul <8 x i16> %1, %2
30 %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
31 %res = trunc <8 x i16> %shr to <8 x i8>
32 ret <8 x i8> %res
33 }
35 ; Don't use SVE for 128-bit vectors.
36 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
37 ; CHECK-LABEL: smulh_v16i8:
39 ; CHECK-NEXT: ptrue p0.b, vl16
40 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
41 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
42 ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
43 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
45 %1 = sext <16 x i8> %op1 to <16 x i16>
46 %2 = sext <16 x i8> %op2 to <16 x i16>
47 %mul = mul <16 x i16> %1, %2
48 %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
49 %res = trunc <16 x i16> %shr to <16 x i8>
50 ret <16 x i8> %res
51 }
53 define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
54 ; CHECK-LABEL: smulh_v32i8:
56 ; CHECK-NEXT: ptrue p0.b, vl32
57 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
58 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
59 ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
60 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
62 %op1 = load <32 x i8>, ptr %a
63 %op2 = load <32 x i8>, ptr %b
64 %1 = sext <32 x i8> %op1 to <32 x i16>
65 %2 = sext <32 x i8> %op2 to <32 x i16>
66 %mul = mul <32 x i16> %1, %2
67 %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
68 %res = trunc <32 x i16> %shr to <32 x i8>
69 store <32 x i8> %res, ptr %a
70 ret void
71 }
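; Functions without a vscale_range attribute are checked for both widths; with
; only 256-bit vectors the 512-bit data is processed as two halves, the upper
; half addressed through an offset register and the second multiply set up
; with movprfx.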
73 define void @smulh_v64i8(ptr %a, ptr %b) #0 {
74 ; VBITS_GE_256-LABEL: smulh_v64i8:
75 ; VBITS_GE_256: // %bb.0:
76 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
77 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
78 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
79 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
80 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
81 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
82 ; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b
83 ; VBITS_GE_256-NEXT: movprfx z1, z2
84 ; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b
85 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
86 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
87 ; VBITS_GE_256-NEXT: ret
89 ; VBITS_GE_512-LABEL: smulh_v64i8:
90 ; VBITS_GE_512: // %bb.0:
91 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
92 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
93 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
94 ; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b
95 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
96 ; VBITS_GE_512-NEXT: ret
97 %op1 = load <64 x i8>, ptr %a
98 %op2 = load <64 x i8>, ptr %b
99 %insert = insertelement <64 x i16> undef, i16 8, i64 0
100 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
101 %1 = sext <64 x i8> %op1 to <64 x i16>
102 %2 = sext <64 x i8> %op2 to <64 x i16>
103 %mul = mul <64 x i16> %1, %2
104 %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
105 %res = trunc <64 x i16> %shr to <64 x i8>
106 store <64 x i8> %res, ptr %a
107 ret void
108 }
110 define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
111 ; CHECK-LABEL: smulh_v128i8:
113 ; CHECK-NEXT: ptrue p0.b, vl128
114 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
115 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
116 ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
117 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
119 %op1 = load <128 x i8>, ptr %a
120 %op2 = load <128 x i8>, ptr %b
121 %1 = sext <128 x i8> %op1 to <128 x i16>
122 %2 = sext <128 x i8> %op2 to <128 x i16>
123 %mul = mul <128 x i16> %1, %2
124 %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
125 %res = trunc <128 x i16> %shr to <128 x i8>
126 store <128 x i8> %res, ptr %a
127 ret void
128 }
130 define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
131 ; CHECK-LABEL: smulh_v256i8:
133 ; CHECK-NEXT: ptrue p0.b, vl256
134 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
135 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
136 ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
137 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
139 %op1 = load <256 x i8>, ptr %a
140 %op2 = load <256 x i8>, ptr %b
141 %1 = sext <256 x i8> %op1 to <256 x i16>
142 %2 = sext <256 x i8> %op2 to <256 x i16>
143 %mul = mul <256 x i16> %1, %2
144 %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
145 %res = trunc <256 x i16> %shr to <256 x i8>
146 store <256 x i8> %res, ptr %a
147 ret void
148 }
150 ; Don't use SVE for 64-bit vectors.
151 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
152 ; CHECK-LABEL: smulh_v4i16:
154 ; CHECK-NEXT: ptrue p0.h, vl4
155 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
156 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
157 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
158 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
160 %1 = sext <4 x i16> %op1 to <4 x i32>
161 %2 = sext <4 x i16> %op2 to <4 x i32>
162 %mul = mul <4 x i32> %1, %2
163 %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
164 %res = trunc <4 x i32> %shr to <4 x i16>
165 ret <4 x i16> %res
166 }
168 ; Don't use SVE for 128-bit vectors.
169 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
170 ; CHECK-LABEL: smulh_v8i16:
172 ; CHECK-NEXT: ptrue p0.h, vl8
173 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
174 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
175 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
176 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
178 %1 = sext <8 x i16> %op1 to <8 x i32>
179 %2 = sext <8 x i16> %op2 to <8 x i32>
180 %mul = mul <8 x i32> %1, %2
181 %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
182 %res = trunc <8 x i32> %shr to <8 x i16>
183 ret <8 x i16> %res
184 }
186 define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
187 ; CHECK-LABEL: smulh_v16i16:
189 ; CHECK-NEXT: ptrue p0.h, vl16
190 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
191 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
192 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
193 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
195 %op1 = load <16 x i16>, ptr %a
196 %op2 = load <16 x i16>, ptr %b
197 %1 = sext <16 x i16> %op1 to <16 x i32>
198 %2 = sext <16 x i16> %op2 to <16 x i32>
199 %mul = mul <16 x i32> %1, %2
200 %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
201 %res = trunc <16 x i32> %shr to <16 x i16>
202 store <16 x i16> %res, ptr %a
203 ret void
204 }
206 define void @smulh_v32i16(ptr %a, ptr %b) #0 {
207 ; VBITS_GE_256-LABEL: smulh_v32i16:
208 ; VBITS_GE_256: // %bb.0:
209 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
210 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
211 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
212 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
213 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
214 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
215 ; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h
216 ; VBITS_GE_256-NEXT: movprfx z1, z2
217 ; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h
218 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
219 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
220 ; VBITS_GE_256-NEXT: ret
222 ; VBITS_GE_512-LABEL: smulh_v32i16:
223 ; VBITS_GE_512: // %bb.0:
224 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
225 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
226 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
227 ; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h
228 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
229 ; VBITS_GE_512-NEXT: ret
230 %op1 = load <32 x i16>, ptr %a
231 %op2 = load <32 x i16>, ptr %b
232 %1 = sext <32 x i16> %op1 to <32 x i32>
233 %2 = sext <32 x i16> %op2 to <32 x i32>
234 %mul = mul <32 x i32> %1, %2
235 %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
236 %res = trunc <32 x i32> %shr to <32 x i16>
237 store <32 x i16> %res, ptr %a
238 ret void
239 }
241 define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
242 ; CHECK-LABEL: smulh_v64i16:
244 ; CHECK-NEXT: ptrue p0.h, vl64
245 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
246 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
247 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
248 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
250 %op1 = load <64 x i16>, ptr %a
251 %op2 = load <64 x i16>, ptr %b
252 %1 = sext <64 x i16> %op1 to <64 x i32>
253 %2 = sext <64 x i16> %op2 to <64 x i32>
254 %mul = mul <64 x i32> %1, %2
255 %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
256 %res = trunc <64 x i32> %shr to <64 x i16>
257 store <64 x i16> %res, ptr %a
258 ret void
259 }
261 define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
262 ; CHECK-LABEL: smulh_v128i16:
264 ; CHECK-NEXT: ptrue p0.h, vl128
265 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
266 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
267 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
268 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
270 %op1 = load <128 x i16>, ptr %a
271 %op2 = load <128 x i16>, ptr %b
272 %1 = sext <128 x i16> %op1 to <128 x i32>
273 %2 = sext <128 x i16> %op2 to <128 x i32>
274 %mul = mul <128 x i32> %1, %2
275 %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
276 %res = trunc <128 x i32> %shr to <128 x i16>
277 store <128 x i16> %res, ptr %a
278 ret void
279 }
281 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
282 define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
283 ; CHECK-LABEL: smulh_v2i32:
285 ; CHECK-NEXT: ptrue p0.s, vl2
286 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
287 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
288 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
289 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
291 %1 = sext <2 x i32> %op1 to <2 x i64>
292 %2 = sext <2 x i32> %op2 to <2 x i64>
293 %mul = mul <2 x i64> %1, %2
294 %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
295 %res = trunc <2 x i64> %shr to <2 x i32>
296 ret <2 x i32> %res
297 }
299 ; Don't use SVE for 128-bit vectors.
300 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
301 ; CHECK-LABEL: smulh_v4i32:
303 ; CHECK-NEXT: ptrue p0.s, vl4
304 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
305 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
306 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
307 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
309 %1 = sext <4 x i32> %op1 to <4 x i64>
310 %2 = sext <4 x i32> %op2 to <4 x i64>
311 %mul = mul <4 x i64> %1, %2
312 %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
313 %res = trunc <4 x i64> %shr to <4 x i32>
314 ret <4 x i32> %res
315 }
317 define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
318 ; CHECK-LABEL: smulh_v8i32:
320 ; CHECK-NEXT: ptrue p0.s, vl8
321 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
322 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
323 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
324 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
326 %op1 = load <8 x i32>, ptr %a
327 %op2 = load <8 x i32>, ptr %b
328 %1 = sext <8 x i32> %op1 to <8 x i64>
329 %2 = sext <8 x i32> %op2 to <8 x i64>
330 %mul = mul <8 x i64> %1, %2
331 %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
332 %res = trunc <8 x i64> %shr to <8 x i32>
333 store <8 x i32> %res, ptr %a
334 ret void
335 }
337 define void @smulh_v16i32(ptr %a, ptr %b) #0 {
338 ; VBITS_GE_256-LABEL: smulh_v16i32:
339 ; VBITS_GE_256: // %bb.0:
340 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
341 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
342 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
343 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
344 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
345 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
346 ; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s
347 ; VBITS_GE_256-NEXT: movprfx z1, z2
348 ; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s
349 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
350 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
351 ; VBITS_GE_256-NEXT: ret
353 ; VBITS_GE_512-LABEL: smulh_v16i32:
354 ; VBITS_GE_512: // %bb.0:
355 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
356 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
357 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
358 ; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s
359 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
360 ; VBITS_GE_512-NEXT: ret
361 %op1 = load <16 x i32>, ptr %a
362 %op2 = load <16 x i32>, ptr %b
363 %1 = sext <16 x i32> %op1 to <16 x i64>
364 %2 = sext <16 x i32> %op2 to <16 x i64>
365 %mul = mul <16 x i64> %1, %2
366 %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
367 %res = trunc <16 x i64> %shr to <16 x i32>
368 store <16 x i32> %res, ptr %a
369 ret void
370 }
372 define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
373 ; CHECK-LABEL: smulh_v32i32:
375 ; CHECK-NEXT: ptrue p0.s, vl32
376 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
377 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
378 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
379 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
381 %op1 = load <32 x i32>, ptr %a
382 %op2 = load <32 x i32>, ptr %b
383 %1 = sext <32 x i32> %op1 to <32 x i64>
384 %2 = sext <32 x i32> %op2 to <32 x i64>
385 %mul = mul <32 x i64> %1, %2
386 %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
387 %res = trunc <32 x i64> %shr to <32 x i32>
388 store <32 x i32> %res, ptr %a
389 ret void
390 }
392 define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
393 ; CHECK-LABEL: smulh_v64i32:
395 ; CHECK-NEXT: ptrue p0.s, vl64
396 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
397 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
398 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
399 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
401 %op1 = load <64 x i32>, ptr %a
402 %op2 = load <64 x i32>, ptr %b
403 %1 = sext <64 x i32> %op1 to <64 x i64>
404 %2 = sext <64 x i32> %op2 to <64 x i64>
405 %mul = mul <64 x i64> %1, %2
406 %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
407 %res = trunc <64 x i64> %shr to <64 x i32>
408 store <64 x i32> %res, ptr %a
409 ret void
410 }
412 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
413 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
414 ; CHECK-LABEL: smulh_v1i64:
416 ; CHECK-NEXT: ptrue p0.d, vl1
417 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
418 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
419 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
420 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
422 %insert = insertelement <1 x i128> undef, i128 64, i128 0
423 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
424 %1 = sext <1 x i64> %op1 to <1 x i128>
425 %2 = sext <1 x i64> %op2 to <1 x i128>
426 %mul = mul <1 x i128> %1, %2
427 %shr = lshr <1 x i128> %mul, %splat
428 %res = trunc <1 x i128> %shr to <1 x i64>
429 ret <1 x i64> %res
430 }
432 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
433 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
434 ; CHECK-LABEL: smulh_v2i64:
436 ; CHECK-NEXT: ptrue p0.d, vl2
437 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
438 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
439 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
440 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
442 %1 = sext <2 x i64> %op1 to <2 x i128>
443 %2 = sext <2 x i64> %op2 to <2 x i128>
444 %mul = mul <2 x i128> %1, %2
445 %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
446 %res = trunc <2 x i128> %shr to <2 x i64>
447 ret <2 x i64> %res
448 }
450 define void @smulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
451 ; CHECK-LABEL: smulh_v4i64:
453 ; CHECK-NEXT: ptrue p0.d, vl4
454 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
455 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
456 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
457 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
459 %op1 = load <4 x i64>, ptr %a
460 %op2 = load <4 x i64>, ptr %b
461 %1 = sext <4 x i64> %op1 to <4 x i128>
462 %2 = sext <4 x i64> %op2 to <4 x i128>
463 %mul = mul <4 x i128> %1, %2
464 %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
465 %res = trunc <4 x i128> %shr to <4 x i64>
466 store <4 x i64> %res, ptr %a
467 ret void
468 }
470 define void @smulh_v8i64(ptr %a, ptr %b) #0 {
471 ; VBITS_GE_256-LABEL: smulh_v8i64:
472 ; VBITS_GE_256: // %bb.0:
473 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
474 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
475 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
476 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
477 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
478 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
479 ; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d
480 ; VBITS_GE_256-NEXT: movprfx z1, z2
481 ; VBITS_GE_256-NEXT: smulh z1.d, p0/m, z1.d, z3.d
482 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
483 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
484 ; VBITS_GE_256-NEXT: ret
486 ; VBITS_GE_512-LABEL: smulh_v8i64:
487 ; VBITS_GE_512: // %bb.0:
488 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
489 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
490 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
491 ; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d
492 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
493 ; VBITS_GE_512-NEXT: ret
494 %op1 = load <8 x i64>, ptr %a
495 %op2 = load <8 x i64>, ptr %b
496 %1 = sext <8 x i64> %op1 to <8 x i128>
497 %2 = sext <8 x i64> %op2 to <8 x i128>
498 %mul = mul <8 x i128> %1, %2
499 %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
500 %res = trunc <8 x i128> %shr to <8 x i64>
501 store <8 x i64> %res, ptr %a
502 ret void
503 }
505 define void @smulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
506 ; CHECK-LABEL: smulh_v16i64:
508 ; CHECK-NEXT: ptrue p0.d, vl16
509 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
510 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
511 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
512 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
514 %op1 = load <16 x i64>, ptr %a
515 %op2 = load <16 x i64>, ptr %b
516 %1 = sext <16 x i64> %op1 to <16 x i128>
517 %2 = sext <16 x i64> %op2 to <16 x i128>
518 %mul = mul <16 x i128> %1, %2
519 %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
520 %res = trunc <16 x i128> %shr to <16 x i64>
521 store <16 x i64> %res, ptr %a
522 ret void
523 }
525 define void @smulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
526 ; CHECK-LABEL: smulh_v32i64:
528 ; CHECK-NEXT: ptrue p0.d, vl32
529 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
530 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
531 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
532 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
534 %op1 = load <32 x i64>, ptr %a
535 %op2 = load <32 x i64>, ptr %b
536 %1 = sext <32 x i64> %op1 to <32 x i128>
537 %2 = sext <32 x i64> %op2 to <32 x i128>
538 %mul = mul <32 x i128> %1, %2
539 %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
540 %res = trunc <32 x i128> %shr to <32 x i64>
541 store <32 x i64> %res, ptr %a
542 ret void
543 }
545 ;
546 ; UMULH
547 ;
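; The UMULH tests below mirror the SMULH tests above, using zero-extension so
; that the unsigned high half of the product is computed.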
549 ; Don't use SVE for 64-bit vectors.
550 ; FIXME: The codegen for the >=256 bits case can be improved.
551 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
552 ; CHECK-LABEL: umulh_v8i8:
554 ; CHECK-NEXT: ptrue p0.b, vl8
555 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
556 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
557 ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
558 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
560 %1 = zext <8 x i8> %op1 to <8 x i16>
561 %2 = zext <8 x i8> %op2 to <8 x i16>
562 %mul = mul <8 x i16> %1, %2
563 %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
564 %res = trunc <8 x i16> %shr to <8 x i8>
565 ret <8 x i8> %res
566 }
568 ; Don't use SVE for 128-bit vectors.
569 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
570 ; CHECK-LABEL: umulh_v16i8:
572 ; CHECK-NEXT: ptrue p0.b, vl16
573 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
574 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
575 ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
576 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
578 %1 = zext <16 x i8> %op1 to <16 x i16>
579 %2 = zext <16 x i8> %op2 to <16 x i16>
580 %mul = mul <16 x i16> %1, %2
581 %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
582 %res = trunc <16 x i16> %shr to <16 x i8>
583 ret <16 x i8> %res
584 }
586 define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
587 ; CHECK-LABEL: umulh_v32i8:
589 ; CHECK-NEXT: ptrue p0.b, vl32
590 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
591 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
592 ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
593 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
595 %op1 = load <32 x i8>, ptr %a
596 %op2 = load <32 x i8>, ptr %b
597 %1 = zext <32 x i8> %op1 to <32 x i16>
598 %2 = zext <32 x i8> %op2 to <32 x i16>
599 %mul = mul <32 x i16> %1, %2
600 %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
601 %res = trunc <32 x i16> %shr to <32 x i8>
602 store <32 x i8> %res, ptr %a
603 ret void
604 }
606 define void @umulh_v64i8(ptr %a, ptr %b) #0 {
607 ; VBITS_GE_256-LABEL: umulh_v64i8:
608 ; VBITS_GE_256: // %bb.0:
609 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
610 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
611 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
612 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
613 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
614 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
615 ; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b
616 ; VBITS_GE_256-NEXT: movprfx z1, z2
617 ; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b
618 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
619 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
620 ; VBITS_GE_256-NEXT: ret
622 ; VBITS_GE_512-LABEL: umulh_v64i8:
623 ; VBITS_GE_512: // %bb.0:
624 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
625 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
626 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
627 ; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
628 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
629 ; VBITS_GE_512-NEXT: ret
630 %op1 = load <64 x i8>, ptr %a
631 %op2 = load <64 x i8>, ptr %b
632 %1 = zext <64 x i8> %op1 to <64 x i16>
633 %2 = zext <64 x i8> %op2 to <64 x i16>
634 %mul = mul <64 x i16> %1, %2
635 %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
636 %res = trunc <64 x i16> %shr to <64 x i8>
637 store <64 x i8> %res, ptr %a
638 ret void
639 }
641 define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
642 ; CHECK-LABEL: umulh_v128i8:
644 ; CHECK-NEXT: ptrue p0.b, vl128
645 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
646 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
647 ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
648 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
650 %op1 = load <128 x i8>, ptr %a
651 %op2 = load <128 x i8>, ptr %b
652 %insert = insertelement <128 x i16> undef, i16 8, i64 0
653 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
654 %1 = zext <128 x i8> %op1 to <128 x i16>
655 %2 = zext <128 x i8> %op2 to <128 x i16>
656 %mul = mul <128 x i16> %1, %2
657 %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
658 %res = trunc <128 x i16> %shr to <128 x i8>
659 store <128 x i8> %res, ptr %a
660 ret void
661 }
663 define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
664 ; CHECK-LABEL: umulh_v256i8:
666 ; CHECK-NEXT: ptrue p0.b, vl256
667 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
668 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
669 ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
670 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
672 %op1 = load <256 x i8>, ptr %a
673 %op2 = load <256 x i8>, ptr %b
674 %1 = zext <256 x i8> %op1 to <256 x i16>
675 %2 = zext <256 x i8> %op2 to <256 x i16>
676 %mul = mul <256 x i16> %1, %2
677 %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
678 %res = trunc <256 x i16> %shr to <256 x i8>
679 store <256 x i8> %res, ptr %a
680 ret void
681 }
683 ; Don't use SVE for 64-bit vectors.
684 ; FIXME: The codegen for the >=256 bits case can be improved.
685 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
686 ; CHECK-LABEL: umulh_v4i16:
688 ; CHECK-NEXT: ptrue p0.h, vl4
689 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
690 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
691 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
692 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
694 %1 = zext <4 x i16> %op1 to <4 x i32>
695 %2 = zext <4 x i16> %op2 to <4 x i32>
696 %mul = mul <4 x i32> %1, %2
697 %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
698 %res = trunc <4 x i32> %shr to <4 x i16>
699 ret <4 x i16> %res
700 }
702 ; Don't use SVE for 128-bit vectors.
703 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
704 ; CHECK-LABEL: umulh_v8i16:
706 ; CHECK-NEXT: ptrue p0.h, vl8
707 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
708 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
709 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
710 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
712 %1 = zext <8 x i16> %op1 to <8 x i32>
713 %2 = zext <8 x i16> %op2 to <8 x i32>
714 %mul = mul <8 x i32> %1, %2
715 %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
716 %res = trunc <8 x i32> %shr to <8 x i16>
717 ret <8 x i16> %res
718 }
720 define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
721 ; CHECK-LABEL: umulh_v16i16:
723 ; CHECK-NEXT: ptrue p0.h, vl16
724 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
725 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
726 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
727 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
729 %op1 = load <16 x i16>, ptr %a
730 %op2 = load <16 x i16>, ptr %b
731 %1 = zext <16 x i16> %op1 to <16 x i32>
732 %2 = zext <16 x i16> %op2 to <16 x i32>
733 %mul = mul <16 x i32> %1, %2
734 %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
735 %res = trunc <16 x i32> %shr to <16 x i16>
736 store <16 x i16> %res, ptr %a
737 ret void
738 }
740 define void @umulh_v32i16(ptr %a, ptr %b) #0 {
741 ; VBITS_GE_256-LABEL: umulh_v32i16:
742 ; VBITS_GE_256: // %bb.0:
743 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
744 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
745 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
746 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
747 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
748 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
749 ; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h
750 ; VBITS_GE_256-NEXT: movprfx z1, z2
751 ; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h
752 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
753 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
754 ; VBITS_GE_256-NEXT: ret
756 ; VBITS_GE_512-LABEL: umulh_v32i16:
757 ; VBITS_GE_512: // %bb.0:
758 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
759 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
760 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
761 ; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h
762 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
763 ; VBITS_GE_512-NEXT: ret
764 %op1 = load <32 x i16>, ptr %a
765 %op2 = load <32 x i16>, ptr %b
766 %1 = zext <32 x i16> %op1 to <32 x i32>
767 %2 = zext <32 x i16> %op2 to <32 x i32>
768 %mul = mul <32 x i32> %1, %2
769 %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
770 %res = trunc <32 x i32> %shr to <32 x i16>
771 store <32 x i16> %res, ptr %a
772 ret void
773 }
775 define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
776 ; CHECK-LABEL: umulh_v64i16:
778 ; CHECK-NEXT: ptrue p0.h, vl64
779 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
780 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
781 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
782 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
784 %op1 = load <64 x i16>, ptr %a
785 %op2 = load <64 x i16>, ptr %b
786 %1 = zext <64 x i16> %op1 to <64 x i32>
787 %2 = zext <64 x i16> %op2 to <64 x i32>
788 %mul = mul <64 x i32> %1, %2
789 %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
790 %res = trunc <64 x i32> %shr to <64 x i16>
791 store <64 x i16> %res, ptr %a
792 ret void
793 }
795 define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
796 ; CHECK-LABEL: umulh_v128i16:
798 ; CHECK-NEXT: ptrue p0.h, vl128
799 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
800 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
801 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
802 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
804 %op1 = load <128 x i16>, ptr %a
805 %op2 = load <128 x i16>, ptr %b
806 %1 = zext <128 x i16> %op1 to <128 x i32>
807 %2 = zext <128 x i16> %op2 to <128 x i32>
808 %mul = mul <128 x i32> %1, %2
809 %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
810 %res = trunc <128 x i32> %shr to <128 x i16>
811 store <128 x i16> %res, ptr %a
812 ret void
813 }
815 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
816 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
817 ; CHECK-LABEL: umulh_v2i32:
819 ; CHECK-NEXT: ptrue p0.s, vl2
820 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
821 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
822 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
823 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
825 %1 = zext <2 x i32> %op1 to <2 x i64>
826 %2 = zext <2 x i32> %op2 to <2 x i64>
827 %mul = mul <2 x i64> %1, %2
828 %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
829 %res = trunc <2 x i64> %shr to <2 x i32>
830 ret <2 x i32> %res
831 }
833 ; Don't use SVE for 128-bit vectors.
834 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
835 ; CHECK-LABEL: umulh_v4i32:
837 ; CHECK-NEXT: ptrue p0.s, vl4
838 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
839 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
840 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
841 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
843 %1 = zext <4 x i32> %op1 to <4 x i64>
844 %2 = zext <4 x i32> %op2 to <4 x i64>
845 %mul = mul <4 x i64> %1, %2
846 %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
847 %res = trunc <4 x i64> %shr to <4 x i32>
848 ret <4 x i32> %res
849 }
851 define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
852 ; CHECK-LABEL: umulh_v8i32:
854 ; CHECK-NEXT: ptrue p0.s, vl8
855 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
856 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
857 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
858 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
860 %op1 = load <8 x i32>, ptr %a
861 %op2 = load <8 x i32>, ptr %b
862 %insert = insertelement <8 x i64> undef, i64 32, i64 0
863 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
864 %1 = zext <8 x i32> %op1 to <8 x i64>
865 %2 = zext <8 x i32> %op2 to <8 x i64>
866 %mul = mul <8 x i64> %1, %2
867 %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
868 %res = trunc <8 x i64> %shr to <8 x i32>
869 store <8 x i32> %res, ptr %a
870 ret void
871 }
873 define void @umulh_v16i32(ptr %a, ptr %b) #0 {
874 ; VBITS_GE_256-LABEL: umulh_v16i32:
875 ; VBITS_GE_256: // %bb.0:
876 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
877 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
878 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
879 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
880 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
881 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
882 ; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s
883 ; VBITS_GE_256-NEXT: movprfx z1, z2
884 ; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s
885 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
886 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
887 ; VBITS_GE_256-NEXT: ret
889 ; VBITS_GE_512-LABEL: umulh_v16i32:
890 ; VBITS_GE_512: // %bb.0:
891 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
892 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
893 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
894 ; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s
895 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
896 ; VBITS_GE_512-NEXT: ret
897 %op1 = load <16 x i32>, ptr %a
898 %op2 = load <16 x i32>, ptr %b
899 %1 = zext <16 x i32> %op1 to <16 x i64>
900 %2 = zext <16 x i32> %op2 to <16 x i64>
901 %mul = mul <16 x i64> %1, %2
902 %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
903 %res = trunc <16 x i64> %shr to <16 x i32>
904 store <16 x i32> %res, ptr %a
905 ret void
906 }
908 define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
909 ; CHECK-LABEL: umulh_v32i32:
911 ; CHECK-NEXT: ptrue p0.s, vl32
912 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
913 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
914 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
915 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
917 %op1 = load <32 x i32>, ptr %a
918 %op2 = load <32 x i32>, ptr %b
919 %1 = zext <32 x i32> %op1 to <32 x i64>
920 %2 = zext <32 x i32> %op2 to <32 x i64>
921 %mul = mul <32 x i64> %1, %2
922 %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
923 %res = trunc <32 x i64> %shr to <32 x i32>
924 store <32 x i32> %res, ptr %a
925 ret void
926 }
928 define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
929 ; CHECK-LABEL: umulh_v64i32:
931 ; CHECK-NEXT: ptrue p0.s, vl64
932 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
933 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
934 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
935 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
937 %op1 = load <64 x i32>, ptr %a
938 %op2 = load <64 x i32>, ptr %b
939 %1 = zext <64 x i32> %op1 to <64 x i64>
940 %2 = zext <64 x i32> %op2 to <64 x i64>
941 %mul = mul <64 x i64> %1, %2
942 %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
943 %res = trunc <64 x i64> %shr to <64 x i32>
944 store <64 x i32> %res, ptr %a
945 ret void
946 }
948 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
949 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
950 ; CHECK-LABEL: umulh_v1i64:
952 ; CHECK-NEXT: ptrue p0.d, vl1
953 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
954 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
955 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
956 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
958 %1 = zext <1 x i64> %op1 to <1 x i128>
959 %2 = zext <1 x i64> %op2 to <1 x i128>
960 %mul = mul <1 x i128> %1, %2
961 %shr = lshr <1 x i128> %mul, <i128 64>
962 %res = trunc <1 x i128> %shr to <1 x i64>
963 ret <1 x i64> %res
964 }
966 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
967 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
968 ; CHECK-LABEL: umulh_v2i64:
970 ; CHECK-NEXT: ptrue p0.d, vl2
971 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
972 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
973 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
974 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
976 %1 = zext <2 x i64> %op1 to <2 x i128>
977 %2 = zext <2 x i64> %op2 to <2 x i128>
978 %mul = mul <2 x i128> %1, %2
979 %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
980 %res = trunc <2 x i128> %shr to <2 x i64>
981 ret <2 x i64> %res
982 }
984 define void @umulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
985 ; CHECK-LABEL: umulh_v4i64:
987 ; CHECK-NEXT: ptrue p0.d, vl4
988 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
989 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
990 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
991 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
993 %op1 = load <4 x i64>, ptr %a
994 %op2 = load <4 x i64>, ptr %b
995 %1 = zext <4 x i64> %op1 to <4 x i128>
996 %2 = zext <4 x i64> %op2 to <4 x i128>
997 %mul = mul <4 x i128> %1, %2
998 %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
999 %res = trunc <4 x i128> %shr to <4 x i64>
1000 store <4 x i64> %res, ptr %a
1001 ret void
1002 }
1004 define void @umulh_v8i64(ptr %a, ptr %b) #0 {
1005 ; VBITS_GE_256-LABEL: umulh_v8i64:
1006 ; VBITS_GE_256: // %bb.0:
1007 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1008 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1009 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1010 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
1011 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
1012 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
1013 ; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d
1014 ; VBITS_GE_256-NEXT: movprfx z1, z2
1015 ; VBITS_GE_256-NEXT: umulh z1.d, p0/m, z1.d, z3.d
1016 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1017 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1018 ; VBITS_GE_256-NEXT: ret
1020 ; VBITS_GE_512-LABEL: umulh_v8i64:
1021 ; VBITS_GE_512: // %bb.0:
1022 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1023 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1024 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1025 ; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d
1026 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1027 ; VBITS_GE_512-NEXT: ret
1028 %op1 = load <8 x i64>, ptr %a
1029 %op2 = load <8 x i64>, ptr %b
1030 %1 = zext <8 x i64> %op1 to <8 x i128>
1031 %2 = zext <8 x i64> %op2 to <8 x i128>
1032 %mul = mul <8 x i128> %1, %2
1033 %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
1034 %res = trunc <8 x i128> %shr to <8 x i64>
1035 store <8 x i64> %res, ptr %a
1036 ret void
1037 }
1039 define void @umulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1040 ; CHECK-LABEL: umulh_v16i64:
1042 ; CHECK-NEXT: ptrue p0.d, vl16
1043 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1044 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1045 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
1046 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1048 %op1 = load <16 x i64>, ptr %a
1049 %op2 = load <16 x i64>, ptr %b
1050 %1 = zext <16 x i64> %op1 to <16 x i128>
1051 %2 = zext <16 x i64> %op2 to <16 x i128>
1052 %mul = mul <16 x i128> %1, %2
1053 %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
1054 %res = trunc <16 x i128> %shr to <16 x i64>
1055 store <16 x i64> %res, ptr %a
1056 ret void
1057 }
1059 define void @umulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1060 ; CHECK-LABEL: umulh_v32i64:
1062 ; CHECK-NEXT: ptrue p0.d, vl32
1063 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1064 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1065 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
1066 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1068 %op1 = load <32 x i64>, ptr %a
1069 %op2 = load <32 x i64>, ptr %b
1070 %1 = zext <32 x i64> %op1 to <32 x i128>
1071 %2 = zext <32 x i64> %op2 to <32 x i128>
1072 %mul = mul <32 x i128> %1, %2
1073 %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
1074 %res = trunc <32 x i128> %shr to <32 x i64>
1075 store <32 x i64> %res, ptr %a
1076 ret void
1077 }
1078 attributes #0 = { "target-features"="+sve" }