1 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
4 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512
5 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
6 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
7 ; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
8 ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024
9 ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
10 ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
11 ; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
12 ; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
13 ; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
14 ; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
15 ; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
16 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
18 ; VBYTES represents the useful byte size of a vector register from the code
19 ; generator's point of view. It is clamped to power-of-2 values because
20 ; only power-of-2 vector lengths are considered legal, regardless of the
21 ; user specified vector length.
23 ; This test only tests the legal types for a given vector width, as mulh nodes
24 ; do not get generated for non-legal types.
26 target triple = "aarch64-unknown-linux-gnu"
28 ; Don't use SVE when its registers are no bigger than NEON.
35 ; Don't use SVE for 64-bit vectors.
36 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
37 ; CHECK-LABEL: smulh_v8i8:
38 ; CHECK: smull v0.8h, v0.8b, v1.8b
39 ; CHECK: ushr v1.8h, v0.8h, #8
40 ; CHECK: umov w8, v1.h[0]
42 ; CHECK: umov w8, v1.h[1]
43 ; CHECK: mov v0.b[1], w8
44 ; CHECK: umov w8, v1.h[2]
45 ; CHECK: mov v0.b[2], w8
46 ; CHECK: umov w8, v1.h[3]
47 ; CHECK: mov v0.b[3], w8
49 %insert = insertelement <8 x i16> undef, i16 8, i64 0
50 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
51 %1 = sext <8 x i8> %op1 to <8 x i16>
52 %2 = sext <8 x i8> %op2 to <8 x i16>
53 %mul = mul <8 x i16> %1, %2
54 %shr = lshr <8 x i16> %mul, %splat
55 %res = trunc <8 x i16> %shr to <8 x i8>
59 ; Don't use SVE for 128-bit vectors.
60 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
61 ; CHECK-LABEL: smulh_v16i8:
62 ; CHECK: smull2 v2.8h, v0.16b, v1.16b
63 ; CHECK: smull v0.8h, v0.8b, v1.8b
64 ; CHECK: uzp2 v0.16b, v0.16b, v2.16b
66 %insert = insertelement <16 x i16> undef, i16 8, i64 0
67 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
68 %1 = sext <16 x i8> %op1 to <16 x i16>
69 %2 = sext <16 x i8> %op2 to <16 x i16>
70 %mul = mul <16 x i16> %1, %2
71 %shr = lshr <16 x i16> %mul, %splat
72 %res = trunc <16 x i16> %shr to <16 x i8>
76 define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
77 ; CHECK-LABEL: smulh_v32i8:
78 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
79 ; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
80 ; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
81 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
84 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
85 ; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
86 ; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
87 ; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
88 ; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
89 ; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
91 %op1 = load <32 x i8>, <32 x i8>* %a
92 %op2 = load <32 x i8>, <32 x i8>* %b
93 %insert = insertelement <32 x i16> undef, i16 8, i64 0
94 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
95 %1 = sext <32 x i8> %op1 to <32 x i16>
96 %2 = sext <32 x i8> %op2 to <32 x i16>
97 %mul = mul <32 x i16> %1, %2
98 %shr = lshr <32 x i16> %mul, %splat
99 %res = trunc <32 x i16> %shr to <32 x i8>
100 store <32 x i8> %res, <32 x i8>* %a
104 define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
105 ; CHECK-LABEL: smulh_v64i8:
106 ; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
107 ; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
108 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
109 ; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
112 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
113 ; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
114 ; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
115 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
116 ; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
117 ; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
119 %op1 = load <64 x i8>, <64 x i8>* %a
120 %op2 = load <64 x i8>, <64 x i8>* %b
121 %insert = insertelement <64 x i16> undef, i16 8, i64 0
122 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
123 %1 = sext <64 x i8> %op1 to <64 x i16>
124 %2 = sext <64 x i8> %op2 to <64 x i16>
125 %mul = mul <64 x i16> %1, %2
126 %shr = lshr <64 x i16> %mul, %splat
127 %res = trunc <64 x i16> %shr to <64 x i8>
128 store <64 x i8> %res, <64 x i8>* %a
132 define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
133 ; CHECK-LABEL: smulh_v128i8:
134 ; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
135 ; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
136 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
137 ; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
140 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
141 ; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
142 ; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
143 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
144 ; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
145 ; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
147 %op1 = load <128 x i8>, <128 x i8>* %a
148 %op2 = load <128 x i8>, <128 x i8>* %b
149 %insert = insertelement <128 x i16> undef, i16 8, i64 0
150 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
151 %1 = sext <128 x i8> %op1 to <128 x i16>
152 %2 = sext <128 x i8> %op2 to <128 x i16>
153 %mul = mul <128 x i16> %1, %2
154 %shr = lshr <128 x i16> %mul, %splat
155 %res = trunc <128 x i16> %shr to <128 x i8>
156 store <128 x i8> %res, <128 x i8>* %a
160 define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
161 ; CHECK-LABEL: smulh_v256i8:
162 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
163 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
164 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
165 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
166 ; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0]
168 %op1 = load <256 x i8>, <256 x i8>* %a
169 %op2 = load <256 x i8>, <256 x i8>* %b
170 %insert = insertelement <256 x i16> undef, i16 8, i64 0
171 %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
172 %1 = sext <256 x i8> %op1 to <256 x i16>
173 %2 = sext <256 x i8> %op2 to <256 x i16>
174 %mul = mul <256 x i16> %1, %2
175 %shr = lshr <256 x i16> %mul, %splat
176 %res = trunc <256 x i16> %shr to <256 x i8>
177 store <256 x i8> %res, <256 x i8>* %a
181 ; Don't use SVE for 64-bit vectors.
182 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
183 ; CHECK-LABEL: smulh_v4i16:
184 ; CHECK: smull v0.4s, v0.4h, v1.4h
185 ; CHECK: ushr v0.4s, v0.4s, #16
186 ; CHECK: mov w8, v0.s[1]
187 ; CHECK: mov w9, v0.s[2]
188 ; CHECK: mov w10, v0.s[3]
189 ; CHECK: mov v0.h[1], w8
190 ; CHECK: mov v0.h[2], w9
191 ; CHECK: mov v0.h[3], w10
193 %insert = insertelement <4 x i32> undef, i32 16, i64 0
194 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
195 %1 = sext <4 x i16> %op1 to <4 x i32>
196 %2 = sext <4 x i16> %op2 to <4 x i32>
197 %mul = mul <4 x i32> %1, %2
198 %shr = lshr <4 x i32> %mul, %splat
199 %res = trunc <4 x i32> %shr to <4 x i16>
203 ; Don't use SVE for 128-bit vectors.
204 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
205 ; CHECK-LABEL: smulh_v8i16:
206 ; CHECK: smull2 v2.4s, v0.8h, v1.8h
207 ; CHECK: smull v0.4s, v0.4h, v1.4h
208 ; CHECK: uzp2 v0.8h, v0.8h, v2.8h
210 %insert = insertelement <8 x i32> undef, i32 16, i64 0
211 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
212 %1 = sext <8 x i16> %op1 to <8 x i32>
213 %2 = sext <8 x i16> %op2 to <8 x i32>
214 %mul = mul <8 x i32> %1, %2
215 %shr = lshr <8 x i32> %mul, %splat
216 %res = trunc <8 x i32> %shr to <8 x i16>
220 define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
221 ; CHECK-LABEL: smulh_v16i16:
222 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
223 ; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
224 ; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
225 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
226 ; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
229 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
230 ; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
231 ; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
232 ; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
233 ; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
234 ; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
236 %op1 = load <16 x i16>, <16 x i16>* %a
237 %op2 = load <16 x i16>, <16 x i16>* %b
238 %insert = insertelement <16 x i32> undef, i32 16, i64 0
239 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
240 %1 = sext <16 x i16> %op1 to <16 x i32>
241 %2 = sext <16 x i16> %op2 to <16 x i32>
242 %mul = mul <16 x i32> %1, %2
243 %shr = lshr <16 x i32> %mul, %splat
244 %res = trunc <16 x i32> %shr to <16 x i16>
245 store <16 x i16> %res, <16 x i16>* %a
249 define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
250 ; CHECK-LABEL: smulh_v32i16:
251 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
252 ; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
253 ; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
254 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
255 ; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
258 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
259 ; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
260 ; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
261 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
262 ; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
263 ; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
265 %op1 = load <32 x i16>, <32 x i16>* %a
266 %op2 = load <32 x i16>, <32 x i16>* %b
267 %insert = insertelement <32 x i32> undef, i32 16, i64 0
268 %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
269 %1 = sext <32 x i16> %op1 to <32 x i32>
270 %2 = sext <32 x i16> %op2 to <32 x i32>
271 %mul = mul <32 x i32> %1, %2
272 %shr = lshr <32 x i32> %mul, %splat
273 %res = trunc <32 x i32> %shr to <32 x i16>
274 store <32 x i16> %res, <32 x i16>* %a
278 define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
279 ; CHECK-LABEL: smulh_v64i16:
280 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
281 ; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
282 ; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
283 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
284 ; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
287 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
288 ; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
289 ; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
290 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
291 ; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
292 ; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
294 %op1 = load <64 x i16>, <64 x i16>* %a
295 %op2 = load <64 x i16>, <64 x i16>* %b
296 %insert = insertelement <64 x i32> undef, i32 16, i64 0
297 %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
298 %1 = sext <64 x i16> %op1 to <64 x i32>
299 %2 = sext <64 x i16> %op2 to <64 x i32>
300 %mul = mul <64 x i32> %1, %2
301 %shr = lshr <64 x i32> %mul, %splat
302 %res = trunc <64 x i32> %shr to <64 x i16>
303 store <64 x i16> %res, <64 x i16>* %a
307 define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
308 ; CHECK-LABEL: smulh_v128i16:
309 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
310 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
311 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
312 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
313 ; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0]
315 %op1 = load <128 x i16>, <128 x i16>* %a
316 %op2 = load <128 x i16>, <128 x i16>* %b
317 %insert = insertelement <128 x i32> undef, i32 16, i64 0
318 %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
319 %1 = sext <128 x i16> %op1 to <128 x i32>
320 %2 = sext <128 x i16> %op2 to <128 x i32>
321 %mul = mul <128 x i32> %1, %2
322 %shr = lshr <128 x i32> %mul, %splat
323 %res = trunc <128 x i32> %shr to <128 x i16>
324 store <128 x i16> %res, <128 x i16>* %a
328 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
329 define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
330 ; CHECK-LABEL: smulh_v2i32:
331 ; CHECK: sshll v0.2d, v0.2s, #0
332 ; CHECK: sshll v1.2d, v1.2s, #0
333 ; CHECK: ptrue p0.d, vl2
334 ; CHECK: mul z0.d, p0/m, z0.d, z1.d
335 ; CHECK: shrn v0.2s, v0.2d, #32
337 %insert = insertelement <2 x i64> undef, i64 32, i64 0
338 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
339 %1 = sext <2 x i32> %op1 to <2 x i64>
340 %2 = sext <2 x i32> %op2 to <2 x i64>
341 %mul = mul <2 x i64> %1, %2
342 %shr = lshr <2 x i64> %mul, %splat
343 %res = trunc <2 x i64> %shr to <2 x i32>
347 ; Don't use SVE for 128-bit vectors.
348 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
349 ; CHECK-LABEL: smulh_v4i32:
350 ; CHECK: smull2 v2.2d, v0.4s, v1.4s
351 ; CHECK: smull v0.2d, v0.2s, v1.2s
352 ; CHECK: uzp2 v0.4s, v0.4s, v2.4s
354 %insert = insertelement <4 x i64> undef, i64 32, i64 0
355 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
356 %1 = sext <4 x i32> %op1 to <4 x i64>
357 %2 = sext <4 x i32> %op2 to <4 x i64>
358 %mul = mul <4 x i64> %1, %2
359 %shr = lshr <4 x i64> %mul, %splat
360 %res = trunc <4 x i64> %shr to <4 x i32>
364 define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
365 ; CHECK-LABEL: smulh_v8i32:
366 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
367 ; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
368 ; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
369 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
370 ; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
373 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
374 ; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
375 ; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
376 ; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
377 ; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
378 ; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
380 %op1 = load <8 x i32>, <8 x i32>* %a
381 %op2 = load <8 x i32>, <8 x i32>* %b
382 %insert = insertelement <8 x i64> undef, i64 32, i64 0
383 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
384 %1 = sext <8 x i32> %op1 to <8 x i64>
385 %2 = sext <8 x i32> %op2 to <8 x i64>
386 %mul = mul <8 x i64> %1, %2
387 %shr = lshr <8 x i64> %mul, %splat
388 %res = trunc <8 x i64> %shr to <8 x i32>
389 store <8 x i32> %res, <8 x i32>* %a
393 define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
394 ; CHECK-LABEL: smulh_v16i32:
395 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
396 ; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
397 ; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
398 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
399 ; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
402 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
403 ; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
404 ; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
405 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
406 ; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
408 %op1 = load <16 x i32>, <16 x i32>* %a
409 %op2 = load <16 x i32>, <16 x i32>* %b
410 %insert = insertelement <16 x i64> undef, i64 32, i64 0
411 %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
412 %1 = sext <16 x i32> %op1 to <16 x i64>
413 %2 = sext <16 x i32> %op2 to <16 x i64>
414 %mul = mul <16 x i64> %1, %2
415 %shr = lshr <16 x i64> %mul, %splat
416 %res = trunc <16 x i64> %shr to <16 x i32>
417 store <16 x i32> %res, <16 x i32>* %a
421 define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
422 ; CHECK-LABEL: smulh_v32i32:
423 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
424 ; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
425 ; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
426 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
427 ; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
430 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
431 ; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
432 ; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
433 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
434 ; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
435 ; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
437 %op1 = load <32 x i32>, <32 x i32>* %a
438 %op2 = load <32 x i32>, <32 x i32>* %b
439 %insert = insertelement <32 x i64> undef, i64 32, i64 0
440 %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
441 %1 = sext <32 x i32> %op1 to <32 x i64>
442 %2 = sext <32 x i32> %op2 to <32 x i64>
443 %mul = mul <32 x i64> %1, %2
444 %shr = lshr <32 x i64> %mul, %splat
445 %res = trunc <32 x i64> %shr to <32 x i32>
446 store <32 x i32> %res, <32 x i32>* %a
450 define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
451 ; CHECK-LABEL: smulh_v64i32:
452 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
453 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
454 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
455 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
456 ; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0]
458 %op1 = load <64 x i32>, <64 x i32>* %a
459 %op2 = load <64 x i32>, <64 x i32>* %b
460 %insert = insertelement <64 x i64> undef, i64 32, i64 0
461 %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
462 %1 = sext <64 x i32> %op1 to <64 x i64>
463 %2 = sext <64 x i32> %op2 to <64 x i64>
464 %mul = mul <64 x i64> %1, %2
465 %shr = lshr <64 x i64> %mul, %splat
466 %res = trunc <64 x i64> %shr to <64 x i32>
467 store <64 x i32> %res, <64 x i32>* %a
471 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
472 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
473 ; CHECK-LABEL: smulh_v1i64:
474 ; CHECK: ptrue p0.d, vl1
475 ; CHECK: smulh z0.d, p0/m, z0.d, z1.d
477 %insert = insertelement <1 x i128> undef, i128 64, i128 0
478 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
479 %1 = sext <1 x i64> %op1 to <1 x i128>
480 %2 = sext <1 x i64> %op2 to <1 x i128>
481 %mul = mul <1 x i128> %1, %2
482 %shr = lshr <1 x i128> %mul, %splat
483 %res = trunc <1 x i128> %shr to <1 x i64>
487 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
488 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
489 ; CHECK-LABEL: smulh_v2i64:
490 ; CHECK: ptrue p0.d, vl2
491 ; CHECK: smulh z0.d, p0/m, z0.d, z1.d
493 %insert = insertelement <2 x i128> undef, i128 64, i128 0
494 %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
495 %1 = sext <2 x i64> %op1 to <2 x i128>
496 %2 = sext <2 x i64> %op2 to <2 x i128>
497 %mul = mul <2 x i128> %1, %2
498 %shr = lshr <2 x i128> %mul, %splat
499 %res = trunc <2 x i128> %shr to <2 x i64>
503 define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
504 ; CHECK-LABEL: smulh_v4i64:
505 ; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]]
506 ; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
507 ; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
508 ; VBITS_GE_256: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
509 ; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0]
511 %op1 = load <4 x i64>, <4 x i64>* %a
512 %op2 = load <4 x i64>, <4 x i64>* %b
513 %insert = insertelement <4 x i128> undef, i128 64, i128 0
514 %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
515 %1 = sext <4 x i64> %op1 to <4 x i128>
516 %2 = sext <4 x i64> %op2 to <4 x i128>
517 %mul = mul <4 x i128> %1, %2
518 %shr = lshr <4 x i128> %mul, %splat
519 %res = trunc <4 x i128> %shr to <4 x i64>
520 store <4 x i64> %res, <4 x i64>* %a
524 define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
525 ; CHECK-LABEL: smulh_v8i64:
526 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
527 ; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
528 ; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
529 ; VBITS_GE_512: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
530 ; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0]
532 %op1 = load <8 x i64>, <8 x i64>* %a
533 %op2 = load <8 x i64>, <8 x i64>* %b
534 %insert = insertelement <8 x i128> undef, i128 64, i128 0
535 %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
536 %1 = sext <8 x i64> %op1 to <8 x i128>
537 %2 = sext <8 x i64> %op2 to <8 x i128>
538 %mul = mul <8 x i128> %1, %2
539 %shr = lshr <8 x i128> %mul, %splat
540 %res = trunc <8 x i128> %shr to <8 x i64>
541 store <8 x i64> %res, <8 x i64>* %a
545 define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
546 ; CHECK-LABEL: smulh_v16i64:
547 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
548 ; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
549 ; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
550 ; VBITS_GE_1024: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
551 ; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0]
553 %op1 = load <16 x i64>, <16 x i64>* %a
554 %op2 = load <16 x i64>, <16 x i64>* %b
555 %insert = insertelement <16 x i128> undef, i128 64, i128 0
556 %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
557 %1 = sext <16 x i64> %op1 to <16 x i128>
558 %2 = sext <16 x i64> %op2 to <16 x i128>
559 %mul = mul <16 x i128> %1, %2
560 %shr = lshr <16 x i128> %mul, %splat
561 %res = trunc <16 x i128> %shr to <16 x i64>
562 store <16 x i64> %res, <16 x i64>* %a
566 define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
567 ; CHECK-LABEL: smulh_v32i64:
568 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
569 ; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
570 ; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
571 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
572 ; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0]
574 %op1 = load <32 x i64>, <32 x i64>* %a
575 %op2 = load <32 x i64>, <32 x i64>* %b
576 %insert = insertelement <32 x i128> undef, i128 64, i128 0
577 %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
578 %1 = sext <32 x i64> %op1 to <32 x i128>
579 %2 = sext <32 x i64> %op2 to <32 x i128>
580 %mul = mul <32 x i128> %1, %2
581 %shr = lshr <32 x i128> %mul, %splat
582 %res = trunc <32 x i128> %shr to <32 x i64>
583 store <32 x i64> %res, <32 x i64>* %a
591 ; Don't use SVE for 64-bit vectors.
592 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
593 ; CHECK-LABEL: umulh_v8i8:
594 ; CHECK: umull v0.8h, v0.8b, v1.8b
595 ; CHECK: ushr v1.8h, v0.8h, #8
596 ; CHECK: umov w8, v1.h[0]
598 ; CHECK: umov w8, v1.h[1]
599 ; CHECK: mov v0.b[1], w8
600 ; CHECK: umov w8, v1.h[2]
601 ; CHECK: mov v0.b[2], w8
602 ; CHECK: umov w8, v1.h[3]
603 ; CHECK: mov v0.b[3], w8
605 %insert = insertelement <8 x i16> undef, i16 8, i64 0
606 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
607 %1 = zext <8 x i8> %op1 to <8 x i16>
608 %2 = zext <8 x i8> %op2 to <8 x i16>
609 %mul = mul <8 x i16> %1, %2
610 %shr = lshr <8 x i16> %mul, %splat
611 %res = trunc <8 x i16> %shr to <8 x i8>
615 ; Don't use SVE for 128-bit vectors.
616 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
617 ; CHECK-LABEL: umulh_v16i8:
618 ; CHECK: umull2 v2.8h, v0.16b, v1.16b
619 ; CHECK: umull v0.8h, v0.8b, v1.8b
620 ; CHECK: uzp2 v0.16b, v0.16b, v2.16b
622 %insert = insertelement <16 x i16> undef, i16 8, i64 0
623 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
624 %1 = zext <16 x i8> %op1 to <16 x i16>
625 %2 = zext <16 x i8> %op2 to <16 x i16>
626 %mul = mul <16 x i16> %1, %2
627 %shr = lshr <16 x i16> %mul, %splat
628 %res = trunc <16 x i16> %shr to <16 x i8>
632 define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
633 ; CHECK-LABEL: umulh_v32i8:
634 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
635 ; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
636 ; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
637 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
638 ; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
641 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
642 ; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
643 ; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
644 ; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
645 ; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
646 ; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
648 %op1 = load <32 x i8>, <32 x i8>* %a
649 %op2 = load <32 x i8>, <32 x i8>* %b
650 %insert = insertelement <32 x i16> undef, i16 8, i64 0
651 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
652 %1 = zext <32 x i8> %op1 to <32 x i16>
653 %2 = zext <32 x i8> %op2 to <32 x i16>
654 %mul = mul <32 x i16> %1, %2
655 %shr = lshr <32 x i16> %mul, %splat
656 %res = trunc <32 x i16> %shr to <32 x i8>
657 store <32 x i8> %res, <32 x i8>* %a
661 define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
662 ; CHECK-LABEL: umulh_v64i8:
663 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
664 ; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
665 ; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
666 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
669 ; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
670 ; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
671 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
672 ; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
673 ; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
675 %op1 = load <64 x i8>, <64 x i8>* %a
676 %op2 = load <64 x i8>, <64 x i8>* %b
677 %insert = insertelement <64 x i16> undef, i16 8, i64 0
678 %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
679 %1 = zext <64 x i8> %op1 to <64 x i16>
680 %2 = zext <64 x i8> %op2 to <64 x i16>
681 %mul = mul <64 x i16> %1, %2
682 %shr = lshr <64 x i16> %mul, %splat
683 %res = trunc <64 x i16> %shr to <64 x i8>
684 store <64 x i8> %res, <64 x i8>* %a
688 define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
689 ; CHECK-LABEL: umulh_v128i8:
690 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
691 ; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
692 ; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
693 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
694 ; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
697 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
698 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
699 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
700 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
701 ; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
702 ; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
704 %op1 = load <128 x i8>, <128 x i8>* %a
705 %op2 = load <128 x i8>, <128 x i8>* %b
706 %insert = insertelement <128 x i16> undef, i16 8, i64 0
707 %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
708 %1 = zext <128 x i8> %op1 to <128 x i16>
709 %2 = zext <128 x i8> %op2 to <128 x i16>
710 %mul = mul <128 x i16> %1, %2
711 %shr = lshr <128 x i16> %mul, %splat
712 %res = trunc <128 x i16> %shr to <128 x i8>
713 store <128 x i8> %res, <128 x i8>* %a
717 define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
718 ; CHECK-LABEL: umulh_v256i8:
719 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
720 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
721 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
722 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
723 ; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0]
725 %op1 = load <256 x i8>, <256 x i8>* %a
726 %op2 = load <256 x i8>, <256 x i8>* %b
727 %insert = insertelement <256 x i16> undef, i16 8, i64 0
728 %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
729 %1 = zext <256 x i8> %op1 to <256 x i16>
730 %2 = zext <256 x i8> %op2 to <256 x i16>
731 %mul = mul <256 x i16> %1, %2
732 %shr = lshr <256 x i16> %mul, %splat
733 %res = trunc <256 x i16> %shr to <256 x i8>
734 store <256 x i8> %res, <256 x i8>* %a
738 ; Don't use SVE for 64-bit vectors.
739 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
740 ; CHECK-LABEL: umulh_v4i16:
741 ; CHECK: umull v0.4s, v0.4h, v1.4h
742 ; CHECK: ushr v0.4s, v0.4s, #16
743 ; CHECK: mov w8, v0.s[1]
744 ; CHECK: mov w9, v0.s[2]
745 ; CHECK: mov w10, v0.s[3]
746 ; CHECK: mov v0.h[1], w8
747 ; CHECK: mov v0.h[2], w9
748 ; CHECK: mov v0.h[3], w10
750 %insert = insertelement <4 x i32> undef, i32 16, i64 0
751 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
752 %1 = zext <4 x i16> %op1 to <4 x i32>
753 %2 = zext <4 x i16> %op2 to <4 x i32>
754 %mul = mul <4 x i32> %1, %2
755 %shr = lshr <4 x i32> %mul, %splat
756 %res = trunc <4 x i32> %shr to <4 x i16>
760 ; Don't use SVE for 128-bit vectors.
761 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
762 ; CHECK-LABEL: umulh_v8i16:
763 ; CHECK: umull2 v2.4s, v0.8h, v1.8h
764 ; CHECK: umull v0.4s, v0.4h, v1.4h
765 ; CHECK: uzp2 v0.8h, v0.8h, v2.8h
767 %insert = insertelement <8 x i32> undef, i32 16, i64 0
768 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
769 %1 = zext <8 x i16> %op1 to <8 x i32>
770 %2 = zext <8 x i16> %op2 to <8 x i32>
771 %mul = mul <8 x i32> %1, %2
772 %shr = lshr <8 x i32> %mul, %splat
773 %res = trunc <8 x i32> %shr to <8 x i16>
777 define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
778 ; CHECK-LABEL: umulh_v16i16:
779 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
780 ; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
781 ; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
782 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
783 ; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
786 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
787 ; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
788 ; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
789 ; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
790 ; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
791 ; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
793 %op1 = load <16 x i16>, <16 x i16>* %a
794 %op2 = load <16 x i16>, <16 x i16>* %b
795 %insert = insertelement <16 x i32> undef, i32 16, i64 0
796 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
797 %1 = zext <16 x i16> %op1 to <16 x i32>
798 %2 = zext <16 x i16> %op2 to <16 x i32>
799 %mul = mul <16 x i32> %1, %2
800 %shr = lshr <16 x i32> %mul, %splat
801 %res = trunc <16 x i32> %shr to <16 x i16>
802 store <16 x i16> %res, <16 x i16>* %a
806 define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
807 ; CHECK-LABEL: umulh_v32i16:
808 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
809 ; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
810 ; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
811 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
812 ; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
815 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
816 ; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
817 ; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
818 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
819 ; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
820 ; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
822 %op1 = load <32 x i16>, <32 x i16>* %a
823 %op2 = load <32 x i16>, <32 x i16>* %b
824 %insert = insertelement <32 x i32> undef, i32 16, i64 0
825 %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
826 %1 = zext <32 x i16> %op1 to <32 x i32>
827 %2 = zext <32 x i16> %op2 to <32 x i32>
828 %mul = mul <32 x i32> %1, %2
829 %shr = lshr <32 x i32> %mul, %splat
830 %res = trunc <32 x i32> %shr to <32 x i16>
831 store <32 x i16> %res, <32 x i16>* %a
835 define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
836 ; CHECK-LABEL: umulh_v64i16:
837 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
838 ; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
839 ; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
840 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
843 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
844 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
845 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
846 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
847 ; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
848 ; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
850 %op1 = load <64 x i16>, <64 x i16>* %a
851 %op2 = load <64 x i16>, <64 x i16>* %b
852 %insert = insertelement <64 x i32> undef, i32 16, i64 0
853 %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
854 %1 = zext <64 x i16> %op1 to <64 x i32>
855 %2 = zext <64 x i16> %op2 to <64 x i32>
856 %mul = mul <64 x i32> %1, %2
857 %shr = lshr <64 x i32> %mul, %splat
858 %res = trunc <64 x i32> %shr to <64 x i16>
859 store <64 x i16> %res, <64 x i16>* %a
863 define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
864 ; CHECK-LABEL: umulh_v128i16:
865 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
866 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
867 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
868 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
869 ; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0]
871 %op1 = load <128 x i16>, <128 x i16>* %a
872 %op2 = load <128 x i16>, <128 x i16>* %b
873 %insert = insertelement <128 x i32> undef, i32 16, i64 0
874 %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
875 %1 = zext <128 x i16> %op1 to <128 x i32>
876 %2 = zext <128 x i16> %op2 to <128 x i32>
877 %mul = mul <128 x i32> %1, %2
878 %shr = lshr <128 x i32> %mul, %splat
879 %res = trunc <128 x i32> %shr to <128 x i16>
880 store <128 x i16> %res, <128 x i16>* %a
884 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
885 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
886 ; CHECK-LABEL: umulh_v2i32:
887 ; CHECK: ushll v0.2d, v0.2s, #0
888 ; CHECK: ushll v1.2d, v1.2s, #0
889 ; CHECK: ptrue p0.d, vl2
890 ; CHECK: mul z0.d, p0/m, z0.d, z1.d
891 ; CHECK: shrn v0.2s, v0.2d, #32
893 %insert = insertelement <2 x i64> undef, i64 32, i64 0
894 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
895 %1 = zext <2 x i32> %op1 to <2 x i64>
896 %2 = zext <2 x i32> %op2 to <2 x i64>
897 %mul = mul <2 x i64> %1, %2
898 %shr = lshr <2 x i64> %mul, %splat
899 %res = trunc <2 x i64> %shr to <2 x i32>
903 ; Don't use SVE for 128-bit vectors.
904 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
905 ; CHECK-LABEL: umulh_v4i32:
906 ; CHECK: umull2 v2.2d, v0.4s, v1.4s
907 ; CHECK: umull v0.2d, v0.2s, v1.2s
908 ; CHECK: uzp2 v0.4s, v0.4s, v2.4s
910 %insert = insertelement <4 x i64> undef, i64 32, i64 0
911 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
912 %1 = zext <4 x i32> %op1 to <4 x i64>
913 %2 = zext <4 x i32> %op2 to <4 x i64>
914 %mul = mul <4 x i64> %1, %2
915 %shr = lshr <4 x i64> %mul, %splat
916 %res = trunc <4 x i64> %shr to <4 x i32>
920 define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
921 ; CHECK-LABEL: umulh_v8i32:
922 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
923 ; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
924 ; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
925 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
926 ; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
929 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
930 ; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
931 ; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
932 ; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
933 ; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
934 ; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
936 %op1 = load <8 x i32>, <8 x i32>* %a
937 %op2 = load <8 x i32>, <8 x i32>* %b
938 %insert = insertelement <8 x i64> undef, i64 32, i64 0
939 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
940 %1 = zext <8 x i32> %op1 to <8 x i64>
941 %2 = zext <8 x i32> %op2 to <8 x i64>
942 %mul = mul <8 x i64> %1, %2
943 %shr = lshr <8 x i64> %mul, %splat
944 %res = trunc <8 x i64> %shr to <8 x i32>
945 store <8 x i32> %res, <8 x i32>* %a
949 define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
950 ; CHECK-LABEL: umulh_v16i32:
951 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
952 ; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
953 ; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
954 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
955 ; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
957 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
958 ; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
959 ; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
960 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
961 ; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
962 ; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
964 %op1 = load <16 x i32>, <16 x i32>* %a
965 %op2 = load <16 x i32>, <16 x i32>* %b
966 %insert = insertelement <16 x i64> undef, i64 32, i64 0
967 %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
968 %1 = zext <16 x i32> %op1 to <16 x i64>
969 %2 = zext <16 x i32> %op2 to <16 x i64>
970 %mul = mul <16 x i64> %1, %2
971 %shr = lshr <16 x i64> %mul, %splat
972 %res = trunc <16 x i64> %shr to <16 x i32>
973 store <16 x i32> %res, <16 x i32>* %a
977 define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
978 ; CHECK-LABEL: umulh_v32i32:
979 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
980 ; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
981 ; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
982 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
983 ; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
986 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
987 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
988 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
989 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
990 ; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
991 ; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
993 %op1 = load <32 x i32>, <32 x i32>* %a
994 %op2 = load <32 x i32>, <32 x i32>* %b
995 %insert = insertelement <32 x i64> undef, i64 32, i64 0
996 %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
997 %1 = zext <32 x i32> %op1 to <32 x i64>
998 %2 = zext <32 x i32> %op2 to <32 x i64>
999 %mul = mul <32 x i64> %1, %2
1000 %shr = lshr <32 x i64> %mul, %splat
1001 %res = trunc <32 x i64> %shr to <32 x i32>
1002 store <32 x i32> %res, <32 x i32>* %a
1006 define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
1007 ; CHECK-LABEL: umulh_v64i32:
1008 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
1009 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
1010 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
1011 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
1012 ; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0]
1013 ; VBITS_GE_2048: ret
1014 %op1 = load <64 x i32>, <64 x i32>* %a
1015 %op2 = load <64 x i32>, <64 x i32>* %b
1016 %insert = insertelement <64 x i64> undef, i64 32, i64 0
1017 %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
1018 %1 = zext <64 x i32> %op1 to <64 x i64>
1019 %2 = zext <64 x i32> %op2 to <64 x i64>
1020 %mul = mul <64 x i64> %1, %2
1021 %shr = lshr <64 x i64> %mul, %splat
1022 %res = trunc <64 x i64> %shr to <64 x i32>
1023 store <64 x i32> %res, <64 x i32>* %a
1027 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
1028 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
1029 ; CHECK-LABEL: umulh_v1i64:
1030 ; CHECK: ptrue p0.d, vl1
1031 ; CHECK: umulh z0.d, p0/m, z0.d, z1.d
1033 %insert = insertelement <1 x i128> undef, i128 64, i128 0
1034 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
1035 %1 = zext <1 x i64> %op1 to <1 x i128>
1036 %2 = zext <1 x i64> %op2 to <1 x i128>
1037 %mul = mul <1 x i128> %1, %2
1038 %shr = lshr <1 x i128> %mul, %splat
1039 %res = trunc <1 x i128> %shr to <1 x i64>
1043 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
1044 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
1045 ; CHECK-LABEL: umulh_v2i64:
1046 ; CHECK: ptrue p0.d, vl2
1047 ; CHECK: umulh z0.d, p0/m, z0.d, z1.d
1049 %insert = insertelement <2 x i128> undef, i128 64, i128 0
1050 %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
1051 %1 = zext <2 x i64> %op1 to <2 x i128>
1052 %2 = zext <2 x i64> %op2 to <2 x i128>
1053 %mul = mul <2 x i128> %1, %2
1054 %shr = lshr <2 x i128> %mul, %splat
1055 %res = trunc <2 x i128> %shr to <2 x i64>
1059 define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
1060 ; CHECK-LABEL: umulh_v4i64:
1061 ; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]]
1062 ; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1063 ; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1064 ; VBITS_GE_256: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1065 ; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0]
1067 %op1 = load <4 x i64>, <4 x i64>* %a
1068 %op2 = load <4 x i64>, <4 x i64>* %b
1069 %insert = insertelement <4 x i128> undef, i128 64, i128 0
1070 %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
1071 %1 = zext <4 x i64> %op1 to <4 x i128>
1072 %2 = zext <4 x i64> %op2 to <4 x i128>
1073 %mul = mul <4 x i128> %1, %2
1074 %shr = lshr <4 x i128> %mul, %splat
1075 %res = trunc <4 x i128> %shr to <4 x i64>
1076 store <4 x i64> %res, <4 x i64>* %a
1080 define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
1081 ; CHECK-LABEL: umulh_v8i64:
1082 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
1083 ; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1084 ; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1085 ; VBITS_GE_512: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1086 ; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0]
1088 %op1 = load <8 x i64>, <8 x i64>* %a
1089 %op2 = load <8 x i64>, <8 x i64>* %b
1090 %insert = insertelement <8 x i128> undef, i128 64, i128 0
1091 %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
1092 %1 = zext <8 x i64> %op1 to <8 x i128>
1093 %2 = zext <8 x i64> %op2 to <8 x i128>
1094 %mul = mul <8 x i128> %1, %2
1095 %shr = lshr <8 x i128> %mul, %splat
1096 %res = trunc <8 x i128> %shr to <8 x i64>
1097 store <8 x i64> %res, <8 x i64>* %a
1101 define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
1102 ; CHECK-LABEL: umulh_v16i64:
1103 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
1104 ; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1105 ; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1106 ; VBITS_GE_1024: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1107 ; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0]
1108 ; VBITS_GE_1024: ret
1109 %op1 = load <16 x i64>, <16 x i64>* %a
1110 %op2 = load <16 x i64>, <16 x i64>* %b
1111 %insert = insertelement <16 x i128> undef, i128 64, i128 0
1112 %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
1113 %1 = zext <16 x i64> %op1 to <16 x i128>
1114 %2 = zext <16 x i64> %op2 to <16 x i128>
1115 %mul = mul <16 x i128> %1, %2
1116 %shr = lshr <16 x i128> %mul, %splat
1117 %res = trunc <16 x i128> %shr to <16 x i64>
1118 store <16 x i64> %res, <16 x i64>* %a
1122 define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
1123 ; CHECK-LABEL: umulh_v32i64:
1124 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
1125 ; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1126 ; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1127 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1128 ; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0]
1129 ; VBITS_GE_2048: ret
1130 %op1 = load <32 x i64>, <32 x i64>* %a
1131 %op2 = load <32 x i64>, <32 x i64>* %b
1132 %insert = insertelement <32 x i128> undef, i128 64, i128 0
1133 %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
1134 %1 = zext <32 x i64> %op1 to <32 x i128>
1135 %2 = zext <32 x i64> %op2 to <32 x i128>
1136 %mul = mul <32 x i128> %1, %2
1137 %shr = lshr <32 x i128> %mul, %splat
1138 %res = trunc <32 x i128> %shr to <32 x i64>
1139 store <32 x i64> %res, <32 x i64>* %a
1142 attributes #0 = { "target-features"="+sve" }