; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; UADDV
;

; Don't use SVE for 64-bit vectors.
define i8 @uaddv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uaddv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.b, z1.b, z0.b
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.b
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uaddv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uaddv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.h, z1.h, z0.h
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.h
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uaddv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uaddv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.s, z1.s, z0.s
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    // kill: def $w0 killed $w0 killed $x0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uaddv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Don't use SVE for 128-bit vectors.
define i64 @uaddv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uaddv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    add z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    uaddv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uaddv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uaddv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uaddv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @smaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @smaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @smaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    smaxv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    smaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @smaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @smaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @smaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    smaxv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    smaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @smaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @smaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smaxv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @smaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    smaxv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    smaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMAXV support. Use SVE.
define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    smaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @smaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    smaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    smaxv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smaxv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    smaxv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    smaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    smaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @sminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @sminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @sminv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    sminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    sminv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sminv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    sminv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    sminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    sminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @sminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @sminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @sminv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    sminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    sminv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sminv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    sminv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    sminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    sminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @sminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @sminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sminv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @sminv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    sminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    sminv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sminv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    sminv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    sminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    sminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMINV support. Use SVE.
define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    sminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @sminv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    sminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    sminv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sminv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    sminv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    sminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    sminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @umaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @umaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @umaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    umaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    umaxv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umaxv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    umaxv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    umaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    umaxv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @umaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @umaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @umaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    umaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    umaxv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umaxv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    umaxv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    umaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    umaxv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @umaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @umaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umaxv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @umaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    umaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    umaxv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umaxv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    umaxv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    umaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    umaxv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMAXV support. Use SVE.
define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    umaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @umaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    umaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    umaxv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umaxv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    umaxv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    umaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    umaxv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @uminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uminv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    uminv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uminv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uminv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    uminv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uminv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    uminv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uminv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uminv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uminv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uminv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uminv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uminv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uminv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uminv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uminv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMINV support. Use SVE.
define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uminv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    uminv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: uminv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uminv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uminv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)