; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
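
; The three RUN lines compile the same IR with different guaranteed minimum
; SVE vector lengths. Assertions common to every width use the CHECK prefix;
; width-dependent lowering (e.g. whether a 512-bit operand must be split) is
; keyed off the VBITS_GE_256/VBITS_GE_512 prefixes. As a rough sketch (the
; path below is a placeholder, not part of this test), the assertions can be
; regenerated after a codegen change with:
;   llvm/utils/update_llc_test_checks.py path/to/this/test.ll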

target triple = "aarch64-unknown-linux-gnu"

;
; ANDV
;

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
  ret i8 %res
}
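
; A hedged illustration of what the intrinsic above computes: bitwise AND
; folded across all lanes. An equivalent (but slower) log2 shuffle-and-AND
; expansion in plain IR, shown for reference only, would be:
;   %hi4 = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;   %lo4 = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %v4  = and <4 x i8> %lo4, %hi4
;   %hi2 = shufflevector <4 x i8> %v4, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
;   %lo2 = shufflevector <4 x i8> %v4, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
;   %v2  = and <2 x i8> %lo2, %hi2
;   %hi1 = shufflevector <2 x i8> %v2, <2 x i8> poison, <1 x i32> <i32 1>
;   %lo1 = shufflevector <2 x i8> %v2, <2 x i8> poison, <1 x i32> <i32 0>
;   %v1  = and <1 x i8> %lo1, %hi1
;   %r   = extractelement <1 x i8> %v1, i64 0
; The predicated SVE andv performs this entire fold in one instruction.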
; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @andv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @andv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
  ret i8 %res
}
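
; When the guaranteed vector length (256 bits) is half the operand size, the
; lowering above loads the two 256-bit halves, combines them with one bitwise
; AND, and reduces once. A sketch of the equivalent IR split (an illustration
; only, not something this test checks):
;   %lo   = load <32 x i8>, ptr %a
;   %tail = getelementptr i8, ptr %a, i64 32
;   %hi   = load <32 x i8>, ptr %tail
;   %both = and <32 x i8> %lo, %hi
;   %r    = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %both)
; which matches the ld1b/ld1b/and/andv sequence in the VBITS_GE_256 checks.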
define i8 @andv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @andv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @andv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @andv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @andv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @andv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @andv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @andv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @andv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @andv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
  ret i64 %res
}
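
; For the single-element case the reduction is the element itself, i.e. the
; call above folds to the equivalent of:
;   %elt = extractelement <1 x i64> %a, i64 0
; so the only codegen required is the d0-to-x0 register move.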
; Use SVE for 128-bit vectors.
define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
  ret i64 %res
}
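
; NEON has no across-lanes bitwise reductions, hence the SVE andv even for a
; 128-bit vector; the // kill comment marks NEON q0 being re-declared as part
; of the wider SVE z0. A hypothetical NEON-only expansion for this v2i64 case
; (shown for comparison, not checked by this test) would be:
;   ext v1.16b, v0.16b, v0.16b, #8   // rotate the high lane down to d1
;   and v0.8b, v0.8b, v1.8b          // lane0 & lane1 now in d0
;   fmov x0, d0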
define i64 @andv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @andv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @andv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @andv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; EORV
;

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @eorv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @eorv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @eorv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @eorv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; ORV
;

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @orv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @orv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @orv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @orv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @orv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @orv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @orv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @orv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @orv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @orv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @orv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @orv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @orv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @orv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @orv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @orv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)