1 ; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
4 ; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
5 ; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
6 ; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
7 ; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
8 ; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
9 ; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
10 ; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
11 ; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
12 ; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
13 ; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
14 ; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
15 ; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
16 ; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
18 target triple = "aarch64-unknown-linux-gnu"
20 ; Don't use SVE when its registers are no bigger than NEON.
27 ; No single instruction NEON ANDV support. Use SVE.
28 define i8 @andv_v8i8(<8 x i8> %a) #0 {
29 ; CHECK-LABEL: andv_v8i8:
30 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
31 ; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
32 ; CHECK: fmov w0, s[[REDUCE]]
34 %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
38 ; No single instruction NEON ANDV support. Use SVE.
39 define i8 @andv_v16i8(<16 x i8> %a) #0 {
40 ; CHECK-LABEL: andv_v16i8:
41 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
42 ; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
43 ; CHECK: fmov w0, s[[REDUCE]]
45 %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
49 define i8 @andv_v32i8(<32 x i8>* %a) #0 {
50 ; CHECK-LABEL: andv_v32i8:
51 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
52 ; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
53 ; CHECK-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
54 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
56 %op = load <32 x i8>, <32 x i8>* %a
57 %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
61 define i8 @andv_v64i8(<64 x i8>* %a) #0 {
62 ; CHECK-LABEL: andv_v64i8:
63 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
64 ; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
65 ; VBITS_GE_512-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
66 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
67 ; VBITS_GE_512-NEXT: ret
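69 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <64 x i8> operand to be loaded as two halves, combined with one vector AND, and reduced by a single ANDV.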
70 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
71 ; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
72 ; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
73 ; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
74 ; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
75 ; VBITS_EQ_256-DAG: andv b[[REDUCE:[0-9]+]], [[PG]], [[AND]].b
76 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
77 ; VBITS_EQ_256-NEXT: ret
79 %op = load <64 x i8>, <64 x i8>* %a
80 %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
84 define i8 @andv_v128i8(<128 x i8>* %a) #0 {
85 ; CHECK-LABEL: andv_v128i8:
86 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
87 ; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
88 ; VBITS_GE_1024-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
89 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
90 ; VBITS_GE_1024-NEXT: ret
91 %op = load <128 x i8>, <128 x i8>* %a
92 %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
96 define i8 @andv_v256i8(<256 x i8>* %a) #0 {
97 ; CHECK-LABEL: andv_v256i8:
98 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
99 ; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
100 ; VBITS_GE_2048-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
101 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
102 ; VBITS_GE_2048-NEXT: ret
103 %op = load <256 x i8>, <256 x i8>* %a
104 %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
108 ; No single instruction NEON ANDV support. Use SVE.
109 define i16 @andv_v4i16(<4 x i16> %a) #0 {
110 ; CHECK-LABEL: andv_v4i16:
111 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
112 ; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
113 ; CHECK: fmov w0, s[[REDUCE]]
115 %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
119 ; No single instruction NEON ANDV support. Use SVE.
120 define i16 @andv_v8i16(<8 x i16> %a) #0 {
121 ; CHECK-LABEL: andv_v8i16:
122 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
123 ; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
124 ; CHECK: fmov w0, s[[REDUCE]]
126 %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
130 define i16 @andv_v16i16(<16 x i16>* %a) #0 {
131 ; CHECK-LABEL: andv_v16i16:
132 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
133 ; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
134 ; CHECK-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
135 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
137 %op = load <16 x i16>, <16 x i16>* %a
138 %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
142 define i16 @andv_v32i16(<32 x i16>* %a) #0 {
143 ; CHECK-LABEL: andv_v32i16:
144 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
145 ; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
146 ; VBITS_GE_512-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
147 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
148 ; VBITS_GE_512-NEXT: ret
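150 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <32 x i16> operand to be loaded as two halves, combined with one vector AND, and reduced by a single ANDV.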
151 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
152 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
153 ; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
154 ; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
155 ; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
156 ; VBITS_EQ_256-DAG: andv h[[REDUCE:[0-9]+]], [[PG]], [[AND]].h
157 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
158 ; VBITS_EQ_256-NEXT: ret
159 %op = load <32 x i16>, <32 x i16>* %a
160 %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
164 define i16 @andv_v64i16(<64 x i16>* %a) #0 {
165 ; CHECK-LABEL: andv_v64i16:
166 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
167 ; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
168 ; VBITS_GE_1024-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
169 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
170 ; VBITS_GE_1024-NEXT: ret
171 %op = load <64 x i16>, <64 x i16>* %a
172 %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
176 define i16 @andv_v128i16(<128 x i16>* %a) #0 {
177 ; CHECK-LABEL: andv_v128i16:
178 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
179 ; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
180 ; VBITS_GE_2048-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
181 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
182 ; VBITS_GE_2048-NEXT: ret
183 %op = load <128 x i16>, <128 x i16>* %a
184 %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
188 ; No single instruction NEON ANDV support. Use SVE.
189 define i32 @andv_v2i32(<2 x i32> %a) #0 {
190 ; CHECK-LABEL: andv_v2i32:
191 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
192 ; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
193 ; CHECK: fmov w0, [[REDUCE]]
195 %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
199 ; No single instruction NEON ANDV support. Use SVE.
200 define i32 @andv_v4i32(<4 x i32> %a) #0 {
201 ; CHECK-LABEL: andv_v4i32:
202 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
203 ; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
204 ; CHECK: fmov w0, [[REDUCE]]
206 %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
210 define i32 @andv_v8i32(<8 x i32>* %a) #0 {
211 ; CHECK-LABEL: andv_v8i32:
212 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
213 ; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
214 ; CHECK-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
215 ; CHECK-NEXT: fmov w0, [[REDUCE]]
217 %op = load <8 x i32>, <8 x i32>* %a
218 %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
222 define i32 @andv_v16i32(<16 x i32>* %a) #0 {
223 ; CHECK-LABEL: andv_v16i32:
224 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
225 ; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
226 ; VBITS_GE_512-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
227 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
228 ; VBITS_GE_512-NEXT: ret
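230 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <16 x i32> operand to be loaded as two halves, combined with one vector AND, and reduced by a single ANDV.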
231 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
232 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
233 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
234 ; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
235 ; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
236 ; VBITS_EQ_256-DAG: andv [[REDUCE:s[0-9]+]], [[PG]], [[AND]].s
237 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
238 ; VBITS_EQ_256-NEXT: ret
239 %op = load <16 x i32>, <16 x i32>* %a
240 %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
244 define i32 @andv_v32i32(<32 x i32>* %a) #0 {
245 ; CHECK-LABEL: andv_v32i32:
246 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
247 ; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
248 ; VBITS_GE_1024-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
249 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
250 ; VBITS_GE_1024-NEXT: ret
251 %op = load <32 x i32>, <32 x i32>* %a
252 %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
256 define i32 @andv_v64i32(<64 x i32>* %a) #0 {
257 ; CHECK-LABEL: andv_v64i32:
258 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
259 ; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
260 ; VBITS_GE_2048-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
261 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
262 ; VBITS_GE_2048-NEXT: ret
263 %op = load <64 x i32>, <64 x i32>* %a
264 %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
268 ; Nothing to do for single element vectors.
269 define i64 @andv_v1i64(<1 x i64> %a) #0 {
270 ; CHECK-LABEL: andv_v1i64:
273 %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
277 ; Use SVE for 128-bit vectors.
278 define i64 @andv_v2i64(<2 x i64> %a) #0 {
279 ; CHECK-LABEL: andv_v2i64:
280 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
281 ; CHECK: andv [[REDUCE:d[0-9]+]], [[PG]], z0.d
282 ; CHECK: fmov x0, [[REDUCE]]
284 %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
288 define i64 @andv_v4i64(<4 x i64>* %a) #0 {
289 ; CHECK-LABEL: andv_v4i64:
290 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
291 ; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
292 ; CHECK-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
293 ; CHECK-NEXT: fmov x0, [[REDUCE]]
295 %op = load <4 x i64>, <4 x i64>* %a
296 %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
300 define i64 @andv_v8i64(<8 x i64>* %a) #0 {
301 ; CHECK-LABEL: andv_v8i64:
302 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
303 ; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
304 ; VBITS_GE_512-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
305 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
306 ; VBITS_GE_512-NEXT: ret
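308 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <8 x i64> operand to be loaded as two halves, combined with one vector AND, and reduced by a single ANDV.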
309 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
310 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
311 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
312 ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
313 ; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
314 ; VBITS_EQ_256-DAG: andv [[REDUCE:d[0-9]+]], [[PG]], [[AND]].d
315 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
316 ; VBITS_EQ_256-NEXT: ret
317 %op = load <8 x i64>, <8 x i64>* %a
318 %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
322 define i64 @andv_v16i64(<16 x i64>* %a) #0 {
323 ; CHECK-LABEL: andv_v16i64:
324 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
325 ; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
326 ; VBITS_GE_1024-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
327 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
328 ; VBITS_GE_1024-NEXT: ret
329 %op = load <16 x i64>, <16 x i64>* %a
330 %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
334 define i64 @andv_v32i64(<32 x i64>* %a) #0 {
335 ; CHECK-LABEL: andv_v32i64:
336 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
337 ; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
338 ; VBITS_GE_2048-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
339 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
340 ; VBITS_GE_2048-NEXT: ret
341 %op = load <32 x i64>, <32 x i64>* %a
342 %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
350 ; No single instruction NEON EORV support. Use SVE.
351 define i8 @eorv_v8i8(<8 x i8> %a) #0 {
352 ; CHECK-LABEL: eorv_v8i8:
353 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
354 ; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
355 ; CHECK: fmov w0, s[[REDUCE]]
357 %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
361 ; No single instruction NEON EORV support. Use SVE.
362 define i8 @eorv_v16i8(<16 x i8> %a) #0 {
363 ; CHECK-LABEL: eorv_v16i8:
364 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
365 ; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
366 ; CHECK: fmov w0, s[[REDUCE]]
368 %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
372 define i8 @eorv_v32i8(<32 x i8>* %a) #0 {
373 ; CHECK-LABEL: eorv_v32i8:
374 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
375 ; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
376 ; CHECK-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
377 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
379 %op = load <32 x i8>, <32 x i8>* %a
380 %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
384 define i8 @eorv_v64i8(<64 x i8>* %a) #0 {
385 ; CHECK-LABEL: eorv_v64i8:
386 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
387 ; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
388 ; VBITS_GE_512-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
389 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
390 ; VBITS_GE_512-NEXT: ret
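392 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <64 x i8> operand to be loaded as two halves, combined with one vector EOR, and reduced by a single EORV.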
393 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
394 ; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
395 ; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
396 ; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
397 ; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
398 ; VBITS_EQ_256-DAG: eorv b[[REDUCE:[0-9]+]], [[PG]], [[EOR]].b
399 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
400 ; VBITS_EQ_256-NEXT: ret
402 %op = load <64 x i8>, <64 x i8>* %a
403 %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
407 define i8 @eorv_v128i8(<128 x i8>* %a) #0 {
408 ; CHECK-LABEL: eorv_v128i8:
409 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
410 ; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
411 ; VBITS_GE_1024-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
412 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
413 ; VBITS_GE_1024-NEXT: ret
414 %op = load <128 x i8>, <128 x i8>* %a
415 %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
419 define i8 @eorv_v256i8(<256 x i8>* %a) #0 {
420 ; CHECK-LABEL: eorv_v256i8:
421 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
422 ; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
423 ; VBITS_GE_2048-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
424 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
425 ; VBITS_GE_2048-NEXT: ret
426 %op = load <256 x i8>, <256 x i8>* %a
427 %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
431 ; No single instruction NEON EORV support. Use SVE.
432 define i16 @eorv_v4i16(<4 x i16> %a) #0 {
433 ; CHECK-LABEL: eorv_v4i16:
434 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
435 ; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
436 ; CHECK: fmov w0, s[[REDUCE]]
438 %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
442 ; No single instruction NEON EORV support. Use SVE.
443 define i16 @eorv_v8i16(<8 x i16> %a) #0 {
444 ; CHECK-LABEL: eorv_v8i16:
445 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
446 ; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
447 ; CHECK: fmov w0, s[[REDUCE]]
449 %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
453 define i16 @eorv_v16i16(<16 x i16>* %a) #0 {
454 ; CHECK-LABEL: eorv_v16i16:
455 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
456 ; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
457 ; CHECK-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
458 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
460 %op = load <16 x i16>, <16 x i16>* %a
461 %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
465 define i16 @eorv_v32i16(<32 x i16>* %a) #0 {
466 ; CHECK-LABEL: eorv_v32i16:
467 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
468 ; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
469 ; VBITS_GE_512-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
470 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
471 ; VBITS_GE_512-NEXT: ret
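473 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <32 x i16> operand to be loaded as two halves, combined with one vector EOR, and reduced by a single EORV.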
474 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
475 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
476 ; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
477 ; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
478 ; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
479 ; VBITS_EQ_256-DAG: eorv h[[REDUCE:[0-9]+]], [[PG]], [[EOR]].h
480 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
481 ; VBITS_EQ_256-NEXT: ret
482 %op = load <32 x i16>, <32 x i16>* %a
483 %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
487 define i16 @eorv_v64i16(<64 x i16>* %a) #0 {
488 ; CHECK-LABEL: eorv_v64i16:
489 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
490 ; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
491 ; VBITS_GE_1024-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
492 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
493 ; VBITS_GE_1024-NEXT: ret
494 %op = load <64 x i16>, <64 x i16>* %a
495 %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
499 define i16 @eorv_v128i16(<128 x i16>* %a) #0 {
500 ; CHECK-LABEL: eorv_v128i16:
501 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
502 ; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
503 ; VBITS_GE_2048-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
504 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
505 ; VBITS_GE_2048-NEXT: ret
506 %op = load <128 x i16>, <128 x i16>* %a
507 %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
511 ; No single instruction NEON EORV support. Use SVE.
512 define i32 @eorv_v2i32(<2 x i32> %a) #0 {
513 ; CHECK-LABEL: eorv_v2i32:
514 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
515 ; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
516 ; CHECK: fmov w0, [[REDUCE]]
518 %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
522 ; No single instruction NEON EORV support. Use SVE.
523 define i32 @eorv_v4i32(<4 x i32> %a) #0 {
524 ; CHECK-LABEL: eorv_v4i32:
525 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
526 ; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
527 ; CHECK: fmov w0, [[REDUCE]]
529 %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
533 define i32 @eorv_v8i32(<8 x i32>* %a) #0 {
534 ; CHECK-LABEL: eorv_v8i32:
535 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
536 ; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
537 ; CHECK-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
538 ; CHECK-NEXT: fmov w0, [[REDUCE]]
540 %op = load <8 x i32>, <8 x i32>* %a
541 %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
545 define i32 @eorv_v16i32(<16 x i32>* %a) #0 {
546 ; CHECK-LABEL: eorv_v16i32:
547 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
548 ; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
549 ; VBITS_GE_512-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
550 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
551 ; VBITS_GE_512-NEXT: ret
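553 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <16 x i32> operand to be loaded as two halves, combined with one vector EOR, and reduced by a single EORV.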
554 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
555 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
556 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
557 ; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
558 ; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
559 ; VBITS_EQ_256-DAG: eorv [[REDUCE:s[0-9]+]], [[PG]], [[EOR]].s
560 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
561 ; VBITS_EQ_256-NEXT: ret
562 %op = load <16 x i32>, <16 x i32>* %a
563 %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
567 define i32 @eorv_v32i32(<32 x i32>* %a) #0 {
568 ; CHECK-LABEL: eorv_v32i32:
569 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
570 ; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
571 ; VBITS_GE_1024-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
572 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
573 ; VBITS_GE_1024-NEXT: ret
574 %op = load <32 x i32>, <32 x i32>* %a
575 %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
579 define i32 @eorv_v64i32(<64 x i32>* %a) #0 {
580 ; CHECK-LABEL: eorv_v64i32:
581 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
582 ; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
583 ; VBITS_GE_2048-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
584 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
585 ; VBITS_GE_2048-NEXT: ret
586 %op = load <64 x i32>, <64 x i32>* %a
587 %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
591 ; Nothing to do for single element vectors.
592 define i64 @eorv_v1i64(<1 x i64> %a) #0 {
593 ; CHECK-LABEL: eorv_v1i64:
596 %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
600 ; Use SVE for 128-bit vectors.
601 define i64 @eorv_v2i64(<2 x i64> %a) #0 {
602 ; CHECK-LABEL: eorv_v2i64:
603 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
604 ; CHECK: eorv [[REDUCE:d[0-9]+]], [[PG]], z0.d
605 ; CHECK: fmov x0, [[REDUCE]]
607 %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
611 define i64 @eorv_v4i64(<4 x i64>* %a) #0 {
612 ; CHECK-LABEL: eorv_v4i64:
613 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
614 ; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
615 ; CHECK-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
616 ; CHECK-NEXT: fmov x0, [[REDUCE]]
618 %op = load <4 x i64>, <4 x i64>* %a
619 %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
623 define i64 @eorv_v8i64(<8 x i64>* %a) #0 {
624 ; CHECK-LABEL: eorv_v8i64:
625 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
626 ; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
627 ; VBITS_GE_512-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
628 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
629 ; VBITS_GE_512-NEXT: ret
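631 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <8 x i64> operand to be loaded as two halves, combined with one vector EOR, and reduced by a single EORV.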
632 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
633 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
634 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
635 ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
636 ; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
637 ; VBITS_EQ_256-DAG: eorv [[REDUCE:d[0-9]+]], [[PG]], [[EOR]].d
638 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
639 ; VBITS_EQ_256-NEXT: ret
640 %op = load <8 x i64>, <8 x i64>* %a
641 %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
645 define i64 @eorv_v16i64(<16 x i64>* %a) #0 {
646 ; CHECK-LABEL: eorv_v16i64:
647 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
648 ; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
649 ; VBITS_GE_1024-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
650 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
651 ; VBITS_GE_1024-NEXT: ret
652 %op = load <16 x i64>, <16 x i64>* %a
653 %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
657 define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
658 ; CHECK-LABEL: eorv_v32i64:
659 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
660 ; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
661 ; VBITS_GE_2048-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
662 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
663 ; VBITS_GE_2048-NEXT: ret
664 %op = load <32 x i64>, <32 x i64>* %a
665 %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
673 ; No single instruction NEON ORV support. Use SVE.
674 define i8 @orv_v8i8(<8 x i8> %a) #0 {
675 ; CHECK-LABEL: orv_v8i8:
676 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
677 ; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
678 ; CHECK: fmov w0, s[[REDUCE]]
680 %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
684 ; No single instruction NEON ORV support. Use SVE.
685 define i8 @orv_v16i8(<16 x i8> %a) #0 {
686 ; CHECK-LABEL: orv_v16i8:
687 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
688 ; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
689 ; CHECK: fmov w0, s[[REDUCE]]
691 %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
695 define i8 @orv_v32i8(<32 x i8>* %a) #0 {
696 ; CHECK-LABEL: orv_v32i8:
697 ; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
698 ; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
699 ; CHECK-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
700 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
702 %op = load <32 x i8>, <32 x i8>* %a
703 %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
707 define i8 @orv_v64i8(<64 x i8>* %a) #0 {
708 ; CHECK-LABEL: orv_v64i8:
709 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
710 ; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
711 ; VBITS_GE_512-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
712 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
713 ; VBITS_GE_512-NEXT: ret
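715 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <64 x i8> operand to be loaded as two halves, combined with one vector ORR, and reduced by a single ORV.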
716 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
717 ; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32
718 ; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
719 ; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]]
720 ; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
721 ; VBITS_EQ_256-DAG: orv b[[REDUCE:[0-9]+]], [[PG]], [[OR]].b
722 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
723 ; VBITS_EQ_256-NEXT: ret
725 %op = load <64 x i8>, <64 x i8>* %a
726 %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
730 define i8 @orv_v128i8(<128 x i8>* %a) #0 {
731 ; CHECK-LABEL: orv_v128i8:
732 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
733 ; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
734 ; VBITS_GE_1024-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
735 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
736 ; VBITS_GE_1024-NEXT: ret
737 %op = load <128 x i8>, <128 x i8>* %a
738 %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
742 define i8 @orv_v256i8(<256 x i8>* %a) #0 {
743 ; CHECK-LABEL: orv_v256i8:
744 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
745 ; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
746 ; VBITS_GE_2048-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
747 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
748 ; VBITS_GE_2048-NEXT: ret
749 %op = load <256 x i8>, <256 x i8>* %a
750 %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
754 ; No single instruction NEON ORV support. Use SVE.
755 define i16 @orv_v4i16(<4 x i16> %a) #0 {
756 ; CHECK-LABEL: orv_v4i16:
757 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
758 ; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
759 ; CHECK: fmov w0, s[[REDUCE]]
761 %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
765 ; No single instruction NEON ORV support. Use SVE.
766 define i16 @orv_v8i16(<8 x i16> %a) #0 {
767 ; CHECK-LABEL: orv_v8i16:
768 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
769 ; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
770 ; CHECK: fmov w0, s[[REDUCE]]
772 %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
776 define i16 @orv_v16i16(<16 x i16>* %a) #0 {
777 ; CHECK-LABEL: orv_v16i16:
778 ; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
779 ; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
780 ; CHECK-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
781 ; CHECK-NEXT: fmov w0, s[[REDUCE]]
783 %op = load <16 x i16>, <16 x i16>* %a
784 %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
788 define i16 @orv_v32i16(<32 x i16>* %a) #0 {
789 ; CHECK-LABEL: orv_v32i16:
790 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
791 ; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
792 ; VBITS_GE_512-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
793 ; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
794 ; VBITS_GE_512-NEXT: ret
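796 ; Ensure sensible type legalisation: when the minimum vector length is exactly 256 bits, the checks expect the <32 x i16> operand to be loaded as two halves, combined with one vector ORR, and reduced by a single ORV.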
797 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
798 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
799 ; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
800 ; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
801 ; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
802 ; VBITS_EQ_256-DAG: orv h[[REDUCE:[0-9]+]], [[PG]], [[OR]].h
803 ; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
804 ; VBITS_EQ_256-NEXT: ret
805 %op = load <32 x i16>, <32 x i16>* %a
806 %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
810 define i16 @orv_v64i16(<64 x i16>* %a) #0 {
811 ; CHECK-LABEL: orv_v64i16:
812 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
813 ; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
814 ; VBITS_GE_1024-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
815 ; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
816 ; VBITS_GE_1024-NEXT: ret
817 %op = load <64 x i16>, <64 x i16>* %a
818 %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
822 define i16 @orv_v128i16(<128 x i16>* %a) #0 {
823 ; CHECK-LABEL: orv_v128i16:
824 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
825 ; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
826 ; VBITS_GE_2048-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
827 ; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
828 ; VBITS_GE_2048-NEXT: ret
829 %op = load <128 x i16>, <128 x i16>* %a
830 %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
834 ; No single instruction NEON ORV support. Use SVE.
835 define i32 @orv_v2i32(<2 x i32> %a) #0 {
836 ; CHECK-LABEL: orv_v2i32:
837 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
838 ; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
839 ; CHECK: fmov w0, [[REDUCE]]
841 %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
845 ; No single instruction NEON ORV support. Use SVE.
846 define i32 @orv_v4i32(<4 x i32> %a) #0 {
847 ; CHECK-LABEL: orv_v4i32:
848 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
849 ; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
850 ; CHECK: fmov w0, [[REDUCE]]
852 %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
856 define i32 @orv_v8i32(<8 x i32>* %a) #0 {
857 ; CHECK-LABEL: orv_v8i32:
858 ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
859 ; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
860 ; CHECK-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
861 ; CHECK-NEXT: fmov w0, [[REDUCE]]
863 %op = load <8 x i32>, <8 x i32>* %a
864 %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
; OR-reduce a <16 x i32> vector loaded from %a to an i32. Two expectations:
;  - VBITS_GE_512 prefix: the whole vector is handled by one vl16 ld1w + orv.
;  - VBITS_EQ_256 prefix: the vector is split into two vl8 halves (low half at
;    [x0], high half at element index 8), the halves are combined with a vector
;    orr, and a single orv reduces the combined value.
868 define i32 @orv_v16i32(<16 x i32>* %a) #0 {
869 ; CHECK-LABEL: orv_v16i32:
870 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
871 ; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
872 ; VBITS_GE_512-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
873 ; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
874 ; VBITS_GE_512-NEXT: ret
876 ; Ensure sensible type legalisation.
877 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
878 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
879 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
880 ; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
881 ; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
882 ; VBITS_EQ_256-DAG: orv [[REDUCE:s[0-9]+]], [[PG]], [[OR]].s
883 ; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
884 ; VBITS_EQ_256-NEXT: ret
885 %op = load <16 x i32>, <16 x i32>* %a
886 %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
; OR-reduce a <32 x i32> vector loaded from %a to an i32. Instruction checks
; exist only under the VBITS_GE_1024 prefix: vl32 predicate, one ld1w, one
; predicated orv, then the result is moved into w0.
890 define i32 @orv_v32i32(<32 x i32>* %a) #0 {
891 ; CHECK-LABEL: orv_v32i32:
892 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
893 ; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
894 ; VBITS_GE_1024-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
895 ; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
896 ; VBITS_GE_1024-NEXT: ret
897 %op = load <32 x i32>, <32 x i32>* %a
898 %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
; OR-reduce a <64 x i32> vector loaded from %a to an i32. Instruction checks
; exist only under the VBITS_GE_2048 prefix: vl64 predicate, one ld1w, one
; predicated orv, then the result is moved into w0.
902 define i32 @orv_v64i32(<64 x i32>* %a) #0 {
903 ; CHECK-LABEL: orv_v64i32:
904 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
905 ; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
906 ; VBITS_GE_2048-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
907 ; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
908 ; VBITS_GE_2048-NEXT: ret
909 %op = load <64 x i32>, <64 x i32>* %a
910 %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
914 ; Nothing to do for single element vectors.
; OR-reduce a <1 x i64> argument (passed by value) to an i64. Only the function
; label is matched in the lines present here; no instruction checks are visible.
915 define i64 @orv_v1i64(<1 x i64> %a) #0 {
916 ; CHECK-LABEL: orv_v1i64:
919 %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
923 ; Use SVE for 128-bit vectors
; OR-reduce a <2 x i64> argument (passed by value) to an i64. The checks expect
; a vl2 predicate and a predicated orv reading z0.d, with the result moved to x0.
924 define i64 @orv_v2i64(<2 x i64> %a) #0 {
925 ; CHECK-LABEL: orv_v2i64:
926 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
927 ; CHECK: orv [[REDUCE:d[0-9]+]], [[PG]], z0.d
928 ; CHECK: fmov x0, [[REDUCE]]
930 %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
; OR-reduce a <4 x i64> vector loaded from %a to an i64. Checked under the
; common prefix: vl4 predicate, one ld1d from [x0], one predicated orv, and the
; result moved to x0, as consecutive instructions.
934 define i64 @orv_v4i64(<4 x i64>* %a) #0 {
935 ; CHECK-LABEL: orv_v4i64:
936 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
937 ; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
938 ; CHECK-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
939 ; CHECK-NEXT: fmov x0, [[REDUCE]]
941 %op = load <4 x i64>, <4 x i64>* %a
942 %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
; OR-reduce a <8 x i64> vector loaded from %a to an i64. Two expectations:
;  - VBITS_GE_512 prefix: the whole vector is handled by one vl8 ld1d + orv.
;  - VBITS_EQ_256 prefix: the vector is split into two vl4 halves (low half at
;    [x0], high half at element index 4), the halves are combined with a vector
;    orr, and a single orv reduces the combined value.
946 define i64 @orv_v8i64(<8 x i64>* %a) #0 {
947 ; CHECK-LABEL: orv_v8i64:
948 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
949 ; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
950 ; VBITS_GE_512-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
951 ; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
952 ; VBITS_GE_512-NEXT: ret
954 ; Ensure sensible type legalisation.
955 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
956 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
957 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
958 ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
959 ; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
960 ; VBITS_EQ_256-DAG: orv [[REDUCE:d[0-9]+]], [[PG]], [[OR]].d
961 ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
962 ; VBITS_EQ_256-NEXT: ret
963 %op = load <8 x i64>, <8 x i64>* %a
964 %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
; OR-reduce a <16 x i64> vector loaded from %a to an i64. Instruction checks
; exist only under the VBITS_GE_1024 prefix: vl16 predicate, one ld1d, one
; predicated orv, then the result is moved into x0.
968 define i64 @orv_v16i64(<16 x i64>* %a) #0 {
969 ; CHECK-LABEL: orv_v16i64:
970 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
971 ; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
972 ; VBITS_GE_1024-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
973 ; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
974 ; VBITS_GE_1024-NEXT: ret
975 %op = load <16 x i64>, <16 x i64>* %a
976 %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
; OR-reduce a <32 x i64> vector loaded from %a to an i64. Instruction checks
; exist only under the VBITS_GE_2048 prefix: vl32 predicate, one ld1d, one
; predicated orv, then the result is moved into x0.
980 define i64 @orv_v32i64(<32 x i64>* %a) #0 {
981 ; CHECK-LABEL: orv_v32i64:
982 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
983 ; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
984 ; VBITS_GE_2048-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
985 ; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
986 ; VBITS_GE_2048-NEXT: ret
987 %op = load <32 x i64>, <32 x i64>* %a
988 %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
; Attribute group #0, referenced by the test functions: enables the SVE target feature.
992 attributes #0 = { "target-features"="+sve" }
; Declarations of the llvm.vector.reduce.and intrinsics, grouped by element
; type (i8, i16, i32, i64). Their callers are not in this part of the file.
994 declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
995 declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
996 declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
997 declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
998 declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
999 declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)
1001 declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
1002 declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
1003 declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
1004 declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
1005 declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
1006 declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)
1008 declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
1009 declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
1010 declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
1011 declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
1012 declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
1013 declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)
1015 declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
1016 declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
1017 declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
1018 declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
1019 declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
1020 declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)
; Declarations of the llvm.vector.reduce.or intrinsics, grouped by element
; type (i8, i16, i32, i64). The orv_* test functions call these.
1022 declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
1023 declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
1024 declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
1025 declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
1026 declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
1027 declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)
1029 declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
1030 declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
1031 declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
1032 declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
1033 declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
1034 declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)
1036 declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
1037 declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
1038 declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
1039 declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
1040 declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
1041 declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)
1043 declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
1044 declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
1045 declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
1046 declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
1047 declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
1048 declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)
; Declarations of the llvm.vector.reduce.xor intrinsics, grouped by element
; type (i8, i16, i32, i64). Their callers are not in this part of the file.
1050 declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
1051 declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
1052 declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
1053 declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
1054 declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
1055 declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)
1057 declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
1058 declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
1059 declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
1060 declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
1061 declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
1062 declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)
1064 declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
1065 declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
1066 declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
1067 declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
1068 declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
1069 declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)
1071 declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
1072 declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
1073 declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
1074 declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
1075 declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
1076 declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)