1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
; Bitreverse of a 64-bit vector: predicated SVE RBIT on the value in d0/z0.
define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
  ret <8 x i8> %res
}
; Bitreverse of a 128-bit vector: predicated SVE RBIT on the value in q0/z0.
define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
  ret <16 x i8> %res
}
; In-memory 256-bit operand: single predicated SVE load/RBIT/store.
define void @bitreverse_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
  store <32 x i8> %res, ptr %a
  ret void
}
; 512-bit operand: two 256-bit ops when VBITS=256, one op when VBITS>=512.
define void @bitreverse_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
  store <64 x i8> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl128.
define void @bitreverse_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
  store <128 x i8> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl256.
define void @bitreverse_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
  store <256 x i8> %res, ptr %a
  ret void
}
; Bitreverse of a 64-bit vector of i16: predicated SVE RBIT on z0.h.
define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}
; Bitreverse of a 128-bit vector of i16: predicated SVE RBIT on z0.h.
define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}
; In-memory 256-bit operand: single predicated SVE load/RBIT/store (.h).
define void @bitreverse_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bitreverse_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl64.
define void @bitreverse_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl128.
define void @bitreverse_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}
; Bitreverse of a 64-bit vector of i32: predicated SVE RBIT on z0.s.
define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}
; Bitreverse of a 128-bit vector of i32: predicated SVE RBIT on z0.s.
define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}
; In-memory 256-bit operand: single predicated SVE load/RBIT/store (.s).
define void @bitreverse_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bitreverse_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl32.
define void @bitreverse_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl64.
define void @bitreverse_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}
; Bitreverse of a single-element 64-bit vector: predicated SVE RBIT, vl1.
define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}
; Bitreverse of a 128-bit vector of i64: predicated SVE RBIT on z0.d.
define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}
; In-memory 256-bit operand: single predicated SVE load/RBIT/store (.d).
define void @bitreverse_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bitreverse_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bitreverse_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl16.
define void @bitreverse_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl32.
define void @bitreverse_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}
384 ; Don't use SVE for 64-bit vectors.
define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev16 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}
394 ; Don't use SVE for 128-bit vectors.
define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev16 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}
; In-memory 256-bit operand: single predicated SVE REVB (.h).
define void @bswap_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bswap_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    revb z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl64.
define void @bswap_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl128.
define void @bswap_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    revb z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}
472 ; Don't use SVE for 64-bit vectors.
define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}
482 ; Don't use SVE for 128-bit vectors.
define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}
; In-memory 256-bit operand: single predicated SVE REVB (.s).
define void @bswap_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bswap_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    revb z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl32.
define void @bswap_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl64.
define void @bswap_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    revb z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}
560 ; Don't use SVE for 64-bit vectors.
define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev64 v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}
570 ; Don't use SVE for 128-bit vectors.
define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev64 v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}
; In-memory 256-bit operand: single predicated SVE REVB (.d).
define void @bswap_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}
; 512-bit operand: split in two at VBITS=256, single op at VBITS>=512.
define void @bswap_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    revb z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    revb z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: bswap_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    revb z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}
; 1024-bit operand with vscale_range(8,0): single op using vl16.
define void @bswap_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}
; 2048-bit operand with vscale_range(16,0): single op using vl32.
define void @bswap_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    revb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}
648 attributes #0 = { "target-features"="+sve" }
650 declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>)
651 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
652 declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
653 declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
654 declare <128 x i8> @llvm.bitreverse.v128i8(<128 x i8>)
655 declare <256 x i8> @llvm.bitreverse.v256i8(<256 x i8>)
656 declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>)
657 declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
658 declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
659 declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>)
660 declare <64 x i16> @llvm.bitreverse.v64i16(<64 x i16>)
661 declare <128 x i16> @llvm.bitreverse.v128i16(<128 x i16>)
662 declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>)
663 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
664 declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
665 declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>)
666 declare <32 x i32> @llvm.bitreverse.v32i32(<32 x i32>)
667 declare <64 x i32> @llvm.bitreverse.v64i32(<64 x i32>)
668 declare <1 x i64> @llvm.bitreverse.v1i64(<1 x i64>)
669 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>)
670 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
671 declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>)
672 declare <16 x i64> @llvm.bitreverse.v16i64(<16 x i64>)
673 declare <32 x i64> @llvm.bitreverse.v32i64(<32 x i64>)
675 declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
676 declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
677 declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
678 declare <32 x i16> @llvm.bswap.v32i16(<32 x i16>)
679 declare <64 x i16> @llvm.bswap.v64i16(<64 x i16>)
680 declare <128 x i16> @llvm.bswap.v128i16(<128 x i16>)
681 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
682 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
683 declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
684 declare <16 x i32> @llvm.bswap.v16i32(<16 x i32>)
685 declare <32 x i32> @llvm.bswap.v32i32(<32 x i32>)
686 declare <64 x i32> @llvm.bswap.v64i32(<64 x i32>)
687 declare <1 x i64> @llvm.bswap.v1i64(<1 x i64>)
688 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
689 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
690 declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
691 declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
692 declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)