1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
13 define <8 x i8> @ctlz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: ctlz_v8i8:
16 ; CHECK-NEXT: clz v0.8b, v0.8b
18 %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
22 ; Don't use SVE for 128-bit vectors.
23 define <16 x i8> @ctlz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
24 ; CHECK-LABEL: ctlz_v16i8:
26 ; CHECK-NEXT: clz v0.16b, v0.16b
28 %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
32 define void @ctlz_v32i8(ptr %a) vscale_range(2,0) #0 {
33 ; CHECK-LABEL: ctlz_v32i8:
35 ; CHECK-NEXT: ptrue p0.b, vl32
36 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
37 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
38 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
40 %op = load <32 x i8>, ptr %a
41 %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
42 store <32 x i8> %res, ptr %a
; ctlz on a 512-bit vector (<64 x i8>) loaded from and stored back to %a.
; There is no vscale_range attribute here, so the expected code depends on the
; RUN line: the 256-bit run handles the data as two vl32 halves (the second at
; byte offset 32, held in x8), while the 512-bit and 2048-bit runs use a single
; vl64 predicate.
46 define void @ctlz_v64i8(ptr %a) #0 {
47 ; VBITS_GE_256-LABEL: ctlz_v64i8:
48 ; VBITS_GE_256: // %bb.0:
49 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
50 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
51 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
52 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
53 ; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b
54 ; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b
55 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
56 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
57 ; VBITS_GE_256-NEXT: ret
59 ; VBITS_GE_512-LABEL: ctlz_v64i8:
60 ; VBITS_GE_512: // %bb.0:
61 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
62 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
63 ; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b
64 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
65 ; VBITS_GE_512-NEXT: ret
66 %op = load <64 x i8>, ptr %a
67 %res = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %op)
68 store <64 x i8> %res, ptr %a
72 define void @ctlz_v128i8(ptr %a) vscale_range(8,0) #0 {
73 ; CHECK-LABEL: ctlz_v128i8:
75 ; CHECK-NEXT: ptrue p0.b, vl128
76 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
77 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
78 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
80 %op = load <128 x i8>, ptr %a
81 %res = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %op)
82 store <128 x i8> %res, ptr %a
86 define void @ctlz_v256i8(ptr %a) vscale_range(16,0) #0 {
87 ; CHECK-LABEL: ctlz_v256i8:
89 ; CHECK-NEXT: ptrue p0.b, vl256
90 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
91 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
92 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
94 %op = load <256 x i8>, ptr %a
95 %res = call <256 x i8> @llvm.ctlz.v256i8(<256 x i8> %op)
96 store <256 x i8> %res, ptr %a
100 ; Don't use SVE for 64-bit vectors.
101 define <4 x i16> @ctlz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
102 ; CHECK-LABEL: ctlz_v4i16:
104 ; CHECK-NEXT: clz v0.4h, v0.4h
106 %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
110 ; Don't use SVE for 128-bit vectors.
111 define <8 x i16> @ctlz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
112 ; CHECK-LABEL: ctlz_v8i16:
114 ; CHECK-NEXT: clz v0.8h, v0.8h
116 %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
120 define void @ctlz_v16i16(ptr %a) vscale_range(2,0) #0 {
121 ; CHECK-LABEL: ctlz_v16i16:
123 ; CHECK-NEXT: ptrue p0.h, vl16
124 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
125 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
126 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
128 %op = load <16 x i16>, ptr %a
129 %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
130 store <16 x i16> %res, ptr %a
134 define void @ctlz_v32i16(ptr %a) #0 {
135 ; VBITS_GE_256-LABEL: ctlz_v32i16:
136 ; VBITS_GE_256: // %bb.0:
137 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
138 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
139 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
140 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
141 ; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h
142 ; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h
143 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
144 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
145 ; VBITS_GE_256-NEXT: ret
147 ; VBITS_GE_512-LABEL: ctlz_v32i16:
148 ; VBITS_GE_512: // %bb.0:
149 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
150 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
151 ; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h
152 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
153 ; VBITS_GE_512-NEXT: ret
154 %op = load <32 x i16>, ptr %a
155 %res = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %op)
156 store <32 x i16> %res, ptr %a
160 define void @ctlz_v64i16(ptr %a) vscale_range(8,0) #0 {
161 ; CHECK-LABEL: ctlz_v64i16:
163 ; CHECK-NEXT: ptrue p0.h, vl64
164 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
165 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
166 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
168 %op = load <64 x i16>, ptr %a
169 %res = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %op)
170 store <64 x i16> %res, ptr %a
174 define void @ctlz_v128i16(ptr %a) vscale_range(16,0) #0 {
175 ; CHECK-LABEL: ctlz_v128i16:
177 ; CHECK-NEXT: ptrue p0.h, vl128
178 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
179 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
180 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
182 %op = load <128 x i16>, ptr %a
183 %res = call <128 x i16> @llvm.ctlz.v128i16(<128 x i16> %op)
184 store <128 x i16> %res, ptr %a
188 ; Don't use SVE for 64-bit vectors.
189 define <2 x i32> @ctlz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
190 ; CHECK-LABEL: ctlz_v2i32:
192 ; CHECK-NEXT: clz v0.2s, v0.2s
194 %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
198 ; Don't use SVE for 128-bit vectors.
199 define <4 x i32> @ctlz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
200 ; CHECK-LABEL: ctlz_v4i32:
202 ; CHECK-NEXT: clz v0.4s, v0.4s
204 %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
208 define void @ctlz_v8i32(ptr %a) vscale_range(2,0) #0 {
209 ; CHECK-LABEL: ctlz_v8i32:
211 ; CHECK-NEXT: ptrue p0.s, vl8
212 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
213 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
214 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
216 %op = load <8 x i32>, ptr %a
217 %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
218 store <8 x i32> %res, ptr %a
222 define void @ctlz_v16i32(ptr %a) #0 {
223 ; VBITS_GE_256-LABEL: ctlz_v16i32:
224 ; VBITS_GE_256: // %bb.0:
225 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
226 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
227 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
228 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
229 ; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s
230 ; VBITS_GE_256-NEXT: clz z1.s, p0/m, z1.s
231 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
232 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
233 ; VBITS_GE_256-NEXT: ret
235 ; VBITS_GE_512-LABEL: ctlz_v16i32:
236 ; VBITS_GE_512: // %bb.0:
237 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
238 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
239 ; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s
240 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
241 ; VBITS_GE_512-NEXT: ret
242 %op = load <16 x i32>, ptr %a
243 %res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %op)
244 store <16 x i32> %res, ptr %a
248 define void @ctlz_v32i32(ptr %a) vscale_range(8,0) #0 {
249 ; CHECK-LABEL: ctlz_v32i32:
251 ; CHECK-NEXT: ptrue p0.s, vl32
252 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
253 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
254 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
256 %op = load <32 x i32>, ptr %a
257 %res = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %op)
258 store <32 x i32> %res, ptr %a
262 define void @ctlz_v64i32(ptr %a) vscale_range(16,0) #0 {
263 ; CHECK-LABEL: ctlz_v64i32:
265 ; CHECK-NEXT: ptrue p0.s, vl64
266 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
267 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
268 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
270 %op = load <64 x i32>, ptr %a
271 %res = call <64 x i32> @llvm.ctlz.v64i32(<64 x i32> %op)
272 store <64 x i32> %res, ptr %a
; Unlike the 8/16/32-bit element cases, the expected code for this 64-bit
; vector is the predicated SVE clz (vl1 predicate) operating on z0, not a NEON
; clz; the kill comments mark the d0 <-> z0 register views around it.
; NOTE(review): presumably because NEON CLZ has no 64-bit element form - confirm.
276 define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
277 ; CHECK-LABEL: ctlz_v1i64:
279 ; CHECK-NEXT: ptrue p0.d, vl1
280 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
281 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
282 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
284 %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
; Unlike the 8/16/32-bit element cases, the expected code for this 128-bit
; vector is the predicated SVE clz (vl2 predicate) operating on z0, not a NEON
; clz; the kill comments mark the q0 <-> z0 register views around it.
; NOTE(review): presumably because NEON CLZ has no 64-bit element form - confirm.
288 define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
289 ; CHECK-LABEL: ctlz_v2i64:
291 ; CHECK-NEXT: ptrue p0.d, vl2
292 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
293 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
294 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
296 %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
300 define void @ctlz_v4i64(ptr %a) vscale_range(2,0) #0 {
301 ; CHECK-LABEL: ctlz_v4i64:
303 ; CHECK-NEXT: ptrue p0.d, vl4
304 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
305 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
306 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
308 %op = load <4 x i64>, ptr %a
309 %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
310 store <4 x i64> %res, ptr %a
314 define void @ctlz_v8i64(ptr %a) #0 {
315 ; VBITS_GE_256-LABEL: ctlz_v8i64:
316 ; VBITS_GE_256: // %bb.0:
317 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
318 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
319 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
320 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
321 ; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d
322 ; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d
323 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
324 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
325 ; VBITS_GE_256-NEXT: ret
327 ; VBITS_GE_512-LABEL: ctlz_v8i64:
328 ; VBITS_GE_512: // %bb.0:
329 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
330 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
331 ; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d
332 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
333 ; VBITS_GE_512-NEXT: ret
334 %op = load <8 x i64>, ptr %a
335 %res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %op)
336 store <8 x i64> %res, ptr %a
340 define void @ctlz_v16i64(ptr %a) vscale_range(8,0) #0 {
341 ; CHECK-LABEL: ctlz_v16i64:
343 ; CHECK-NEXT: ptrue p0.d, vl16
344 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
345 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
346 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
348 %op = load <16 x i64>, ptr %a
349 %res = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> %op)
350 store <16 x i64> %res, ptr %a
354 define void @ctlz_v32i64(ptr %a) vscale_range(16,0) #0 {
355 ; CHECK-LABEL: ctlz_v32i64:
357 ; CHECK-NEXT: ptrue p0.d, vl32
358 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
359 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
360 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
362 %op = load <32 x i64>, ptr %a
363 %res = call <32 x i64> @llvm.ctlz.v32i64(<32 x i64> %op)
364 store <32 x i64> %res, ptr %a
372 ; Don't use SVE for 64-bit vectors.
373 define <8 x i8> @ctpop_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
374 ; CHECK-LABEL: ctpop_v8i8:
376 ; CHECK-NEXT: cnt v0.8b, v0.8b
378 %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
382 ; Don't use SVE for 128-bit vectors.
383 define <16 x i8> @ctpop_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
384 ; CHECK-LABEL: ctpop_v16i8:
386 ; CHECK-NEXT: cnt v0.16b, v0.16b
388 %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
392 define void @ctpop_v32i8(ptr %a) vscale_range(2,0) #0 {
393 ; CHECK-LABEL: ctpop_v32i8:
395 ; CHECK-NEXT: ptrue p0.b, vl32
396 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
397 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b
398 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
400 %op = load <32 x i8>, ptr %a
401 %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
402 store <32 x i8> %res, ptr %a
406 define void @ctpop_v64i8(ptr %a) #0 {
407 ; VBITS_GE_256-LABEL: ctpop_v64i8:
408 ; VBITS_GE_256: // %bb.0:
409 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
410 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
411 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
412 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
413 ; VBITS_GE_256-NEXT: cnt z0.b, p0/m, z0.b
414 ; VBITS_GE_256-NEXT: cnt z1.b, p0/m, z1.b
415 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
416 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
417 ; VBITS_GE_256-NEXT: ret
419 ; VBITS_GE_512-LABEL: ctpop_v64i8:
420 ; VBITS_GE_512: // %bb.0:
421 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
422 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
423 ; VBITS_GE_512-NEXT: cnt z0.b, p0/m, z0.b
424 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
425 ; VBITS_GE_512-NEXT: ret
426 %op = load <64 x i8>, ptr %a
427 %res = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %op)
428 store <64 x i8> %res, ptr %a
432 define void @ctpop_v128i8(ptr %a) vscale_range(8,0) #0 {
433 ; CHECK-LABEL: ctpop_v128i8:
435 ; CHECK-NEXT: ptrue p0.b, vl128
436 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
437 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b
438 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
440 %op = load <128 x i8>, ptr %a
441 %res = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %op)
442 store <128 x i8> %res, ptr %a
446 define void @ctpop_v256i8(ptr %a) vscale_range(16,0) #0 {
447 ; CHECK-LABEL: ctpop_v256i8:
449 ; CHECK-NEXT: ptrue p0.b, vl256
450 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
451 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b
452 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
454 %op = load <256 x i8>, ptr %a
455 %res = call <256 x i8> @llvm.ctpop.v256i8(<256 x i8> %op)
456 store <256 x i8> %res, ptr %a
460 ; Don't use SVE for 64-bit vectors.
461 define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
462 ; CHECK-LABEL: ctpop_v4i16:
464 ; CHECK-NEXT: cnt v0.8b, v0.8b
465 ; CHECK-NEXT: uaddlp v0.4h, v0.8b
467 %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
471 ; Don't use SVE for 128-bit vectors.
472 define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
473 ; CHECK-LABEL: ctpop_v8i16:
475 ; CHECK-NEXT: cnt v0.16b, v0.16b
476 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
478 %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
482 define void @ctpop_v16i16(ptr %a) vscale_range(2,0) #0 {
483 ; CHECK-LABEL: ctpop_v16i16:
485 ; CHECK-NEXT: ptrue p0.h, vl16
486 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
487 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h
488 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
490 %op = load <16 x i16>, ptr %a
491 %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
492 store <16 x i16> %res, ptr %a
496 define void @ctpop_v32i16(ptr %a) #0 {
497 ; VBITS_GE_256-LABEL: ctpop_v32i16:
498 ; VBITS_GE_256: // %bb.0:
499 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
500 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
501 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
502 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
503 ; VBITS_GE_256-NEXT: cnt z0.h, p0/m, z0.h
504 ; VBITS_GE_256-NEXT: cnt z1.h, p0/m, z1.h
505 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
506 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
507 ; VBITS_GE_256-NEXT: ret
509 ; VBITS_GE_512-LABEL: ctpop_v32i16:
510 ; VBITS_GE_512: // %bb.0:
511 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
512 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
513 ; VBITS_GE_512-NEXT: cnt z0.h, p0/m, z0.h
514 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
515 ; VBITS_GE_512-NEXT: ret
516 %op = load <32 x i16>, ptr %a
517 %res = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %op)
518 store <32 x i16> %res, ptr %a
522 define void @ctpop_v64i16(ptr %a) vscale_range(8,0) #0 {
523 ; CHECK-LABEL: ctpop_v64i16:
525 ; CHECK-NEXT: ptrue p0.h, vl64
526 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
527 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h
528 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
530 %op = load <64 x i16>, ptr %a
531 %res = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %op)
532 store <64 x i16> %res, ptr %a
536 define void @ctpop_v128i16(ptr %a) vscale_range(16,0) #0 {
537 ; CHECK-LABEL: ctpop_v128i16:
539 ; CHECK-NEXT: ptrue p0.h, vl128
540 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
541 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h
542 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
544 %op = load <128 x i16>, ptr %a
545 %res = call <128 x i16> @llvm.ctpop.v128i16(<128 x i16> %op)
546 store <128 x i16> %res, ptr %a
550 ; Don't use SVE for 64-bit vectors.
551 define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
552 ; CHECK-LABEL: ctpop_v2i32:
554 ; CHECK-NEXT: cnt v0.8b, v0.8b
555 ; CHECK-NEXT: uaddlp v0.4h, v0.8b
556 ; CHECK-NEXT: uaddlp v0.2s, v0.4h
558 %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
562 ; Don't use SVE for 128-bit vectors.
563 define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
564 ; CHECK-LABEL: ctpop_v4i32:
566 ; CHECK-NEXT: cnt v0.16b, v0.16b
567 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
568 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
570 %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
574 define void @ctpop_v8i32(ptr %a) vscale_range(2,0) #0 {
575 ; CHECK-LABEL: ctpop_v8i32:
577 ; CHECK-NEXT: ptrue p0.s, vl8
578 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
579 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s
580 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
582 %op = load <8 x i32>, ptr %a
583 %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
584 store <8 x i32> %res, ptr %a
588 define void @ctpop_v16i32(ptr %a) #0 {
589 ; VBITS_GE_256-LABEL: ctpop_v16i32:
590 ; VBITS_GE_256: // %bb.0:
591 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
592 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
593 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
594 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
595 ; VBITS_GE_256-NEXT: cnt z0.s, p0/m, z0.s
596 ; VBITS_GE_256-NEXT: cnt z1.s, p0/m, z1.s
597 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
598 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
599 ; VBITS_GE_256-NEXT: ret
601 ; VBITS_GE_512-LABEL: ctpop_v16i32:
602 ; VBITS_GE_512: // %bb.0:
603 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
604 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
605 ; VBITS_GE_512-NEXT: cnt z0.s, p0/m, z0.s
606 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
607 ; VBITS_GE_512-NEXT: ret
608 %op = load <16 x i32>, ptr %a
609 %res = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %op)
610 store <16 x i32> %res, ptr %a
614 define void @ctpop_v32i32(ptr %a) vscale_range(8,0) #0 {
615 ; CHECK-LABEL: ctpop_v32i32:
617 ; CHECK-NEXT: ptrue p0.s, vl32
618 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
619 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s
620 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
622 %op = load <32 x i32>, ptr %a
623 %res = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %op)
624 store <32 x i32> %res, ptr %a
628 define void @ctpop_v64i32(ptr %a) vscale_range(16,0) #0 {
629 ; CHECK-LABEL: ctpop_v64i32:
631 ; CHECK-NEXT: ptrue p0.s, vl64
632 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
633 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s
634 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
636 %op = load <64 x i32>, ptr %a
637 %res = call <64 x i32> @llvm.ctpop.v64i32(<64 x i32> %op)
638 store <64 x i32> %res, ptr %a
642 ; Don't use SVE for 64-bit vectors.
643 define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
644 ; CHECK-LABEL: ctpop_v1i64:
646 ; CHECK-NEXT: cnt v0.8b, v0.8b
647 ; CHECK-NEXT: uaddlp v0.4h, v0.8b
648 ; CHECK-NEXT: uaddlp v0.2s, v0.4h
649 ; CHECK-NEXT: uaddlp v0.1d, v0.2s
651 %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
655 ; Don't use SVE for 128-bit vectors.
656 define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
657 ; CHECK-LABEL: ctpop_v2i64:
659 ; CHECK-NEXT: cnt v0.16b, v0.16b
660 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
661 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
662 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
664 %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
668 define void @ctpop_v4i64(ptr %a) vscale_range(2,0) #0 {
669 ; CHECK-LABEL: ctpop_v4i64:
671 ; CHECK-NEXT: ptrue p0.d, vl4
672 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
673 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d
674 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
676 %op = load <4 x i64>, ptr %a
677 %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
678 store <4 x i64> %res, ptr %a
682 define void @ctpop_v8i64(ptr %a) #0 {
683 ; VBITS_GE_256-LABEL: ctpop_v8i64:
684 ; VBITS_GE_256: // %bb.0:
685 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
686 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
687 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
688 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
689 ; VBITS_GE_256-NEXT: cnt z0.d, p0/m, z0.d
690 ; VBITS_GE_256-NEXT: cnt z1.d, p0/m, z1.d
691 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
692 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
693 ; VBITS_GE_256-NEXT: ret
695 ; VBITS_GE_512-LABEL: ctpop_v8i64:
696 ; VBITS_GE_512: // %bb.0:
697 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
698 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
699 ; VBITS_GE_512-NEXT: cnt z0.d, p0/m, z0.d
700 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
701 ; VBITS_GE_512-NEXT: ret
702 %op = load <8 x i64>, ptr %a
703 %res = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %op)
704 store <8 x i64> %res, ptr %a
708 define void @ctpop_v16i64(ptr %a) vscale_range(8,0) #0 {
709 ; CHECK-LABEL: ctpop_v16i64:
711 ; CHECK-NEXT: ptrue p0.d, vl16
712 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
713 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d
714 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
716 %op = load <16 x i64>, ptr %a
717 %res = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> %op)
718 store <16 x i64> %res, ptr %a
722 define void @ctpop_v32i64(ptr %a) vscale_range(16,0) #0 {
723 ; CHECK-LABEL: ctpop_v32i64:
725 ; CHECK-NEXT: ptrue p0.d, vl32
726 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
727 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d
728 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
730 %op = load <32 x i64>, ptr %a
731 %res = call <32 x i64> @llvm.ctpop.v32i64(<32 x i64> %op)
732 store <32 x i64> %res, ptr %a
737 ; Count trailing zeros
; cttz on a 64-bit vector is expected to become a bit reverse followed by a
; leading-zero count: the rbit is the predicated SVE form (vl8 predicate, z0),
; while the clz is the NEON form on v0.8b.
740 define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
741 ; CHECK-LABEL: cttz_v8i8:
743 ; CHECK-NEXT: ptrue p0.b, vl8
744 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
745 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
746 ; CHECK-NEXT: clz v0.8b, v0.8b
748 %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
; cttz on a 128-bit vector is expected to become a bit reverse followed by a
; leading-zero count: the rbit is the predicated SVE form (vl16 predicate, z0),
; while the clz is the NEON form on v0.16b.
752 define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
753 ; CHECK-LABEL: cttz_v16i8:
755 ; CHECK-NEXT: ptrue p0.b, vl16
756 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
757 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
758 ; CHECK-NEXT: clz v0.16b, v0.16b
760 %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
764 define void @cttz_v32i8(ptr %a) vscale_range(2,0) #0 {
765 ; CHECK-LABEL: cttz_v32i8:
767 ; CHECK-NEXT: ptrue p0.b, vl32
768 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
769 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
770 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
771 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
773 %op = load <32 x i8>, ptr %a
774 %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
775 store <32 x i8> %res, ptr %a
779 define void @cttz_v64i8(ptr %a) #0 {
780 ; VBITS_GE_256-LABEL: cttz_v64i8:
781 ; VBITS_GE_256: // %bb.0:
782 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
783 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
784 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
785 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
786 ; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b
787 ; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b
788 ; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b
789 ; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b
790 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
791 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
792 ; VBITS_GE_256-NEXT: ret
794 ; VBITS_GE_512-LABEL: cttz_v64i8:
795 ; VBITS_GE_512: // %bb.0:
796 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
797 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
798 ; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b
799 ; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b
800 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
801 ; VBITS_GE_512-NEXT: ret
802 %op = load <64 x i8>, ptr %a
803 %res = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %op)
804 store <64 x i8> %res, ptr %a
808 define void @cttz_v128i8(ptr %a) vscale_range(8,0) #0 {
809 ; CHECK-LABEL: cttz_v128i8:
811 ; CHECK-NEXT: ptrue p0.b, vl128
812 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
813 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
814 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
815 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
817 %op = load <128 x i8>, ptr %a
818 %res = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %op)
819 store <128 x i8> %res, ptr %a
823 define void @cttz_v256i8(ptr %a) vscale_range(16,0) #0 {
824 ; CHECK-LABEL: cttz_v256i8:
826 ; CHECK-NEXT: ptrue p0.b, vl256
827 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
828 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
829 ; CHECK-NEXT: clz z0.b, p0/m, z0.b
830 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
832 %op = load <256 x i8>, ptr %a
833 %res = call <256 x i8> @llvm.cttz.v256i8(<256 x i8> %op)
834 store <256 x i8> %res, ptr %a
838 define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
839 ; CHECK-LABEL: cttz_v4i16:
841 ; CHECK-NEXT: ptrue p0.h, vl4
842 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
843 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
844 ; CHECK-NEXT: clz v0.4h, v0.4h
846 %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
850 define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
851 ; CHECK-LABEL: cttz_v8i16:
853 ; CHECK-NEXT: ptrue p0.h, vl8
854 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
855 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
856 ; CHECK-NEXT: clz v0.8h, v0.8h
858 %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
862 define void @cttz_v16i16(ptr %a) vscale_range(2,0) #0 {
863 ; CHECK-LABEL: cttz_v16i16:
865 ; CHECK-NEXT: ptrue p0.h, vl16
866 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
867 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
868 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
869 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
871 %op = load <16 x i16>, ptr %a
872 %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
873 store <16 x i16> %res, ptr %a
877 define void @cttz_v32i16(ptr %a) #0 {
878 ; VBITS_GE_256-LABEL: cttz_v32i16:
879 ; VBITS_GE_256: // %bb.0:
880 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
881 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
882 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
883 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
884 ; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h
885 ; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h
886 ; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h
887 ; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h
888 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
889 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
890 ; VBITS_GE_256-NEXT: ret
892 ; VBITS_GE_512-LABEL: cttz_v32i16:
893 ; VBITS_GE_512: // %bb.0:
894 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
895 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
896 ; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h
897 ; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h
898 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
899 ; VBITS_GE_512-NEXT: ret
900 %op = load <32 x i16>, ptr %a
901 %res = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %op)
902 store <32 x i16> %res, ptr %a
906 define void @cttz_v64i16(ptr %a) vscale_range(8,0) #0 {
907 ; CHECK-LABEL: cttz_v64i16:
909 ; CHECK-NEXT: ptrue p0.h, vl64
910 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
911 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
912 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
913 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
915 %op = load <64 x i16>, ptr %a
916 %res = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %op)
917 store <64 x i16> %res, ptr %a
921 define void @cttz_v128i16(ptr %a) vscale_range(16,0) #0 {
922 ; CHECK-LABEL: cttz_v128i16:
924 ; CHECK-NEXT: ptrue p0.h, vl128
925 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
926 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
927 ; CHECK-NEXT: clz z0.h, p0/m, z0.h
928 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
930 %op = load <128 x i16>, ptr %a
931 %res = call <128 x i16> @llvm.cttz.v128i16(<128 x i16> %op)
932 store <128 x i16> %res, ptr %a
936 ; 64-bit vectors: only the bit reverse uses SVE (rbit); the clz stays on NEON.
937 define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
938 ; CHECK-LABEL: cttz_v2i32:
940 ; CHECK-NEXT: ptrue p0.s, vl2
941 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
942 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
943 ; CHECK-NEXT: clz v0.2s, v0.2s
945 %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
949 ; 128-bit vectors: only the bit reverse uses SVE (rbit); the clz stays on NEON.
950 define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
951 ; CHECK-LABEL: cttz_v4i32:
953 ; CHECK-NEXT: ptrue p0.s, vl4
954 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
955 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
956 ; CHECK-NEXT: clz v0.4s, v0.4s
958 %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
962 define void @cttz_v8i32(ptr %a) vscale_range(2,0) #0 {
963 ; CHECK-LABEL: cttz_v8i32:
965 ; CHECK-NEXT: ptrue p0.s, vl8
966 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
967 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
968 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
969 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
971 %op = load <8 x i32>, ptr %a
972 %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
973 store <8 x i32> %res, ptr %a
977 define void @cttz_v16i32(ptr %a) #0 {
978 ; VBITS_GE_256-LABEL: cttz_v16i32:
979 ; VBITS_GE_256: // %bb.0:
980 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
981 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
982 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
983 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
984 ; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
985 ; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
986 ; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s
987 ; VBITS_GE_256-NEXT: clz z1.s, p0/m, z1.s
988 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
989 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
990 ; VBITS_GE_256-NEXT: ret
992 ; VBITS_GE_512-LABEL: cttz_v16i32:
993 ; VBITS_GE_512: // %bb.0:
994 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
995 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
996 ; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
997 ; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s
998 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
999 ; VBITS_GE_512-NEXT: ret
1000 %op = load <16 x i32>, ptr %a
1001 %res = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %op)
1002 store <16 x i32> %res, ptr %a
1006 define void @cttz_v32i32(ptr %a) vscale_range(8,0) #0 {
1007 ; CHECK-LABEL: cttz_v32i32:
1009 ; CHECK-NEXT: ptrue p0.s, vl32
1010 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1011 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
1012 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
1013 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1015 %op = load <32 x i32>, ptr %a
1016 %res = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %op)
1017 store <32 x i32> %res, ptr %a
1021 define void @cttz_v64i32(ptr %a) vscale_range(16,0) #0 {
1022 ; CHECK-LABEL: cttz_v64i32:
1024 ; CHECK-NEXT: ptrue p0.s, vl64
1025 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1026 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
1027 ; CHECK-NEXT: clz z0.s, p0/m, z0.s
1028 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1030 %op = load <64 x i32>, ptr %a
1031 %res = call <64 x i32> @llvm.cttz.v64i32(<64 x i32> %op)
1032 store <64 x i32> %res, ptr %a
; cttz on a 64-bit vector with a 64-bit element: both steps of the expected
; rbit + clz sequence are the predicated SVE forms (vl1 predicate) on z0, in
; contrast to the narrower element types where the clz is a NEON instruction.
1036 define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
1037 ; CHECK-LABEL: cttz_v1i64:
1039 ; CHECK-NEXT: ptrue p0.d, vl1
1040 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1041 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
1042 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
1043 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1045 %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
; cttz on a 128-bit vector with 64-bit elements: both steps of the expected
; rbit + clz sequence are the predicated SVE forms (vl2 predicate) on z0, in
; contrast to the narrower element types where the clz is a NEON instruction.
1049 define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
1050 ; CHECK-LABEL: cttz_v2i64:
1052 ; CHECK-NEXT: ptrue p0.d, vl2
1053 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1054 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
1055 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
1056 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1058 %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
1062 define void @cttz_v4i64(ptr %a) vscale_range(2,0) #0 {
1063 ; CHECK-LABEL: cttz_v4i64:
1065 ; CHECK-NEXT: ptrue p0.d, vl4
1066 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1067 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
1068 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
1069 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1071 %op = load <4 x i64>, ptr %a
1072 %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
1073 store <4 x i64> %res, ptr %a
; cttz of a <8 x i64> vector that is loaded from %a and stored back to %a.
; This function carries no vscale_range attribute, so the expected code
; depends on the -aarch64-sve-vector-bits-min value of each RUN line:
;  * VBITS_GE_256: the 512-bit operand is split into two vl4 halves; the
;    upper half is addressed via x8 = 4 elements (scaled by lsl #3), and each
;    half gets its own RBIT + CLZ.
;  * VBITS_GE_512: the operand is handled as a single vl8 vector.
1077 define void @cttz_v8i64(ptr %a) #0 {
1078 ; VBITS_GE_256-LABEL: cttz_v8i64:
1079 ; VBITS_GE_256: // %bb.0:
1080 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1081 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1082 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1083 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1084 ; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
1085 ; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
1086 ; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d
1087 ; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d
1088 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1089 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1090 ; VBITS_GE_256-NEXT: ret
1092 ; VBITS_GE_512-LABEL: cttz_v8i64:
1093 ; VBITS_GE_512: // %bb.0:
1094 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1095 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1096 ; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
1097 ; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d
1098 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1099 ; VBITS_GE_512-NEXT: ret
1100 %op = load <8 x i64>, ptr %a
1101 %res = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %op)
1102 store <8 x i64> %res, ptr %a
; cttz of a <16 x i64> vector that is loaded from %a and stored back to %a.
; The CHECK lines expect the 1024-bit operand to be processed as a single
; vector (ptrue p0.d, vl16) with a predicated RBIT followed by CLZ.
1106 define void @cttz_v16i64(ptr %a) vscale_range(8,0) #0 {
1107 ; CHECK-LABEL: cttz_v16i64:
1109 ; CHECK-NEXT: ptrue p0.d, vl16
1110 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1111 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
1112 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
1113 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1115 %op = load <16 x i64>, ptr %a
1116 %res = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> %op)
1117 store <16 x i64> %res, ptr %a
; cttz of a <32 x i64> vector that is loaded from %a and stored back to %a.
; The CHECK lines expect the 2048-bit operand to be processed as a single
; vector (ptrue p0.d, vl32) with a predicated RBIT followed by CLZ.
1121 define void @cttz_v32i64(ptr %a) vscale_range(16,0) #0 {
1122 ; CHECK-LABEL: cttz_v32i64:
1124 ; CHECK-NEXT: ptrue p0.d, vl32
1125 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1126 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
1127 ; CHECK-NEXT: clz z0.d, p0/m, z0.d
1128 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1130 %op = load <32 x i64>, ptr %a
1131 %res = call <32 x i64> @llvm.cttz.v32i64(<32 x i64> %op)
1132 store <32 x i64> %res, ptr %a
; Attribute group #0, referenced by the function definitions in this file:
; enables the SVE target feature.
1136 attributes #0 = { "target-features"="+sve" }
; llvm.ctlz (count leading zeros) declarations, one per fixed-length vector
; type from 64 bits to 2048 bits total width, for i8/i16/i32/i64 elements.
; NOTE(review): these are declared with a single vector operand; the current
; LangRef signature also takes an i1 operand -- presumably this relies on
; IR auto-upgrade of the old form, confirm before modernising.
1138 declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>)
1139 declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>)
1140 declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>)
1141 declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>)
1142 declare <128 x i8> @llvm.ctlz.v128i8(<128 x i8>)
1143 declare <256 x i8> @llvm.ctlz.v256i8(<256 x i8>)
1144 declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>)
1145 declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>)
1146 declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>)
1147 declare <32 x i16> @llvm.ctlz.v32i16(<32 x i16>)
1148 declare <64 x i16> @llvm.ctlz.v64i16(<64 x i16>)
1149 declare <128 x i16> @llvm.ctlz.v128i16(<128 x i16>)
1150 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>)
1151 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>)
1152 declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>)
1153 declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>)
1154 declare <32 x i32> @llvm.ctlz.v32i32(<32 x i32>)
1155 declare <64 x i32> @llvm.ctlz.v64i32(<64 x i32>)
1156 declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>)
1157 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>)
1158 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>)
1159 declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>)
1160 declare <16 x i64> @llvm.ctlz.v16i64(<16 x i64>)
1161 declare <32 x i64> @llvm.ctlz.v32i64(<32 x i64>)
; llvm.ctpop (population count) declarations, one per fixed-length vector
; type from 64 bits to 2048 bits total width, for i8/i16/i32/i64 elements.
1163 declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)
1164 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
1165 declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
1166 declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)
1167 declare <128 x i8> @llvm.ctpop.v128i8(<128 x i8>)
1168 declare <256 x i8> @llvm.ctpop.v256i8(<256 x i8>)
1169 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
1170 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
1171 declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
1172 declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)
1173 declare <64 x i16> @llvm.ctpop.v64i16(<64 x i16>)
1174 declare <128 x i16> @llvm.ctpop.v128i16(<128 x i16>)
1175 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
1176 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
1177 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
1178 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
1179 declare <32 x i32> @llvm.ctpop.v32i32(<32 x i32>)
1180 declare <64 x i32> @llvm.ctpop.v64i32(<64 x i32>)
1181 declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>)
1182 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
1183 declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
1184 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
1185 declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>)
1186 declare <32 x i64> @llvm.ctpop.v32i64(<32 x i64>)
; llvm.cttz (count trailing zeros) declarations, one per fixed-length vector
; type from 64 bits to 2048 bits total width, for i8/i16/i32/i64 elements.
; NOTE(review): these are declared with a single vector operand; the current
; LangRef signature also takes an i1 operand -- presumably this relies on
; IR auto-upgrade of the old form, confirm before modernising.
1188 declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>)
1189 declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>)
1190 declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>)
1191 declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>)
1192 declare <128 x i8> @llvm.cttz.v128i8(<128 x i8>)
1193 declare <256 x i8> @llvm.cttz.v256i8(<256 x i8>)
1194 declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>)
1195 declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>)
1196 declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>)
1197 declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>)
1198 declare <64 x i16> @llvm.cttz.v64i16(<64 x i16>)
1199 declare <128 x i16> @llvm.cttz.v128i16(<128 x i16>)
1200 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>)
1201 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>)
1202 declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>)
1203 declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>)
1204 declare <32 x i32> @llvm.cttz.v32i32(<32 x i32>)
1205 declare <64 x i32> @llvm.cttz.v64i32(<64 x i32>)
1206 declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>)
1207 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>)
1208 declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>)
1209 declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>)
1210 declare <16 x i64> @llvm.cttz.v16i64(<16 x i64>)
1211 declare <32 x i64> @llvm.cttz.v32i64(<32 x i64>)