1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 ; Test we can code generate patterns of the form:
7 ; fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
8 ; scalable_vector = ISD::INSERT_SUBVECTOR scalable_vector, fixed_length_vector, 0
10 ; NOTE: Currently shufflevector does not support scalable vectors so it cannot
11 ; be used to model the above operations. Instead these tests rely on knowing
12 ; how fixed length operations are lowered to scalable ones, with multiple blocks
13 ; ensuring insert/extract sequences are not folded away.
15 target triple = "aarch64-unknown-linux-gnu"
17 define void @subvector_v8i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
18 ; CHECK-LABEL: subvector_v8i16:
20 ; CHECK-NEXT: ldr q0, [x0]
21 ; CHECK-NEXT: str q0, [x1]
23 %a = load <8 x i16>, ptr %in
27 store <8 x i16> %a, ptr %out
31 define void @subvector_v16i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
32 ; CHECK-LABEL: subvector_v16i16:
34 ; CHECK-NEXT: ptrue p0.h, vl16
35 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
36 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
38 %a = load <16 x i16>, ptr %in
42 store <16 x i16> %a, ptr %out
46 define void @subvector_v32i16(ptr %in, ptr %out) #0 {
47 ; VBITS_GE_256-LABEL: subvector_v32i16:
48 ; VBITS_GE_256: // %bb.0:
49 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
50 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
51 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
52 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
53 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
54 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
55 ; VBITS_GE_256-NEXT: ret
57 ; VBITS_GE_512-LABEL: subvector_v32i16:
58 ; VBITS_GE_512: // %bb.0:
59 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
60 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
61 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
62 ; VBITS_GE_512-NEXT: ret
63 %a = load <32 x i16>, ptr %in
67 store <32 x i16> %a, ptr %out
71 define void @subvector_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
72 ; CHECK-LABEL: subvector_v64i16:
74 ; CHECK-NEXT: ptrue p0.h, vl64
75 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
76 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
78 %a = load <64 x i16>, ptr %in
82 store <64 x i16> %a, ptr %out
86 define void @subvector_v8i32(ptr %in, ptr %out) vscale_range(2,0) #0 {
87 ; CHECK-LABEL: subvector_v8i32:
89 ; CHECK-NEXT: ptrue p0.s, vl8
90 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
91 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
93 %a = load <8 x i32>, ptr %in
97 store <8 x i32> %a, ptr %out
101 define void @subvector_v16i32(ptr %in, ptr %out) #0 {
102 ; VBITS_GE_256-LABEL: subvector_v16i32:
103 ; VBITS_GE_256: // %bb.0:
104 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
105 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
106 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
107 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
108 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
109 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
110 ; VBITS_GE_256-NEXT: ret
112 ; VBITS_GE_512-LABEL: subvector_v16i32:
113 ; VBITS_GE_512: // %bb.0:
114 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
115 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
116 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
117 ; VBITS_GE_512-NEXT: ret
118 %a = load <16 x i32>, ptr %in
122 store <16 x i32> %a, ptr %out
126 define void @subvector_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
127 ; CHECK-LABEL: subvector_v32i32:
129 ; CHECK-NEXT: ptrue p0.s, vl32
130 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
131 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
133 %a = load <32 x i32>, ptr %in
137 store <32 x i32> %a, ptr %out
141 define void @subvector_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
142 ; CHECK-LABEL: subvector_v64i32:
144 ; CHECK-NEXT: ptrue p0.s, vl64
145 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
146 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
148 %a = load <64 x i32>, ptr %in
152 store <64 x i32> %a, ptr %out
157 define void @subvector_v8i64(ptr %in, ptr %out) vscale_range(2,0) #0 {
158 ; CHECK-LABEL: subvector_v8i64:
160 ; CHECK-NEXT: ptrue p0.d, vl4
161 ; CHECK-NEXT: mov x8, #4 // =0x4
162 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
163 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
164 ; CHECK-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
165 ; CHECK-NEXT: st1d { z1.d }, p0, [x1]
167 %a = load <8 x i64>, ptr %in
171 store <8 x i64> %a, ptr %out
175 define void @subvector_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
176 ; CHECK-LABEL: subvector_v16i64:
178 ; CHECK-NEXT: ptrue p0.d, vl16
179 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
180 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
182 %a = load <16 x i64>, ptr %in
186 store <16 x i64> %a, ptr %out
190 define void @subvector_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
191 ; CHECK-LABEL: subvector_v32i64:
193 ; CHECK-NEXT: ptrue p0.d, vl32
194 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
195 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
197 %a = load <32 x i64>, ptr %in
201 store <32 x i64> %a, ptr %out
205 define void @subvector_v8f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
206 ; CHECK-LABEL: subvector_v8f16:
208 ; CHECK-NEXT: ldr q0, [x0]
209 ; CHECK-NEXT: str q0, [x1]
211 %a = load <8 x half>, ptr %in
215 store <8 x half> %a, ptr %out
219 define void @subvector_v16f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
220 ; CHECK-LABEL: subvector_v16f16:
222 ; CHECK-NEXT: ptrue p0.h, vl16
223 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
224 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
226 %a = load <16 x half>, ptr %in
230 store <16 x half> %a, ptr %out
234 define void @subvector_v32f16(ptr %in, ptr %out) #0 {
235 ; VBITS_GE_256-LABEL: subvector_v32f16:
236 ; VBITS_GE_256: // %bb.0:
237 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
238 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
239 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
240 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
241 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
242 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
243 ; VBITS_GE_256-NEXT: ret
245 ; VBITS_GE_512-LABEL: subvector_v32f16:
246 ; VBITS_GE_512: // %bb.0:
247 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
248 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
249 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
250 ; VBITS_GE_512-NEXT: ret
251 %a = load <32 x half>, ptr %in
255 store <32 x half> %a, ptr %out
259 define void @subvector_v64f16(ptr %in, ptr %out) vscale_range(8,0) #0 {
260 ; CHECK-LABEL: subvector_v64f16:
262 ; CHECK-NEXT: ptrue p0.h, vl64
263 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
264 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
266 %a = load <64 x half>, ptr %in
270 store <64 x half> %a, ptr %out
274 define void @subvector_v8f32(ptr %in, ptr %out) vscale_range(2,0) #0 {
275 ; CHECK-LABEL: subvector_v8f32:
277 ; CHECK-NEXT: ptrue p0.s, vl8
278 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
279 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
281 %a = load <8 x float>, ptr %in
285 store <8 x float> %a, ptr %out
289 define void @subvector_v16f32(ptr %in, ptr %out) #0 {
290 ; VBITS_GE_256-LABEL: subvector_v16f32:
291 ; VBITS_GE_256: // %bb.0:
292 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
293 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
294 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
295 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
296 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
297 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
298 ; VBITS_GE_256-NEXT: ret
300 ; VBITS_GE_512-LABEL: subvector_v16f32:
301 ; VBITS_GE_512: // %bb.0:
302 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
303 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
304 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
305 ; VBITS_GE_512-NEXT: ret
306 %a = load <16 x float>, ptr %in
310 store <16 x float> %a, ptr %out
314 define void @subvector_v32f32(ptr %in, ptr %out) vscale_range(8,0) #0 {
315 ; CHECK-LABEL: subvector_v32f32:
317 ; CHECK-NEXT: ptrue p0.s, vl32
318 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
319 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
321 %a = load <32 x float>, ptr %in
325 store <32 x float> %a, ptr %out
329 define void @subvector_v64f32(ptr %in, ptr %out) vscale_range(16,0) #0 {
330 ; CHECK-LABEL: subvector_v64f32:
332 ; CHECK-NEXT: ptrue p0.s, vl64
333 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
334 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
336 %a = load <64 x float>, ptr %in
340 store <64 x float> %a, ptr %out
343 define void @subvector_v8f64(ptr %in, ptr %out) #0 {
344 ; VBITS_GE_256-LABEL: subvector_v8f64:
345 ; VBITS_GE_256: // %bb.0:
346 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
347 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
348 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
349 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
350 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
351 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
352 ; VBITS_GE_256-NEXT: ret
354 ; VBITS_GE_512-LABEL: subvector_v8f64:
355 ; VBITS_GE_512: // %bb.0:
356 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
357 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
358 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
359 ; VBITS_GE_512-NEXT: ret
360 %a = load <8 x double>, ptr %in
364 store <8 x double> %a, ptr %out
368 define void @subvector_v16f64(ptr %in, ptr %out) vscale_range(8,0) #0 {
369 ; CHECK-LABEL: subvector_v16f64:
371 ; CHECK-NEXT: ptrue p0.d, vl16
372 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
373 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
375 %a = load <16 x double>, ptr %in
379 store <16 x double> %a, ptr %out
383 define void @subvector_v32f64(ptr %in, ptr %out) vscale_range(16,0) #0 {
384 ; CHECK-LABEL: subvector_v32f64:
386 ; CHECK-NEXT: ptrue p0.d, vl32
387 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
388 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
390 %a = load <32 x double>, ptr %in
394 store <32 x double> %a, ptr %out
398 define <8 x i1> @no_warn_dropped_scalable(ptr %in) #0 {
399 ; CHECK-LABEL: no_warn_dropped_scalable:
401 ; CHECK-NEXT: ptrue p0.s, vl8
402 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
403 ; CHECK-NEXT: cmpgt p0.s, p0/z, z0.s, #0
404 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
405 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
406 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
407 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
409 %a = load <8 x i32>, ptr %in
413 %cond = icmp sgt <8 x i32> %a, zeroinitializer
417 ; binop(insert_subvec(a), insert_subvec(b)) -> insert_subvec(binop(a,b)) like
418 ; combines remove redundant subvector operations. This test ensures it's not
419 ; performed when the input idiom is the result of operation legalisation. When
420 ; not prevented the test triggers infinite combine->legalise->combine->...
421 define void @no_subvector_binop_hang(ptr %in, ptr %out, i1 %cond) #0 {
422 ; CHECK-LABEL: no_subvector_binop_hang:
424 ; CHECK-NEXT: ptrue p0.s, vl8
425 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
426 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
427 ; CHECK-NEXT: tbz w2, #0, .LBB23_2
428 ; CHECK-NEXT: // %bb.1: // %bb.1
429 ; CHECK-NEXT: orr z0.d, z0.d, z1.d
430 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
431 ; CHECK-NEXT: .LBB23_2: // %bb.2
433 %a = load <8 x i32>, ptr %in
434 %b = load <8 x i32>, ptr %out
435 br i1 %cond, label %bb.1, label %bb.2
438 %or = or <8 x i32> %a, %b
439 store <8 x i32> %or, ptr %out
446 attributes #0 = { "target-features"="+sve" }