; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s -o - | FileCheck %s

; Extracting a legal fixed-length vector from an illegal subvector
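; Most of these cases spill the scalable source to the stack and re-load the
; fixed-length result from the byte offset implied by the extraction index.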

define <4 x i32> @extract_v4i32_nxv16i32_12(<vscale x 16 x i32> %arg) {
; CHECK-LABEL: extract_v4i32_nxv16i32_12:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: ldr q0, [sp, #48]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32> %arg, i64 12)
  ret <4 x i32> %ext
}

define <8 x i16> @extract_v8i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
; CHECK-LABEL: extract_v8i16_nxv32i16_8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: ldr q0, [sp, #16]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16> %arg, i64 8)
  ret <8 x i16> %ext
}

define <4 x i16> @extract_v4i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
; CHECK-LABEL: extract_v4i16_nxv32i16_8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: ldr d0, [sp, #32]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x i16> @llvm.vector.extract.v4i16.nxv32i16(<vscale x 32 x i16> %arg, i64 16)
  ret <4 x i16> %ext
}

; The result type gets promoted, leading to us extracting 2 elements from a nxv32i16.
; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
define <2 x i16> @extract_v2i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
; CHECK-LABEL: extract_v2i16_nxv32i16_8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-8
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: add x8, x8, #32
; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: st1h { z3.h }, p0, [sp, #7, mul vl]
; CHECK-NEXT: st1h { z2.h }, p0, [sp, #6, mul vl]
; CHECK-NEXT: st1h { z1.h }, p0, [sp, #5, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp, #4, mul vl]
; CHECK-NEXT: ld1 { v0.h }[0], [x8]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: add x8, x8, #34
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <2 x i16> @llvm.vector.extract.v2i16.nxv32i16(<vscale x 32 x i16> %arg, i64 16)
  ret <2 x i16> %ext
}
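
; The extraction index (8) may exceed the number of elements available at run
; time, so it is clamped (cnth/csel) before being scaled into the byte offset
; of the load.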
define <2 x i64> @extract_v2i64_nxv8i64_8(<vscale x 8 x i64> %arg) {
; CHECK-LABEL: extract_v2i64_nxv8i64_8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cnth x8
; CHECK-NEXT: mov w9, #8 // =0x8
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: cmp x8, #8
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: ldr q0, [x9, x8]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64> %arg, i64 8)
  ret <2 x i64> %ext
}

define <4 x float> @extract_v4f32_nxv16f32_12(<vscale x 16 x float> %arg) {
; CHECK-LABEL: extract_v4f32_nxv16f32_12:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: ldr q0, [sp, #48]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float> %arg, i64 12)
  ret <4 x float> %ext
}

define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
; CHECK-LABEL: extract_v2f32_nxv16f32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float> %arg, i64 2)
  ret <2 x float> %ext
}
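
; Extracting from the low predicate lanes needs no stack round trip: the
; predicate is expanded to a byte vector and the lanes are unpacked in place.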
define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
; CHECK-LABEL: extract_v4i1_nxv32i1_0:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
; CHECK-NEXT: umov w8, v1.b[1]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: umov w9, v1.b[2]
; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: umov w8, v1.b[3]
; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0)
  ret <4 x i1> %ext
}

; The result type gets promoted, leading to us extracting 4 elements from a nxv32i16.
; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
define <4 x i1> @extract_v4i1_nxv32i1_16(<vscale x 32 x i1> %arg) {
; CHECK-LABEL: extract_v4i1_nxv32i1_16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-8
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p2.b
; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl]
; CHECK-NEXT: st1b { z1.b }, p2, [sp]
; CHECK-NEXT: st1b { z0.b }, p2, [sp, #3, mul vl]
; CHECK-NEXT: st1b { z1.b }, p2, [sp, #2, mul vl]
; CHECK-NEXT: st1b { z0.b }, p2, [sp, #5, mul vl]
; CHECK-NEXT: st1b { z1.b }, p2, [sp, #4, mul vl]
; CHECK-NEXT: st1b { z0.b }, p2, [sp, #7, mul vl]
; CHECK-NEXT: st1b { z1.b }, p2, [sp, #6, mul vl]
; CHECK-NEXT: ld1 { v0.b }[0], [x8]
; CHECK-NEXT: addvl x8, sp, #2
; CHECK-NEXT: add x8, x8, #17
; CHECK-NEXT: ld1 { v0.b }[2], [x8]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: add x8, x8, #18
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: addvl x8, sp, #6
; CHECK-NEXT: add x8, x8, #19
; CHECK-NEXT: ld1 { v0.b }[6], [x8]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 16)
  ret <4 x i1> %ext
}
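
; The fixed-length <32 x i1> argument arrives with its upper elements in
; individual stack slots, so the extract reduces to four scalar loads.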
define <4 x i1> @extract_v4i1_v32i1_16(<32 x i1> %arg) {
; CHECK-LABEL: extract_v4i1_v32i1_16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr w8, [sp, #64]
; CHECK-NEXT: ldr w9, [sp, #72]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ldr w8, [sp, #80]
; CHECK-NEXT: mov v0.h[1], w9
; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: ldr w8, [sp, #88]
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %ext = call <4 x i1> @llvm.vector.extract.v4i1.v32i1(<32 x i1> %arg, i64 16)
  ret <4 x i1> %ext
}

; The result type gets promoted, leading to us extracting 4 elements from a nxv32i3.
; Hence we don't end up in SplitVecOp_EXTRACT_SUBVECTOR, but in SplitVecOp_EXTRACT_VECTOR_ELT instead.
define <4 x i3> @extract_v4i3_nxv32i3_16(<vscale x 32 x i3> %arg) {
; CHECK-LABEL: extract_v4i3_nxv32i3_16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-8
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: st1b { z1.b }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1b { z1.b }, p0, [sp, #5, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp, #4, mul vl]
; CHECK-NEXT: st1b { z1.b }, p0, [sp, #7, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp, #6, mul vl]
; CHECK-NEXT: ld1 { v0.b }[0], [x8]
; CHECK-NEXT: addvl x8, sp, #2
; CHECK-NEXT: add x8, x8, #17
; CHECK-NEXT: ld1 { v0.b }[2], [x8]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: add x8, x8, #18
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: addvl x8, sp, #6
; CHECK-NEXT: add x8, x8, #19
; CHECK-NEXT: ld1 { v0.b }[6], [x8]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x i3> @llvm.vector.extract.v4i3.nxv32i3(<vscale x 32 x i3> %arg, i64 16)
  ret <4 x i3> %ext
}

; Extracting an illegal fixed-length vector from an illegal subvector

define <2 x i32> @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %arg) {
; CHECK-LABEL: extract_v2i32_nxv16i32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: ldr d0, [sp, #8]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %arg, i64 2)
  ret <2 x i32> %ext
}
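
; For an extract at index 0 the low 128 bits of the result come straight from
; z0; only the upper half needs to go via the stack.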
define <4 x i64> @extract_v4i64_nxv8i64_0(<vscale x 8 x i64> %arg) {
; CHECK-LABEL: extract_v4i64_nxv8i64_0:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ldr q1, [sp, #16]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
  %ext = call <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64> %arg, i64 0)
  ret <4 x i64> %ext
}

declare <2 x i64> @llvm.vector.extract.v2i64.nxv8i64(<vscale x 8 x i64>, i64)
declare <4 x i64> @llvm.vector.extract.v4i64.nxv8i64(<vscale x 8 x i64>, i64)
declare <4 x float> @llvm.vector.extract.v4f32.nxv16f32(<vscale x 16 x float>, i64)
declare <2 x float> @llvm.vector.extract.v2f32.nxv16f32(<vscale x 16 x float>, i64)
declare <4 x i32> @llvm.vector.extract.v4i32.nxv16i32(<vscale x 16 x i32>, i64)
declare <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32>, i64)
declare <8 x i16> @llvm.vector.extract.v8i16.nxv32i16(<vscale x 32 x i16>, i64)
declare <4 x i16> @llvm.vector.extract.v4i16.nxv32i16(<vscale x 32 x i16>, i64)
declare <2 x i16> @llvm.vector.extract.v2i16.nxv32i16(<vscale x 32 x i16>, i64)
declare <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1>, i64)
declare <4 x i1> @llvm.vector.extract.v4i1.v32i1(<32 x i1>, i64)
declare <4 x i3> @llvm.vector.extract.v4i3.nxv32i3(<vscale x 32 x i3>, i64)