1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s
3 ; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
5 ; Test prolog sequences for stack probing when SVE objects are involved.
7 ; The space for SVE objects needs probing in the general case, because
8 ; the stack adjustment may happen to be too big (i.e. greater than the
9 ; probe size) to allocate with a single `addvl`.
10 ; When we do know that the stack adjustment cannot exceed the probe size
11 ; we can avoid emitting a probe loop and emit a simple `addvl; str`
14 define void @sve_1_vector(ptr %out) #0 {
15 ; CHECK-LABEL: sve_1_vector:
16 ; CHECK: // %bb.0: // %entry
17 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
18 ; CHECK-NEXT: .cfi_def_cfa_offset 16
19 ; CHECK-NEXT: .cfi_offset w29, -16
20 ; CHECK-NEXT: addvl sp, sp, #-1
21 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
22 ; CHECK-NEXT: addvl sp, sp, #1
23 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
24 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
25 ; CHECK-NEXT: .cfi_def_cfa_offset 0
26 ; CHECK-NEXT: .cfi_restore w29
29 %vec = alloca <vscale x 4 x float>, align 16
33 ; As above, but with 4 SVE vectors of stack space.
34 define void @sve_4_vector(ptr %out) #0 {
35 ; CHECK-LABEL: sve_4_vector:
36 ; CHECK: // %bb.0: // %entry
37 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
38 ; CHECK-NEXT: .cfi_def_cfa_offset 16
39 ; CHECK-NEXT: .cfi_offset w29, -16
40 ; CHECK-NEXT: addvl sp, sp, #-4
41 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
42 ; CHECK-NEXT: addvl sp, sp, #4
43 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
44 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
45 ; CHECK-NEXT: .cfi_def_cfa_offset 0
46 ; CHECK-NEXT: .cfi_restore w29
49 %vec1 = alloca <vscale x 4 x float>, align 16
50 %vec2 = alloca <vscale x 4 x float>, align 16
51 %vec3 = alloca <vscale x 4 x float>, align 16
52 %vec4 = alloca <vscale x 4 x float>, align 16
56 ; As above, but with 16 SVE vectors of stack space.
57 ; The stack adjustment is less than or equal to 16 x 256 = 4096, so
58 ; we can allocate the locals at once.
59 define void @sve_16_vector(ptr %out) #0 {
60 ; CHECK-LABEL: sve_16_vector:
61 ; CHECK: // %bb.0: // %entry
62 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
63 ; CHECK-NEXT: .cfi_def_cfa_offset 16
64 ; CHECK-NEXT: .cfi_offset w29, -16
65 ; CHECK-NEXT: addvl sp, sp, #-16
66 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
67 ; CHECK-NEXT: str xzr, [sp]
68 ; CHECK-NEXT: addvl sp, sp, #16
69 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
70 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
71 ; CHECK-NEXT: .cfi_def_cfa_offset 0
72 ; CHECK-NEXT: .cfi_restore w29
75 %vec1 = alloca <vscale x 4 x float>, align 16
76 %vec2 = alloca <vscale x 4 x float>, align 16
77 %vec3 = alloca <vscale x 4 x float>, align 16
78 %vec4 = alloca <vscale x 4 x float>, align 16
79 %vec5 = alloca <vscale x 4 x float>, align 16
80 %vec6 = alloca <vscale x 4 x float>, align 16
81 %vec7 = alloca <vscale x 4 x float>, align 16
82 %vec8 = alloca <vscale x 4 x float>, align 16
83 %vec9 = alloca <vscale x 4 x float>, align 16
84 %vec10 = alloca <vscale x 4 x float>, align 16
85 %vec11 = alloca <vscale x 4 x float>, align 16
86 %vec12 = alloca <vscale x 4 x float>, align 16
87 %vec13 = alloca <vscale x 4 x float>, align 16
88 %vec14 = alloca <vscale x 4 x float>, align 16
89 %vec15 = alloca <vscale x 4 x float>, align 16
90 %vec16 = alloca <vscale x 4 x float>, align 16
94 ; As above, but with 17 SVE vectors of stack space. Now we need
95 ; a probing loop since the stack adjustment may be greater than
96 ; the probe size (17 x 256 = 4352 bytes)
97 ; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
98 ; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]`
99 define void @sve_17_vector(ptr %out) #0 {
100 ; CHECK-LABEL: sve_17_vector:
101 ; CHECK: // %bb.0: // %entry
102 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
103 ; CHECK-NEXT: .cfi_def_cfa_offset 16
104 ; CHECK-NEXT: .cfi_offset w29, -16
105 ; CHECK-NEXT: addvl x9, sp, #-17
106 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
107 ; CHECK-NEXT: .LBB3_1: // %entry
108 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
109 ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
110 ; CHECK-NEXT: cmp sp, x9
111 ; CHECK-NEXT: b.le .LBB3_3
112 ; CHECK-NEXT: // %bb.2: // %entry
113 ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
114 ; CHECK-NEXT: str xzr, [sp]
115 ; CHECK-NEXT: b .LBB3_1
116 ; CHECK-NEXT: .LBB3_3: // %entry
117 ; CHECK-NEXT: mov sp, x9
118 ; CHECK-NEXT: ldr xzr, [sp]
119 ; CHECK-NEXT: .cfi_def_cfa_register wsp
120 ; CHECK-NEXT: addvl sp, sp, #17
121 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
122 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
123 ; CHECK-NEXT: .cfi_def_cfa_offset 0
124 ; CHECK-NEXT: .cfi_restore w29
127 %vec1 = alloca <vscale x 4 x float>, align 16
128 %vec2 = alloca <vscale x 4 x float>, align 16
129 %vec3 = alloca <vscale x 4 x float>, align 16
130 %vec4 = alloca <vscale x 4 x float>, align 16
131 %vec5 = alloca <vscale x 4 x float>, align 16
132 %vec6 = alloca <vscale x 4 x float>, align 16
133 %vec7 = alloca <vscale x 4 x float>, align 16
134 %vec8 = alloca <vscale x 4 x float>, align 16
135 %vec9 = alloca <vscale x 4 x float>, align 16
136 %vec10 = alloca <vscale x 4 x float>, align 16
137 %vec11 = alloca <vscale x 4 x float>, align 16
138 %vec12 = alloca <vscale x 4 x float>, align 16
139 %vec13 = alloca <vscale x 4 x float>, align 16
140 %vec14 = alloca <vscale x 4 x float>, align 16
141 %vec15 = alloca <vscale x 4 x float>, align 16
142 %vec16 = alloca <vscale x 4 x float>, align 16
143 %vec17 = alloca <vscale x 4 x float>, align 16
147 ; Space for callee-saved SVE register is allocated similarly to allocating
148 ; space for SVE locals. When we know the stack adjustment cannot exceed the
149 ; probe size we can skip the explicit probe, since saving SVE registers serves
150 ; as an implicit probe.
151 define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
152 ; CHECK-LABEL: sve_1v_csr:
153 ; CHECK: // %bb.0: // %entry
154 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
155 ; CHECK-NEXT: .cfi_def_cfa_offset 16
156 ; CHECK-NEXT: .cfi_offset w29, -16
157 ; CHECK-NEXT: addvl sp, sp, #-1
158 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
159 ; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
160 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
162 ; CHECK-NEXT: //NO_APP
163 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
164 ; CHECK-NEXT: addvl sp, sp, #1
165 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
166 ; CHECK-NEXT: .cfi_restore z8
167 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
168 ; CHECK-NEXT: .cfi_def_cfa_offset 0
169 ; CHECK-NEXT: .cfi_restore w29
172 call void asm sideeffect "", "~{z8}" ()
176 define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
177 ; CHECK-LABEL: sve_4v_csr:
178 ; CHECK: // %bb.0: // %entry
179 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
180 ; CHECK-NEXT: .cfi_def_cfa_offset 16
181 ; CHECK-NEXT: .cfi_offset w29, -16
182 ; CHECK-NEXT: addvl sp, sp, #-4
183 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
184 ; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill
185 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
186 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
187 ; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
188 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
189 ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
190 ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
191 ; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
193 ; CHECK-NEXT: //NO_APP
194 ; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload
195 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
196 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
197 ; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
198 ; CHECK-NEXT: addvl sp, sp, #4
199 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
200 ; CHECK-NEXT: .cfi_restore z8
201 ; CHECK-NEXT: .cfi_restore z9
202 ; CHECK-NEXT: .cfi_restore z10
203 ; CHECK-NEXT: .cfi_restore z11
204 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
205 ; CHECK-NEXT: .cfi_def_cfa_offset 0
206 ; CHECK-NEXT: .cfi_restore w29
209 call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
213 define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
214 ; CHECK-LABEL: sve_16v_csr:
215 ; CHECK: // %bb.0: // %entry
216 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
217 ; CHECK-NEXT: .cfi_def_cfa_offset 16
218 ; CHECK-NEXT: .cfi_offset w29, -16
219 ; CHECK-NEXT: addvl sp, sp, #-16
220 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
221 ; CHECK-NEXT: str xzr, [sp]
222 ; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
223 ; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
224 ; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
225 ; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
226 ; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
227 ; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
228 ; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
229 ; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
230 ; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
231 ; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
232 ; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
233 ; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
234 ; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
235 ; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
236 ; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
237 ; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
238 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
239 ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
240 ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
241 ; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
242 ; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
243 ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
244 ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
245 ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
247 ; CHECK-NEXT: //NO_APP
248 ; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
249 ; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
250 ; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
251 ; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
252 ; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
253 ; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
254 ; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
255 ; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
256 ; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
257 ; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
258 ; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
259 ; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
260 ; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
261 ; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
262 ; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
263 ; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
264 ; CHECK-NEXT: addvl sp, sp, #16
265 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
266 ; CHECK-NEXT: .cfi_restore z8
267 ; CHECK-NEXT: .cfi_restore z9
268 ; CHECK-NEXT: .cfi_restore z10
269 ; CHECK-NEXT: .cfi_restore z11
270 ; CHECK-NEXT: .cfi_restore z12
271 ; CHECK-NEXT: .cfi_restore z13
272 ; CHECK-NEXT: .cfi_restore z14
273 ; CHECK-NEXT: .cfi_restore z15
274 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
275 ; CHECK-NEXT: .cfi_def_cfa_offset 0
276 ; CHECK-NEXT: .cfi_restore w29
279 call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
283 define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
284 ; CHECK-LABEL: sve_1p_csr:
285 ; CHECK: // %bb.0: // %entry
286 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
287 ; CHECK-NEXT: .cfi_def_cfa_offset 16
288 ; CHECK-NEXT: .cfi_offset w29, -16
289 ; CHECK-NEXT: addvl sp, sp, #-1
290 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
291 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
293 ; CHECK-NEXT: //NO_APP
294 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
295 ; CHECK-NEXT: addvl sp, sp, #1
296 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
297 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
298 ; CHECK-NEXT: .cfi_def_cfa_offset 0
299 ; CHECK-NEXT: .cfi_restore w29
302 call void asm sideeffect "", "~{p8}" ()
306 define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
307 ; CHECK-LABEL: sve_4p_csr:
308 ; CHECK: // %bb.0: // %entry
309 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
310 ; CHECK-NEXT: .cfi_def_cfa_offset 16
311 ; CHECK-NEXT: .cfi_offset w29, -16
312 ; CHECK-NEXT: addvl sp, sp, #-1
313 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
314 ; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill
315 ; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill
316 ; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill
317 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
319 ; CHECK-NEXT: //NO_APP
320 ; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
321 ; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
322 ; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
323 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
324 ; CHECK-NEXT: addvl sp, sp, #1
325 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
326 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
327 ; CHECK-NEXT: .cfi_def_cfa_offset 0
328 ; CHECK-NEXT: .cfi_restore w29
331 call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
335 define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
336 ; CHECK-LABEL: sve_16v_1p_csr:
337 ; CHECK: // %bb.0: // %entry
338 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
339 ; CHECK-NEXT: .cfi_def_cfa_offset 16
340 ; CHECK-NEXT: .cfi_offset w29, -16
341 ; CHECK-NEXT: addvl x9, sp, #-17
342 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
343 ; CHECK-NEXT: .LBB9_1: // %entry
344 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
345 ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
346 ; CHECK-NEXT: cmp sp, x9
347 ; CHECK-NEXT: b.le .LBB9_3
348 ; CHECK-NEXT: // %bb.2: // %entry
349 ; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1
350 ; CHECK-NEXT: str xzr, [sp]
351 ; CHECK-NEXT: b .LBB9_1
352 ; CHECK-NEXT: .LBB9_3: // %entry
353 ; CHECK-NEXT: mov sp, x9
354 ; CHECK-NEXT: ldr xzr, [sp]
355 ; CHECK-NEXT: .cfi_def_cfa_register wsp
356 ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
357 ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
358 ; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
359 ; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
360 ; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
361 ; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
362 ; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
363 ; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
364 ; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
365 ; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill
366 ; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill
367 ; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill
368 ; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill
369 ; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill
370 ; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill
371 ; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill
372 ; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill
373 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
374 ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
375 ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
376 ; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
377 ; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
378 ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
379 ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
380 ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
382 ; CHECK-NEXT: //NO_APP
383 ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
384 ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
385 ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
386 ; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
387 ; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
388 ; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
389 ; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
390 ; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
391 ; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
392 ; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
393 ; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
394 ; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
395 ; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
396 ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
397 ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
398 ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
399 ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
400 ; CHECK-NEXT: addvl sp, sp, #17
401 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
402 ; CHECK-NEXT: .cfi_restore z8
403 ; CHECK-NEXT: .cfi_restore z9
404 ; CHECK-NEXT: .cfi_restore z10
405 ; CHECK-NEXT: .cfi_restore z11
406 ; CHECK-NEXT: .cfi_restore z12
407 ; CHECK-NEXT: .cfi_restore z13
408 ; CHECK-NEXT: .cfi_restore z14
409 ; CHECK-NEXT: .cfi_restore z15
410 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
411 ; CHECK-NEXT: .cfi_def_cfa_offset 0
412 ; CHECK-NEXT: .cfi_restore w29
415 call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
419 ; An SVE vector and a 16-byte fixed-size object.
420 define void @sve_1_vector_16_arr(ptr %out) #0 {
421 ; CHECK-LABEL: sve_1_vector_16_arr:
422 ; CHECK: // %bb.0: // %entry
423 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
424 ; CHECK-NEXT: .cfi_def_cfa_offset 16
425 ; CHECK-NEXT: .cfi_offset w29, -16
426 ; CHECK-NEXT: sub sp, sp, #16
427 ; CHECK-NEXT: .cfi_def_cfa_offset 32
428 ; CHECK-NEXT: addvl sp, sp, #-1
429 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
430 ; CHECK-NEXT: addvl sp, sp, #1
431 ; CHECK-NEXT: .cfi_def_cfa wsp, 32
432 ; CHECK-NEXT: add sp, sp, #16
433 ; CHECK-NEXT: .cfi_def_cfa_offset 16
434 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
435 ; CHECK-NEXT: .cfi_def_cfa_offset 0
436 ; CHECK-NEXT: .cfi_restore w29
439 %vec = alloca <vscale x 4 x float>, align 16
440 %arr = alloca i8, i64 16, align 1
444 ; A large SVE stack object and a large stack slot, both of which need probing.
445 ; TODO: This could be optimised by combining the fixed-size offset into the
447 define void @sve_1_vector_4096_arr(ptr %out) #0 {
448 ; CHECK-LABEL: sve_1_vector_4096_arr:
449 ; CHECK: // %bb.0: // %entry
450 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
451 ; CHECK-NEXT: .cfi_def_cfa_offset 16
452 ; CHECK-NEXT: .cfi_offset w29, -16
453 ; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288
454 ; CHECK-NEXT: .cfi_def_cfa w9, 12304
455 ; CHECK-NEXT: addvl x9, x9, #-32
456 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
457 ; CHECK-NEXT: addvl x9, x9, #-32
458 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
459 ; CHECK-NEXT: .LBB11_1: // %entry
460 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
461 ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
462 ; CHECK-NEXT: cmp sp, x9
463 ; CHECK-NEXT: b.le .LBB11_3
464 ; CHECK-NEXT: // %bb.2: // %entry
465 ; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1
466 ; CHECK-NEXT: str xzr, [sp]
467 ; CHECK-NEXT: b .LBB11_1
468 ; CHECK-NEXT: .LBB11_3: // %entry
469 ; CHECK-NEXT: mov sp, x9
470 ; CHECK-NEXT: ldr xzr, [sp]
471 ; CHECK-NEXT: .cfi_def_cfa_register wsp
472 ; CHECK-NEXT: addvl sp, sp, #31
473 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
474 ; CHECK-NEXT: addvl sp, sp, #31
475 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
476 ; CHECK-NEXT: addvl sp, sp, #2
477 ; CHECK-NEXT: .cfi_def_cfa wsp, 12304
478 ; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288
479 ; CHECK-NEXT: .cfi_def_cfa_offset 16
480 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
481 ; CHECK-NEXT: .cfi_def_cfa_offset 0
482 ; CHECK-NEXT: .cfi_restore w29
485 %vec = alloca <vscale x 256 x float>, align 16
486 %arr = alloca i8, i64 12288, align 1
490 ; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
491 ; supported even without stack-probing.
493 ; An SVE vector, and a 16-byte fixed size object, which
494 ; has a large alignment requirement.
495 define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
496 ; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
497 ; CHECK: // %bb.0: // %entry
498 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
499 ; CHECK-NEXT: .cfi_def_cfa_offset 16
500 ; CHECK-NEXT: mov x29, sp
501 ; CHECK-NEXT: .cfi_def_cfa w29, 16
502 ; CHECK-NEXT: .cfi_offset w30, -8
503 ; CHECK-NEXT: .cfi_offset w29, -16
504 ; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
505 ; CHECK-NEXT: sub x9, x9, #4080
506 ; CHECK-NEXT: addvl x9, x9, #-1
507 ; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
508 ; CHECK-NEXT: .LBB12_1: // %entry
509 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
510 ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
511 ; CHECK-NEXT: cmp sp, x9
512 ; CHECK-NEXT: b.le .LBB12_3
513 ; CHECK-NEXT: // %bb.2: // %entry
514 ; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1
515 ; CHECK-NEXT: str xzr, [sp]
516 ; CHECK-NEXT: b .LBB12_1
517 ; CHECK-NEXT: .LBB12_3: // %entry
518 ; CHECK-NEXT: mov sp, x9
519 ; CHECK-NEXT: ldr xzr, [sp]
520 ; CHECK-NEXT: mov sp, x29
521 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
522 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
523 ; CHECK-NEXT: .cfi_def_cfa_offset 0
524 ; CHECK-NEXT: .cfi_restore w30
525 ; CHECK-NEXT: .cfi_restore w29
528 %vec = alloca <vscale x 4 x float>, align 16
529 %arr = alloca i8, i64 16, align 8192
533 ; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
534 define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
535 ; CHECK-LABEL: sve_1024_64k_guard:
536 ; CHECK: // %bb.0: // %entry
537 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
538 ; CHECK-NEXT: .cfi_def_cfa_offset 16
539 ; CHECK-NEXT: .cfi_offset w29, -16
540 ; CHECK-NEXT: addvl sp, sp, #-32
541 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
542 ; CHECK-NEXT: addvl sp, sp, #-32
543 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
544 ; CHECK-NEXT: addvl sp, sp, #-32
545 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
546 ; CHECK-NEXT: addvl sp, sp, #-32
547 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
548 ; CHECK-NEXT: addvl sp, sp, #-32
549 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
550 ; CHECK-NEXT: addvl sp, sp, #-32
551 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
552 ; CHECK-NEXT: addvl sp, sp, #-32
553 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
554 ; CHECK-NEXT: addvl sp, sp, #-32
555 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
556 ; CHECK-NEXT: str xzr, [sp]
557 ; CHECK-NEXT: addvl sp, sp, #31
558 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
559 ; CHECK-NEXT: addvl sp, sp, #31
560 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
561 ; CHECK-NEXT: addvl sp, sp, #31
562 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
563 ; CHECK-NEXT: addvl sp, sp, #31
564 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
565 ; CHECK-NEXT: addvl sp, sp, #31
566 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
567 ; CHECK-NEXT: addvl sp, sp, #31
568 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
569 ; CHECK-NEXT: addvl sp, sp, #31
570 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
571 ; CHECK-NEXT: addvl sp, sp, #31
572 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
573 ; CHECK-NEXT: addvl sp, sp, #8
574 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
575 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
576 ; CHECK-NEXT: .cfi_def_cfa_offset 0
577 ; CHECK-NEXT: .cfi_restore w29
580 %vec = alloca <vscale x 1024 x float>, align 16
584 define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
585 ; CHECK-LABEL: sve_1028_64k_guard:
586 ; CHECK: // %bb.0: // %entry
587 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
588 ; CHECK-NEXT: .cfi_def_cfa_offset 16
589 ; CHECK-NEXT: .cfi_offset w29, -16
590 ; CHECK-NEXT: addvl x9, sp, #-32
591 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
592 ; CHECK-NEXT: addvl x9, x9, #-32
593 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
594 ; CHECK-NEXT: addvl x9, x9, #-32
595 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
596 ; CHECK-NEXT: addvl x9, x9, #-32
597 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
598 ; CHECK-NEXT: addvl x9, x9, #-32
599 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
600 ; CHECK-NEXT: addvl x9, x9, #-32
601 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
602 ; CHECK-NEXT: addvl x9, x9, #-32
603 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
604 ; CHECK-NEXT: addvl x9, x9, #-32
605 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
606 ; CHECK-NEXT: addvl x9, x9, #-1
607 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
608 ; CHECK-NEXT: .LBB14_1: // %entry
609 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
610 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
611 ; CHECK-NEXT: cmp sp, x9
612 ; CHECK-NEXT: b.le .LBB14_3
613 ; CHECK-NEXT: // %bb.2: // %entry
614 ; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1
615 ; CHECK-NEXT: str xzr, [sp]
616 ; CHECK-NEXT: b .LBB14_1
617 ; CHECK-NEXT: .LBB14_3: // %entry
618 ; CHECK-NEXT: mov sp, x9
619 ; CHECK-NEXT: ldr xzr, [sp]
620 ; CHECK-NEXT: .cfi_def_cfa_register wsp
621 ; CHECK-NEXT: addvl sp, sp, #31
622 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
623 ; CHECK-NEXT: addvl sp, sp, #31
624 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
625 ; CHECK-NEXT: addvl sp, sp, #31
626 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
627 ; CHECK-NEXT: addvl sp, sp, #31
628 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
629 ; CHECK-NEXT: addvl sp, sp, #31
630 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
631 ; CHECK-NEXT: addvl sp, sp, #31
632 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
633 ; CHECK-NEXT: addvl sp, sp, #31
634 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
635 ; CHECK-NEXT: addvl sp, sp, #31
636 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
637 ; CHECK-NEXT: addvl sp, sp, #9
638 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
639 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
640 ; CHECK-NEXT: .cfi_def_cfa_offset 0
641 ; CHECK-NEXT: .cfi_restore w29
644 %vec = alloca <vscale x 1024 x float>, align 16
645 %vec1 = alloca <vscale x 4 x float>, align 16
649 ; With 5 SVE vectors of stack space the unprobed area
650 ; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
651 ; hence we need to issue a probe.
652 define void @sve_5_vector(ptr %out) #0 {
653 ; CHECK-LABEL: sve_5_vector:
654 ; CHECK: // %bb.0: // %entry
655 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
656 ; CHECK-NEXT: .cfi_def_cfa_offset 16
657 ; CHECK-NEXT: .cfi_offset w29, -16
658 ; CHECK-NEXT: addvl sp, sp, #-5
659 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
660 ; CHECK-NEXT: str xzr, [sp]
661 ; CHECK-NEXT: addvl sp, sp, #5
662 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
663 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
664 ; CHECK-NEXT: .cfi_def_cfa_offset 0
665 ; CHECK-NEXT: .cfi_restore w29
; All five vectors are allocated with a single `addvl sp, sp, #-5` and the
; new SP is probed with one `str xzr, [sp]` (see CHECK lines above): no
; probe loop is emitted, since the adjustment cannot exceed the probe size.
668 %vec1 = alloca <vscale x 4 x float>, align 16
669 %vec2 = alloca <vscale x 4 x float>, align 16
670 %vec3 = alloca <vscale x 4 x float>, align 16
671 %vec4 = alloca <vscale x 4 x float>, align 16
672 %vec5 = alloca <vscale x 4 x float>, align 16
676 ; Test with 14 scalable bytes (so up to 14 * 16 = 224) of unprobed
677 ; area below the save location of `p9`.
678 define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
679 ; CHECK-LABEL: sve_unprobed_area:
680 ; CHECK: // %bb.0: // %entry
681 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
682 ; CHECK-NEXT: .cfi_def_cfa_offset 16
683 ; CHECK-NEXT: .cfi_offset w29, -16
684 ; CHECK-NEXT: addvl sp, sp, #-4
685 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
686 ; CHECK-NEXT: str xzr, [sp]
687 ; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
688 ; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
689 ; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
690 ; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
691 ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
692 ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
693 ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
694 ; CHECK-NEXT: addvl sp, sp, #-4
695 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
697 ; CHECK-NEXT: //NO_APP
698 ; CHECK-NEXT: addvl sp, sp, #4
699 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
700 ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
701 ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
702 ; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
703 ; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
704 ; CHECK-NEXT: addvl sp, sp, #4
705 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
706 ; CHECK-NEXT: .cfi_restore z8
707 ; CHECK-NEXT: .cfi_restore z9
708 ; CHECK-NEXT: .cfi_restore z10
709 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
710 ; CHECK-NEXT: .cfi_def_cfa_offset 0
711 ; CHECK-NEXT: .cfi_restore w29
; The inline asm clobbers SVE callee-saved registers, forcing the prologue
; to spill z8-z10 and p9 into the first `addvl sp, sp, #-4` area, which is
; probed with `str xzr, [sp]`. The second `addvl sp, sp, #-4` (for the
; locals below) carries no probe store in the CHECK lines above -- the
; unprobed region below the `p9` save stays within the allowed limit.
714 call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
; Four SVE vectors of local stack space.
716 %v0 = alloca <vscale x 4 x float>, align 16
717 %v1 = alloca <vscale x 4 x float>, align 16
718 %v2 = alloca <vscale x 4 x float>, align 16
719 %v3 = alloca <vscale x 4 x float>, align 16
; Shared by every function in this test: async unwind tables, stack probing
; via inline sequences ("probe-stack"="inline-asm"), no frame pointer, SVE.
724 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }