1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
; External callees, one per SME streaming interface: no streaming attribute,
; "aarch64_pstate_sm_enabled", and "aarch64_pstate_sm_compatible".
4 declare void @normal_callee();
5 declare void @streaming_callee() "aarch64_pstate_sm_enabled";
6 declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
; A locally-streaming function ("aarch64_pstate_sm_body") that makes two calls.
; The CHECK lines expect a single smstart/smstop pair wrapping both calls, with
; d8-d15 spilled before smstart and reloaded after smstop.
; NOTE(review): despite the function name, the callee invoked here is
; @streaming_compatible_callee, not @streaming_callee.
8 define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
9 ; CHECK-LABEL: locally_streaming_caller_streaming_callee:
11 ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
12 ; CHECK-NEXT: rdsvl x9, #1
13 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
14 ; CHECK-NEXT: lsr x9, x9, #3
15 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
16 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
17 ; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
19 ; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
20 ; CHECK-NEXT: smstart sm
21 ; CHECK-NEXT: bl streaming_compatible_callee
22 ; CHECK-NEXT: bl streaming_compatible_callee
23 ; CHECK-NEXT: smstop sm
24 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
25 ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
26 ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
27 ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
28 ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
31 call void @streaming_compatible_callee();
32 call void @streaming_compatible_callee();
36 ; Test that with both a streaming body and a streaming interface, no smstart/smstop is emitted,
37 ; because the function is already in streaming mode upon entry.
38 define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind {
39 ; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee:
41 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
42 ; CHECK-NEXT: bl streaming_callee
43 ; CHECK-NEXT: bl streaming_callee
44 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
46 call void @streaming_callee();
47 call void @streaming_callee();
; A locally-streaming function with two exit blocks (%if.else and %if.end).
; The CHECK lines expect one smstart in the entry block and, on each exit path,
; its own smstop followed by the d8-d15 reloads and the stack deallocation.
51 define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
52 ; CHECK-LABEL: locally_streaming_multiple_exit:
53 ; CHECK: // %bb.0: // %entry
54 ; CHECK-NEXT: rdsvl x9, #1
55 ; CHECK-NEXT: lsr x9, x9, #3
56 ; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
58 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
59 ; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
60 ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
61 ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
62 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
63 ; CHECK-NEXT: smstart sm
64 ; CHECK-NEXT: cmp x0, #1
65 ; CHECK-NEXT: b.ne .LBB2_2
66 ; CHECK-NEXT: // %bb.1: // %if.else
67 ; CHECK-NEXT: smstop sm
68 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
69 ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
70 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
71 ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
72 ; CHECK-NEXT: add sp, sp, #80
74 ; CHECK-NEXT: .LBB2_2: // %if.end
75 ; CHECK-NEXT: smstop sm
76 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
77 ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
78 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
79 ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
80 ; CHECK-NEXT: add sp, sp, #80
84 %tobool = icmp eq i64 %cond, 1
85 br i1 %tobool, label %if.else, label %if.end
94 ; Do a fixed-width vector add on a NEON vector.
96 ; * Incoming vector in v0.d isn't clobbered by the change in streaming mode.
97 ; * Result vector is correctly preserved after smstop.
; In the expected code, q0 is spilled before smstart and reloaded (into q1)
; after it; the result in q0 is spilled before smstop and reloaded after it.
98 define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
99 ; CHECK-LABEL: locally_streaming_caller_no_callee:
101 ; CHECK-NEXT: sub sp, sp, #96
102 ; CHECK-NEXT: rdsvl x9, #1
103 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
104 ; CHECK-NEXT: lsr x9, x9, #3
105 ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
106 ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
107 ; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
108 ; CHECK-NEXT: cntd x9
109 ; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
110 ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
111 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
112 ; CHECK-NEXT: smstart sm
113 ; CHECK-NEXT: index z0.d, #0, #1
114 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
115 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
116 ; CHECK-NEXT: add z0.d, z0.d, z1.d
117 ; CHECK-NEXT: add z0.d, z0.d, #41 // =0x29
118 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
119 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
120 ; CHECK-NEXT: smstop sm
121 ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
122 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
123 ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
124 ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
125 ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
126 ; CHECK-NEXT: add sp, sp, #96
129 %add = add <2 x i64> %a, <i64 41, i64 42>;
133 ; Test that we use the interface (not the function's body) to determine which
134 ; streaming mode to be in when entering the callee. In this case the interface is normal, so
135 ; pstate.sm must be 0 on entry and is 0 upon return from the callee.
136 define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
137 ; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
139 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
140 ; CHECK-NEXT: bl locally_streaming_caller_streaming_callee
141 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
144 call void @locally_streaming_caller_streaming_callee();
149 ; Test that a locally streaming function correctly retains the
150 ; argument/result registers, because the smstart/smstop instructions that are
151 ; inserted to implement the arm_locally_streaming attribute thrash the
152 ; vector register contents.
155 define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
156 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
158 ; CHECK-NEXT: sub sp, sp, #112
159 ; CHECK-NEXT: rdsvl x9, #1
160 ; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
161 ; CHECK-NEXT: lsr x9, x9, #3
162 ; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
163 ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
164 ; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
165 ; CHECK-NEXT: cntd x9
166 ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
167 ; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
168 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
169 ; CHECK-NEXT: smstart sm
170 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
171 ; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret
172 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
173 ; CHECK-NEXT: smstop sm
174 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
175 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
176 ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
177 ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
178 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
179 ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
180 ; CHECK-NEXT: add sp, sp, #112
182 %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
186 declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
; A locally-streaming function taking and returning a struct of two vectors.
; Only element 1 of the argument is passed to the callee: the expected code
; spills q1 before smstart and reloads it into q0 afterwards. Both result
; vectors (q0, q1) are spilled before smstop and reloaded after it.
188 define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
189 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
191 ; CHECK-NEXT: sub sp, sp, #128
192 ; CHECK-NEXT: rdsvl x9, #1
193 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
194 ; CHECK-NEXT: lsr x9, x9, #3
195 ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
196 ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
197 ; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
198 ; CHECK-NEXT: cntd x9
199 ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
200 ; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
201 ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
202 ; CHECK-NEXT: smstart sm
203 ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
204 ; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret
205 ; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
206 ; CHECK-NEXT: smstop sm
207 ; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
208 ; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
209 ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
210 ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
211 ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
212 ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
213 ; CHECK-NEXT: add sp, sp, #128
215 %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
216 %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
217 ret {<2 x i64>, <2 x i64>} %res;
220 declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible"
222 ; Test that we use `addsvl` to allocate any stack space for locals before `smstart`,
223 ; so that the correct amount of stack space is allocated.
; The local here is a scalable-vector alloca (<vscale x 4 x i32>) whose
; address is passed to a streaming-compatible callee.
224 define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
225 ; CHECK-LABEL: locally_streaming_caller_alloca:
227 ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
228 ; CHECK-NEXT: rdsvl x9, #1
229 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
230 ; CHECK-NEXT: lsr x9, x9, #3
231 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
232 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
233 ; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
234 ; CHECK-NEXT: cntd x9
235 ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
236 ; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
237 ; CHECK-NEXT: addsvl sp, sp, #-1
238 ; CHECK-NEXT: smstart sm
239 ; CHECK-NEXT: mov x0, sp
240 ; CHECK-NEXT: bl use_ptr
241 ; CHECK-NEXT: smstop sm
242 ; CHECK-NEXT: addsvl sp, sp, #1
243 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
244 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
245 ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
246 ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
247 ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
249 %alloca = alloca <vscale x 4 x i32>
250 call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
254 declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
; A locally-streaming function whose body calls the llvm.cos.f64 intrinsic
; (with the `fast` flag) on its argument.
; NOTE(review): the x30 spill/reload suggests the intrinsic is lowered to a
; call; the relevant CHECK line is not visible here - confirm.
256 define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
257 ; CHECK-LABEL: call_to_intrinsic_without_chain:
258 ; CHECK: // %bb.0: // %entry
259 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
261 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
264 %0 = call fast double @llvm.cos.f64(double %x)
268 declare double @llvm.cos.f64(double)
; The float argument %arg is only used by the fadd after the loop. The CHECK
; lines expect s0 to be spilled before smstart, reloaded (into s1) in
; %for.cond.cleanup for the fadd, and the result to be spilled before smstop
; and reloaded after it.
271 define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
272 ; CHECK-LABEL: test_arg_survives_loop:
273 ; CHECK: // %bb.0: // %entry
274 ; CHECK-NEXT: sub sp, sp, #96
275 ; CHECK-NEXT: rdsvl x9, #1
276 ; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
277 ; CHECK-NEXT: lsr x9, x9, #3
278 ; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
279 ; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
280 ; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
281 ; CHECK-NEXT: cntd x9
282 ; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
283 ; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
284 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
285 ; CHECK-NEXT: smstart sm
286 ; CHECK-NEXT: .LBB9_1: // %for.body
287 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
288 ; CHECK-NEXT: subs w0, w0, #1
289 ; CHECK-NEXT: b.ne .LBB9_1
290 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
291 ; CHECK-NEXT: fmov s0, #1.00000000
292 ; CHECK-NEXT: ldr s1, [sp, #12] // 4-byte Folded Reload
293 ; CHECK-NEXT: fadd s0, s1, s0
294 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
295 ; CHECK-NEXT: smstop sm
296 ; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
297 ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
298 ; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
299 ; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
300 ; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
301 ; CHECK-NEXT: add sp, sp, #96
307 %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
308 %inc = add nuw nsw i32 %i.02, 1
309 %exitcond.not = icmp eq i32 %inc, %N
310 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
313 %add = fadd float %arg, 1.000000e+00
; A `tail call` made from a locally-streaming function. The CHECK lines expect
; a regular `bl` followed by smstop and the register reloads, i.e. the call is
; not emitted as a tail call.
318 define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
319 ; CHECK-LABEL: disable_tailcallopt:
321 ; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
322 ; CHECK-NEXT: rdsvl x9, #1
323 ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
324 ; CHECK-NEXT: lsr x9, x9, #3
325 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
326 ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
327 ; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
328 ; CHECK-NEXT: cntd x9
329 ; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
330 ; CHECK-NEXT: smstart sm
331 ; CHECK-NEXT: bl streaming_compatible_callee
332 ; CHECK-NEXT: smstop sm
333 ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
334 ; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
335 ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
336 ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
337 ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
339 tail call void @streaming_compatible_callee();