1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck %s
4 ; The FP (s33) is in the CSR range and is modified by this function.
; Compiled with attribute group #1 ("frame-pointer"="all"). The visible body
; does one volatile store of the constant 1 to a private (addrspace(5)) alloca.
; Expected assembly: s33 is saved in s4, s33 is set from s32, s32 is bumped by
; 0x200, the store goes through s33 at offset 4, and then the s32 bump is undone
; (0xfffffe00) and s33 is restored from s4 before returning via s[30:31].
5 define hidden fastcc void @callee_has_fp() #1 {
6 ; CHECK-LABEL: callee_has_fp:
8 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9 ; CHECK-NEXT: s_mov_b32 s4, s33
10 ; CHECK-NEXT: s_mov_b32 s33, s32
11 ; CHECK-NEXT: s_add_i32 s32, s32, 0x200
12 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
13 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
14 ; CHECK-NEXT: s_waitcnt vmcnt(0)
15 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffe00
16 ; CHECK-NEXT: s_mov_b32 s33, s4
17 ; CHECK-NEXT: s_setpc_b64 s[30:31]
18 %alloca = alloca i32, addrspace(5)
19 store volatile i32 1, i32 addrspace(5)* %alloca
23 ; Has no stack objects of its own, but gains some because of the CSR spill.
24 ; With IPRA we see that the callee modifies the FP. There should be no
25 ; redundant spills of s33, and the compiler should not assert.
; Compiled with attribute group #0 ("frame-pointer"="none"). Calls
; @callee_has_fp and then runs inline asm that clobbers v40.
; Expected assembly:
;   - v1 is saved to scratch (s32 offset:4) with all lanes enabled via
;     s_or_saveexec_b64 -1, and reloaded the same way before returning.
;   - s33 is written to lane 2 of v1 and read back once; s30/s31 use lanes 0/1.
;   - s33 is set from s32 and s32 is bumped by 0x400 around the call.
;   - v40 is stored at s33 before the call and reloaded after the inline asm.
26 define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
27 ; CHECK-LABEL: csr_vgpr_spill_fp_callee:
28 ; CHECK: ; %bb.0: ; %bb
29 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
31 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
32 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
33 ; CHECK-NEXT: v_writelane_b32 v1, s33, 2
34 ; CHECK-NEXT: s_mov_b32 s33, s32
35 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400
36 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
37 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0
38 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1
39 ; CHECK-NEXT: s_getpc_b64 s[4:5]
40 ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
41 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
42 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
43 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
44 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
45 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
46 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
47 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0
48 ; CHECK-NEXT: v_readlane_b32 s31, v1, 1
49 ; CHECK-NEXT: ;;#ASMSTART
50 ; CHECK-NEXT: ; clobber csr v40
51 ; CHECK-NEXT: ;;#ASMEND
52 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
53 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
54 ; CHECK-NEXT: v_readlane_b32 s33, v1, 2
55 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
56 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
57 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
58 ; CHECK-NEXT: s_waitcnt vmcnt(0)
59 ; CHECK-NEXT: s_setpc_b64 s[30:31]
61 call fastcc void @callee_has_fp()
62 call void asm sideeffect "; clobber csr v40", "~{v40}"()
; Kernel entry point that invokes @csr_vgpr_spill_fp_callee.
; Expected assembly: s32 is initialized to 0, flat_scratch and s[0:1] are
; offset by s7, the callee address is formed with s_getpc_b64 plus rel32
; fixups, and the call is made with s_swappc_b64 followed by s_endpgm (the IR
; marks the call as `tail call`, but a regular call sequence is expected here).
66 define amdgpu_kernel void @kernel_call() {
67 ; CHECK-LABEL: kernel_call:
68 ; CHECK: ; %bb.0: ; %bb
69 ; CHECK-NEXT: s_mov_b32 s32, 0
70 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
71 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
72 ; CHECK-NEXT: s_add_u32 s0, s0, s7
73 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
74 ; CHECK-NEXT: s_getpc_b64 s[4:5]
75 ; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_callee@rel32@lo+4
76 ; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_callee@rel32@hi+12
77 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
78 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
79 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
80 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
81 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
82 ; CHECK-NEXT: s_endpgm
84 tail call fastcc void @csr_vgpr_spill_fp_callee()
88 ; Same, except with a tail call.
; Compiled with attribute group #0 ("frame-pointer"="none"). Runs inline asm
; that clobbers v40 and then tail-calls @callee_has_fp.
; Expected assembly:
;   - v1 is saved/reloaded at s32 offset:4 with all lanes enabled
;     (s_or_saveexec_b64 -1); v40 is saved/reloaded at s32.
;   - s33 is written to lane 0 of v1 and read back before the jump; s33 is not
;     set from s32 and s32 is not adjusted.
;   - Control leaves through s_setpc_b64 s[4:5] (the callee address) instead of
;     an s_swappc_b64 call followed by a return.
89 define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
90 ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
91 ; CHECK: ; %bb.0: ; %bb
92 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
94 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
95 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
96 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
97 ; CHECK-NEXT: v_writelane_b32 v1, s33, 0
98 ; CHECK-NEXT: ;;#ASMSTART
99 ; CHECK-NEXT: ; clobber csr v40
100 ; CHECK-NEXT: ;;#ASMEND
101 ; CHECK-NEXT: s_getpc_b64 s[4:5]
102 ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4
103 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12
104 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0
105 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
106 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
107 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
108 ; CHECK-NEXT: s_mov_b64 exec, s[6:7]
109 ; CHECK-NEXT: s_setpc_b64 s[4:5]
111 call void asm sideeffect "; clobber csr v40", "~{v40}"()
112 tail call fastcc void @callee_has_fp()
; Kernel entry point that invokes @csr_vgpr_spill_fp_tailcall_callee.
; Expected assembly: s32 is initialized to 0, flat_scratch and s[0:1] are
; offset by s7, the callee address is formed with s_getpc_b64 plus rel32
; fixups, and the call is made with s_swappc_b64 followed by s_endpgm.
116 define amdgpu_kernel void @kernel_tailcall() {
117 ; CHECK-LABEL: kernel_tailcall:
118 ; CHECK: ; %bb.0: ; %bb
119 ; CHECK-NEXT: s_mov_b32 s32, 0
120 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
121 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
122 ; CHECK-NEXT: s_add_u32 s0, s0, s7
123 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
124 ; CHECK-NEXT: s_getpc_b64 s[4:5]
125 ; CHECK-NEXT: s_add_u32 s4, s4, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
126 ; CHECK-NEXT: s_addc_u32 s5, s5, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
127 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
128 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
129 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
130 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
131 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
132 ; CHECK-NEXT: s_endpgm
134 tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
; Compiled with attribute group #1 ("frame-pointer"="all").
; Expected assembly: s33 is saved in s4, s33 is set from s32, v0 is set to 0,
; and s33 is restored from s4 before returning via s[30:31]; s32 is never
; adjusted. NOTE(review): v0 = 0 is presumably the i32 return value -- confirm
; against the function body.
138 define hidden i32 @tail_call() #1 {
139 ; CHECK-LABEL: tail_call:
140 ; CHECK: ; %bb.0: ; %entry
141 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; CHECK-NEXT: s_mov_b32 s4, s33
143 ; CHECK-NEXT: s_mov_b32 s33, s32
144 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
145 ; CHECK-NEXT: s_mov_b32 s33, s4
146 ; CHECK-NEXT: s_setpc_b64 s[30:31]
; Compiled with attribute group #0 ("frame-pointer"="none"). Makes a regular
; (non-tail) call to @tail_call.
; Expected assembly:
;   - v1 is saved/reloaded at s32 with all lanes enabled (s_or_saveexec_b64 -1).
;   - s30, s31 and s33 are kept in lanes 0, 1 and 2 of v1 across the call.
;   - s33 is set from s32 and s32 is bumped by 0x400, then both are restored
;     before returning via s[30:31].
151 define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
152 ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
153 ; CHECK: ; %bb.0: ; %entry
154 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
156 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
157 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
158 ; CHECK-NEXT: v_writelane_b32 v1, s33, 2
159 ; CHECK-NEXT: s_mov_b32 s33, s32
160 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400
161 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0
162 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1
163 ; CHECK-NEXT: s_getpc_b64 s[4:5]
164 ; CHECK-NEXT: s_add_u32 s4, s4, tail_call@rel32@lo+4
165 ; CHECK-NEXT: s_addc_u32 s5, s5, tail_call@rel32@hi+12
166 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
167 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
168 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
169 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
170 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
171 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0
172 ; CHECK-NEXT: v_readlane_b32 s31, v1, 1
173 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
174 ; CHECK-NEXT: v_readlane_b32 s33, v1, 2
175 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
176 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
177 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
178 ; CHECK-NEXT: s_waitcnt vmcnt(0)
179 ; CHECK-NEXT: s_setpc_b64 s[30:31]
181 %call = call i32 @tail_call()
; Compiled with attribute group #0 ("frame-pointer"="none"). Makes a regular
; call to @caller_save_vgpr_spill_fp_tail_call.
; Expected assembly mirrors that callee's own frame setup, except that v2
; (rather than v1) is the VGPR saved to scratch and used to hold s30, s31 and
; s33 in lanes 0, 1 and 2 across the call.
185 define hidden i32 @caller_save_vgpr_spill_fp() #0 {
186 ; CHECK-LABEL: caller_save_vgpr_spill_fp:
187 ; CHECK: ; %bb.0: ; %entry
188 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
190 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
191 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
192 ; CHECK-NEXT: v_writelane_b32 v2, s33, 2
193 ; CHECK-NEXT: s_mov_b32 s33, s32
194 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400
195 ; CHECK-NEXT: v_writelane_b32 v2, s30, 0
196 ; CHECK-NEXT: v_writelane_b32 v2, s31, 1
197 ; CHECK-NEXT: s_getpc_b64 s[4:5]
198 ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4
199 ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12
200 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
201 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
202 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
203 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
204 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
205 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0
206 ; CHECK-NEXT: v_readlane_b32 s31, v2, 1
207 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
208 ; CHECK-NEXT: v_readlane_b32 s33, v2, 2
209 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1
210 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
211 ; CHECK-NEXT: s_mov_b64 exec, s[4:5]
212 ; CHECK-NEXT: s_waitcnt vmcnt(0)
213 ; CHECK-NEXT: s_setpc_b64 s[30:31]
215 %call = call i32 @caller_save_vgpr_spill_fp_tail_call()
; Kernel entry point that calls @caller_save_vgpr_spill_fp.
; Expected assembly: s32 is initialized to 0, flat_scratch and s[0:1] are
; offset by s7, the callee address is formed with s_getpc_b64 plus rel32
; fixups, and the call is made with s_swappc_b64 followed by s_endpgm.
219 define protected amdgpu_kernel void @kernel() {
220 ; CHECK-LABEL: kernel:
221 ; CHECK: ; %bb.0: ; %entry
222 ; CHECK-NEXT: s_mov_b32 s32, 0
223 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
224 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
225 ; CHECK-NEXT: s_add_u32 s0, s0, s7
226 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
227 ; CHECK-NEXT: s_getpc_b64 s[4:5]
228 ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp@rel32@lo+4
229 ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp@rel32@hi+12
230 ; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
231 ; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
232 ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
233 ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
234 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
235 ; CHECK-NEXT: s_endpgm
237 %call = call i32 @caller_save_vgpr_spill_fp()
; Attribute groups shared by the functions in this file. Both are noinline.
;   #0 - "frame-pointer"="none": used by the csr_vgpr_spill_fp_* and
;        caller_save_vgpr_spill_fp* functions.
;   #1 - "frame-pointer"="all": used by @callee_has_fp and @tail_call.
241 attributes #0 = { "frame-pointer"="none" noinline }
242 attributes #1 = { "frame-pointer"="all" noinline }