1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s
; Kernel test_kern_empty (attribute group #0): no stack object and no call is
; visible in this kernel. The assertions for all four targets expect the
; generated code to consist of s_endpgm only, i.e. no scratch or stack-pointer
; setup is emitted.
7 define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
8 ; GFX803-LABEL: test_kern_empty:
9 ; GFX803: ; %bb.0: ; %entry
10 ; GFX803-NEXT: s_endpgm
12 ; GFX900-LABEL: test_kern_empty:
13 ; GFX900: ; %bb.0: ; %entry
14 ; GFX900-NEXT: s_endpgm
16 ; GFX1010-LABEL: test_kern_empty:
17 ; GFX1010: ; %bb.0: ; %entry
18 ; GFX1010-NEXT: s_endpgm
20 ; GFX1100-LABEL: test_kern_empty:
21 ; GFX1100: ; %bb.0: ; %entry
22 ; GFX1100-NEXT: s_endpgm
; Kernel test_kern_stack (attribute group #0): allocates one i32 in address
; space 5 and performs a volatile store of 0 to it; no call is made.
; Expected code per the assertions below:
;   - GFX803 / GFX900 / GFX1010 add s7 into s[0:1] (with carry) and store
;     through the s[0:3] buffer descriptor at offset 4.
;   - GFX1100 emits a scratch_store_b32 at offset 4 without that setup and
;     sends MSG_DEALLOC_VGPRS before s_endpgm.
; No write to s32 or s33 is expected on any target.
27 define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
28 ; GFX803-LABEL: test_kern_stack:
29 ; GFX803: ; %bb.0: ; %entry
30 ; GFX803-NEXT: s_add_u32 s0, s0, s7
31 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
32 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
33 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
34 ; GFX803-NEXT: s_waitcnt vmcnt(0)
35 ; GFX803-NEXT: s_endpgm
37 ; GFX900-LABEL: test_kern_stack:
38 ; GFX900: ; %bb.0: ; %entry
39 ; GFX900-NEXT: s_add_u32 s0, s0, s7
40 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
41 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
42 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
43 ; GFX900-NEXT: s_waitcnt vmcnt(0)
44 ; GFX900-NEXT: s_endpgm
46 ; GFX1010-LABEL: test_kern_stack:
47 ; GFX1010: ; %bb.0: ; %entry
48 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
49 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
50 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
51 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
52 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
53 ; GFX1010-NEXT: s_endpgm
55 ; GFX1100-LABEL: test_kern_stack:
56 ; GFX1100: ; %bb.0: ; %entry
57 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
58 ; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
59 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
60 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
61 ; GFX1100-NEXT: s_endpgm
; Single 4-byte stack object; the volatile store keeps it from being removed.
63 %x = alloca i32, align 4, addrspace(5)
64 store volatile i32 0, i32 addrspace(5)* %x, align 4
; Kernel test_kern_call (attribute group #0): the only visible operation is a
; tail call to @ex; there is no stack object.
; Expected code per the assertions below:
;   - every target writes 0 to s32 before the call;
;   - the callee address is formed with s_getpc_b64 plus the ex@rel32 lo/hi
;     relocations and the call is made with s_swappc_b64;
;   - GFX803 / GFX900 / GFX1010 also set up flat_scratch and s[0:1], and
;     combine v0, v1 << 10 and v2 << 20 into v31; GFX1100 copies v0 to v31.
; No write to s33 is expected on any target.
68 define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
69 ; GFX803-LABEL: test_kern_call:
70 ; GFX803: ; %bb.0: ; %entry
71 ; GFX803-NEXT: s_add_i32 s12, s12, s17
72 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
73 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
74 ; GFX803-NEXT: s_add_u32 s0, s0, s17
75 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
76 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
77 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
78 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
79 ; GFX803-NEXT: s_mov_b32 s13, s15
80 ; GFX803-NEXT: s_mov_b32 s12, s14
81 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
82 ; GFX803-NEXT: s_mov_b32 s14, s16
83 ; GFX803-NEXT: s_mov_b32 s32, 0
84 ; GFX803-NEXT: s_getpc_b64 s[18:19]
85 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
86 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
87 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
88 ; GFX803-NEXT: s_endpgm
90 ; GFX900-LABEL: test_kern_call:
91 ; GFX900: ; %bb.0: ; %entry
92 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
93 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
94 ; GFX900-NEXT: s_add_u32 s0, s0, s17
95 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
96 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
97 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
98 ; GFX900-NEXT: s_mov_b32 s13, s15
99 ; GFX900-NEXT: s_mov_b32 s12, s14
100 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
101 ; GFX900-NEXT: s_mov_b32 s14, s16
102 ; GFX900-NEXT: s_mov_b32 s32, 0
103 ; GFX900-NEXT: s_getpc_b64 s[18:19]
104 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
105 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
106 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
107 ; GFX900-NEXT: s_endpgm
109 ; GFX1010-LABEL: test_kern_call:
110 ; GFX1010: ; %bb.0: ; %entry
111 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
112 ; GFX1010-NEXT: s_mov_b32 s32, 0
113 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
114 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
115 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
116 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
117 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
118 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
119 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
120 ; GFX1010-NEXT: s_mov_b32 s13, s15
121 ; GFX1010-NEXT: s_mov_b32 s12, s14
122 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
123 ; GFX1010-NEXT: s_mov_b32 s14, s16
124 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
125 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
126 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
127 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
128 ; GFX1010-NEXT: s_endpgm
130 ; GFX1100-LABEL: test_kern_call:
131 ; GFX1100: ; %bb.0: ; %entry
132 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
133 ; GFX1100-NEXT: s_mov_b32 s12, s13
134 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
135 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
136 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
137 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
138 ; GFX1100-NEXT: s_mov_b32 s13, s14
139 ; GFX1100-NEXT: s_mov_b32 s14, s15
140 ; GFX1100-NEXT: s_mov_b32 s32, 0
141 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
142 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
143 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
144 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
145 ; GFX1100-NEXT: s_endpgm
; Call to the hidden external function @ex declared at the end of the file.
148 tail call void @ex() #0
; Kernel test_kern_stack_and_call (attribute group #0): combines a 4-byte
; stack object (volatile store of 0) with a tail call to @ex.
; Expected code per the assertions below:
;   - s32 is initialized to a non-zero value before the call: 0x400 on GFX803
;     and GFX900, 0x200 on GFX1010, and 16 on GFX1100
;     (NOTE(review): presumably the frame size scaled by the wavefront size on
;     the buffer-addressed targets and unscaled on GFX1100 - confirm);
;   - the store to the stack object is emitted at offset 4 and waited on
;     before the s_swappc_b64 call.
; No write to s33 is expected on any target.
152 define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
153 ; GFX803-LABEL: test_kern_stack_and_call:
154 ; GFX803: ; %bb.0: ; %entry
155 ; GFX803-NEXT: s_add_i32 s12, s12, s17
156 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
157 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
158 ; GFX803-NEXT: s_add_u32 s0, s0, s17
159 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
160 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
161 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
162 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
163 ; GFX803-NEXT: s_mov_b32 s13, s15
164 ; GFX803-NEXT: s_mov_b32 s12, s14
165 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
166 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
167 ; GFX803-NEXT: s_mov_b32 s14, s16
168 ; GFX803-NEXT: s_movk_i32 s32, 0x400
169 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
170 ; GFX803-NEXT: s_waitcnt vmcnt(0)
171 ; GFX803-NEXT: s_getpc_b64 s[18:19]
172 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
173 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
174 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
175 ; GFX803-NEXT: s_endpgm
177 ; GFX900-LABEL: test_kern_stack_and_call:
178 ; GFX900: ; %bb.0: ; %entry
179 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
180 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
181 ; GFX900-NEXT: s_add_u32 s0, s0, s17
182 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
183 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
184 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
185 ; GFX900-NEXT: s_mov_b32 s13, s15
186 ; GFX900-NEXT: s_mov_b32 s12, s14
187 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
188 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
189 ; GFX900-NEXT: s_mov_b32 s14, s16
190 ; GFX900-NEXT: s_movk_i32 s32, 0x400
191 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
192 ; GFX900-NEXT: s_waitcnt vmcnt(0)
193 ; GFX900-NEXT: s_getpc_b64 s[18:19]
194 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
195 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
196 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
197 ; GFX900-NEXT: s_endpgm
199 ; GFX1010-LABEL: test_kern_stack_and_call:
200 ; GFX1010: ; %bb.0: ; %entry
201 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
202 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
203 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
204 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
205 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
206 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
207 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
208 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
209 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
210 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
211 ; GFX1010-NEXT: s_mov_b32 s13, s15
212 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
213 ; GFX1010-NEXT: s_mov_b32 s12, s14
214 ; GFX1010-NEXT: s_mov_b32 s14, s16
215 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
216 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
217 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
218 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
219 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
220 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
221 ; GFX1010-NEXT: s_endpgm
223 ; GFX1100-LABEL: test_kern_stack_and_call:
224 ; GFX1100: ; %bb.0: ; %entry
225 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
226 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
227 ; GFX1100-NEXT: s_mov_b32 s12, s13
228 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
229 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
230 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
231 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
232 ; GFX1100-NEXT: s_mov_b32 s13, s14
233 ; GFX1100-NEXT: s_mov_b32 s14, s15
234 ; GFX1100-NEXT: s_mov_b32 s32, 16
235 ; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
236 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
237 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
238 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
239 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
240 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
241 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
242 ; GFX1100-NEXT: s_endpgm
; Stack object plus call: the store must be emitted before control is
; transferred to @ex.
245 %x = alloca i32, align 4, addrspace(5)
246 store volatile i32 0, i32 addrspace(5)* %x, align 4
247 tail call void @ex() #0
; Kernel test_force_fp_kern_empty (attribute group #2, which sets
; "frame-pointer"="all"): no stack object and no call is visible. Unlike the
; attribute-#0 variant, every target is expected to emit "s_mov_b32 s33, 0"
; before s_endpgm.
251 define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
252 ; GFX803-LABEL: test_force_fp_kern_empty:
253 ; GFX803: ; %bb.0: ; %entry
254 ; GFX803-NEXT: s_mov_b32 s33, 0
255 ; GFX803-NEXT: s_endpgm
257 ; GFX900-LABEL: test_force_fp_kern_empty:
258 ; GFX900: ; %bb.0: ; %entry
259 ; GFX900-NEXT: s_mov_b32 s33, 0
260 ; GFX900-NEXT: s_endpgm
262 ; GFX1010-LABEL: test_force_fp_kern_empty:
263 ; GFX1010: ; %bb.0: ; %entry
264 ; GFX1010-NEXT: s_mov_b32 s33, 0
265 ; GFX1010-NEXT: s_endpgm
267 ; GFX1100-LABEL: test_force_fp_kern_empty:
268 ; GFX1100: ; %bb.0: ; %entry
269 ; GFX1100-NEXT: s_mov_b32 s33, 0
270 ; GFX1100-NEXT: s_endpgm
; Kernel test_force_fp_kern_stack (attribute group #2, which sets
; "frame-pointer"="all"): one 4-byte stack object receiving a volatile store
; of 0; no call is made.
; Expected code per the assertions below:
;   - every target writes 0 to s33;
;   - the store to the stack object uses s33 as its scalar offset operand
;     together with "offset:4" (buffer_store_dword on GFX803 / GFX900 /
;     GFX1010, scratch_store_b32 on GFX1100).
; No write to s32 is expected on any target.
276 define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
277 ; GFX803-LABEL: test_force_fp_kern_stack:
278 ; GFX803: ; %bb.0: ; %entry
279 ; GFX803-NEXT: s_add_u32 s0, s0, s7
280 ; GFX803-NEXT: s_mov_b32 s33, 0
281 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
282 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
283 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
284 ; GFX803-NEXT: s_waitcnt vmcnt(0)
285 ; GFX803-NEXT: s_endpgm
287 ; GFX900-LABEL: test_force_fp_kern_stack:
288 ; GFX900: ; %bb.0: ; %entry
289 ; GFX900-NEXT: s_add_u32 s0, s0, s7
290 ; GFX900-NEXT: s_mov_b32 s33, 0
291 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
292 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
293 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
294 ; GFX900-NEXT: s_waitcnt vmcnt(0)
295 ; GFX900-NEXT: s_endpgm
297 ; GFX1010-LABEL: test_force_fp_kern_stack:
298 ; GFX1010: ; %bb.0: ; %entry
299 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
300 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
301 ; GFX1010-NEXT: s_mov_b32 s33, 0
302 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
303 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
304 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
305 ; GFX1010-NEXT: s_endpgm
307 ; GFX1100-LABEL: test_force_fp_kern_stack:
308 ; GFX1100: ; %bb.0: ; %entry
309 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
310 ; GFX1100-NEXT: s_mov_b32 s33, 0
311 ; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
312 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
313 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
314 ; GFX1100-NEXT: s_endpgm
; Single 4-byte stack object; the volatile store keeps it from being removed.
316 %x = alloca i32, align 4, addrspace(5)
317 store volatile i32 0, i32 addrspace(5)* %x, align 4
; Kernel test_force_fp_kern_call (attribute group #2, which sets
; "frame-pointer"="all"): the only visible operation is a tail call to @ex;
; there is no stack object.
; Expected code per the assertions below:
;   - every target writes 0 to both s32 and s33 before the call;
;   - the callee address is formed with s_getpc_b64 plus the ex@rel32 lo/hi
;     relocations and the call is made with s_swappc_b64.
; A second, colon-less copy of the GFX1010 assertions used to follow the
; GFX1100 block. FileCheck never evaluated it (a directive needs the ':' after
; its prefix) and it had drifted from the live GFX1010 block, so it was removed.
321 define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
322 ; GFX803-LABEL: test_force_fp_kern_call:
323 ; GFX803: ; %bb.0: ; %entry
324 ; GFX803-NEXT: s_add_i32 s12, s12, s17
325 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
326 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
327 ; GFX803-NEXT: s_add_u32 s0, s0, s17
328 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
329 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
330 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
331 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
332 ; GFX803-NEXT: s_mov_b32 s13, s15
333 ; GFX803-NEXT: s_mov_b32 s12, s14
334 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
335 ; GFX803-NEXT: s_mov_b32 s14, s16
336 ; GFX803-NEXT: s_mov_b32 s32, 0
337 ; GFX803-NEXT: s_mov_b32 s33, 0
338 ; GFX803-NEXT: s_getpc_b64 s[18:19]
339 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
340 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
341 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
342 ; GFX803-NEXT: s_endpgm
344 ; GFX900-LABEL: test_force_fp_kern_call:
345 ; GFX900: ; %bb.0: ; %entry
346 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
347 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
348 ; GFX900-NEXT: s_add_u32 s0, s0, s17
349 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
350 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
351 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
352 ; GFX900-NEXT: s_mov_b32 s13, s15
353 ; GFX900-NEXT: s_mov_b32 s12, s14
354 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
355 ; GFX900-NEXT: s_mov_b32 s14, s16
356 ; GFX900-NEXT: s_mov_b32 s32, 0
357 ; GFX900-NEXT: s_mov_b32 s33, 0
358 ; GFX900-NEXT: s_getpc_b64 s[18:19]
359 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
360 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
361 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
362 ; GFX900-NEXT: s_endpgm
364 ; GFX1010-LABEL: test_force_fp_kern_call:
365 ; GFX1010: ; %bb.0: ; %entry
366 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
367 ; GFX1010-NEXT: s_mov_b32 s32, 0
368 ; GFX1010-NEXT: s_mov_b32 s33, 0
369 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
370 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
371 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
372 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
373 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
374 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
375 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
376 ; GFX1010-NEXT: s_mov_b32 s13, s15
377 ; GFX1010-NEXT: s_mov_b32 s12, s14
378 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
379 ; GFX1010-NEXT: s_mov_b32 s14, s16
380 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
381 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
382 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
383 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
384 ; GFX1010-NEXT: s_endpgm
386 ; GFX1100-LABEL: test_force_fp_kern_call:
387 ; GFX1100: ; %bb.0: ; %entry
388 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
389 ; GFX1100-NEXT: s_mov_b32 s12, s13
390 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
391 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
392 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
393 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
394 ; GFX1100-NEXT: s_mov_b32 s13, s14
395 ; GFX1100-NEXT: s_mov_b32 s14, s15
396 ; GFX1100-NEXT: s_mov_b32 s32, 0
397 ; GFX1100-NEXT: s_mov_b32 s33, 0
398 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
399 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
400 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
401 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
402 ; GFX1100-NEXT: s_endpgm
; Call to the hidden external function @ex declared at the end of the file.
423 tail call void @ex() #2
; Kernel test_force_fp_kern_stack_and_call (attribute group #2, which sets
; "frame-pointer"="all"): combines a 4-byte stack object (volatile store of 0)
; with a tail call to @ex.
; Expected code per the assertions below:
;   - every target writes 0 to s33 and uses s33 as the scalar offset operand
;     of the store to the stack object (with "offset:4");
;   - s32 is initialized to 0x400 on GFX803 and GFX900, 0x200 on GFX1010, and
;     16 on GFX1100 before the s_swappc_b64 call
;     (NOTE(review): presumably the frame size scaled by the wavefront size on
;     the buffer-addressed targets and unscaled on GFX1100 - confirm).
427 define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
428 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
429 ; GFX803: ; %bb.0: ; %entry
430 ; GFX803-NEXT: s_add_i32 s12, s12, s17
431 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
432 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
433 ; GFX803-NEXT: s_add_u32 s0, s0, s17
434 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
435 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
436 ; GFX803-NEXT: s_mov_b32 s33, 0
437 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
438 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
439 ; GFX803-NEXT: s_mov_b32 s13, s15
440 ; GFX803-NEXT: s_mov_b32 s12, s14
441 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
442 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
443 ; GFX803-NEXT: s_mov_b32 s14, s16
444 ; GFX803-NEXT: s_movk_i32 s32, 0x400
445 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
446 ; GFX803-NEXT: s_waitcnt vmcnt(0)
447 ; GFX803-NEXT: s_getpc_b64 s[18:19]
448 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
449 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
450 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
451 ; GFX803-NEXT: s_endpgm
453 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
454 ; GFX900: ; %bb.0: ; %entry
455 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
456 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
457 ; GFX900-NEXT: s_add_u32 s0, s0, s17
458 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
459 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
460 ; GFX900-NEXT: s_mov_b32 s33, 0
461 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
462 ; GFX900-NEXT: s_mov_b32 s13, s15
463 ; GFX900-NEXT: s_mov_b32 s12, s14
464 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
465 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
466 ; GFX900-NEXT: s_mov_b32 s14, s16
467 ; GFX900-NEXT: s_movk_i32 s32, 0x400
468 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
469 ; GFX900-NEXT: s_waitcnt vmcnt(0)
470 ; GFX900-NEXT: s_getpc_b64 s[18:19]
471 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
472 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
473 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
474 ; GFX900-NEXT: s_endpgm
476 ; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
477 ; GFX1010: ; %bb.0: ; %entry
478 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
479 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
480 ; GFX1010-NEXT: s_mov_b32 s33, 0
481 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
482 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
483 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
484 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
485 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
486 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
487 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
488 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
489 ; GFX1010-NEXT: s_mov_b32 s13, s15
490 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
491 ; GFX1010-NEXT: s_mov_b32 s12, s14
492 ; GFX1010-NEXT: s_mov_b32 s14, s16
493 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
494 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
495 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
496 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
497 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
498 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
499 ; GFX1010-NEXT: s_endpgm
501 ; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
502 ; GFX1100: ; %bb.0: ; %entry
503 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
504 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
505 ; GFX1100-NEXT: s_mov_b32 s33, 0
506 ; GFX1100-NEXT: s_mov_b32 s12, s13
507 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
508 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
509 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
510 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
511 ; GFX1100-NEXT: s_mov_b32 s13, s14
512 ; GFX1100-NEXT: s_mov_b32 s14, s15
513 ; GFX1100-NEXT: s_mov_b32 s32, 16
514 ; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
515 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
516 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
517 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
518 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
519 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
520 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
521 ; GFX1100-NEXT: s_endpgm
; Stack object plus call: the store must be emitted before control is
; transferred to @ex.
523 %x = alloca i32, align 4, addrspace(5)
524 store volatile i32 0, i32 addrspace(5)* %x, align 4
525 tail call void @ex() #2
; Kernel test_sgpr_offset_kernel (attribute group #1, which sets
; "amdgpu-num-vgpr"="8"): loads a value from a large stack buffer, keeps it
; live across an inline asm that clobbers v0-v7, and stores it back. The
; assertions below expect the value to be spilled ("Folded Spill") before the
; asm and reloaded ("Folded Reload") after it, with the spill slot addressed
; through a scalar register that holds the slot offset rather than through the
; instruction's immediate offset field.
529 define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
530 ; GFX803-LABEL: test_sgpr_offset_kernel:
531 ; GFX803: ; %bb.0: ; %entry
532 ; GFX803-NEXT: s_add_u32 s0, s0, s7
533 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
534 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
535 ; GFX803-NEXT: s_waitcnt vmcnt(0)
536 ; GFX803-NEXT: s_mov_b32 s4, 0x40000
537 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
538 ; GFX803-NEXT: ;;#ASMSTART
539 ; GFX803-NEXT: ;;#ASMEND
540 ; GFX803-NEXT: s_mov_b32 s4, 0x40000
541 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
542 ; GFX803-NEXT: s_waitcnt vmcnt(0)
543 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
544 ; GFX803-NEXT: s_waitcnt vmcnt(0)
545 ; GFX803-NEXT: s_endpgm
547 ; GFX900-LABEL: test_sgpr_offset_kernel:
548 ; GFX900: ; %bb.0: ; %entry
549 ; GFX900-NEXT: s_add_u32 s0, s0, s7
550 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
551 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
552 ; GFX900-NEXT: s_waitcnt vmcnt(0)
553 ; GFX900-NEXT: s_mov_b32 s4, 0x40000
554 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
555 ; GFX900-NEXT: ;;#ASMSTART
556 ; GFX900-NEXT: ;;#ASMEND
557 ; GFX900-NEXT: s_mov_b32 s4, 0x40000
558 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
559 ; GFX900-NEXT: s_waitcnt vmcnt(0)
560 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
561 ; GFX900-NEXT: s_waitcnt vmcnt(0)
562 ; GFX900-NEXT: s_endpgm
564 ; GFX1010-LABEL: test_sgpr_offset_kernel:
565 ; GFX1010: ; %bb.0: ; %entry
566 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
567 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
568 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000
569 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
570 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
571 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
572 ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
573 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000
574 ; GFX1010-NEXT: ;;#ASMSTART
575 ; GFX1010-NEXT: ;;#ASMEND
576 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
577 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
578 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
579 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
580 ; GFX1010-NEXT: s_endpgm
582 ; GFX1100-LABEL: test_sgpr_offset_kernel:
583 ; GFX1100: ; %bb.0: ; %entry
584 ; GFX1100-NEXT: scratch_load_b32 v0, off, off offset:8 glc dlc
585 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
586 ; GFX1100-NEXT: s_movk_i32 s0, 0x1000
587 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
588 ; GFX1100-NEXT: s_movk_i32 s0, 0x1000
589 ; GFX1100-NEXT: ;;#ASMSTART
590 ; GFX1100-NEXT: ;;#ASMEND
591 ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
592 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
593 ; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:8 dlc
594 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
595 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
596 ; GFX1100-NEXT: s_endpgm
598 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
599 ; fit in the instruction, and has to live in the SGPR offset.
; The 4092-byte alloca starts at scratch offset 4 (element 1 is accessed at
; offset:8 in the assertions above), so it ends at byte 4096, which is the
; offset used for the spill slot (0x1000 on GFX1100).
600 %alloca = alloca i8, i32 4092, align 4, addrspace(5)
601 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
603 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
604 ; 0x40000 / 64 = 4096 (for wave64)
; The autogenerated GFX803 / GFX900 assertions above expect this constant to be
; materialized with "s_mov_b32 s4, 0x40000" and s4 to be the scalar offset of
; both the "Folded Spill" store and the "Folded Reload" load. GFX1010 expects
; 0x20000 in the same role (NOTE(review): presumably 4096 * 32 for wave32 -
; confirm) and GFX1100 expects 0x1000.
607 %a = load volatile i32, i32 addrspace(5)* %aptr
; Clobber v0-v7 so that %a cannot stay in a VGPR across the asm.
610 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
612 %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
613 store volatile i32 %a, i32 addrspace(5)* %outptr
; External callee used by the *_call and *_stack_and_call kernels; only its
; declaration is present in this file.
618 declare hidden void @ex() local_unnamed_addr #0
; Attribute groups:
;   #0 - baseline kernels and the @ex declaration.
;   #1 - used by test_sgpr_offset_kernel; sets "amdgpu-num-vgpr" to 8, the same
;        number of VGPRs (v0-v7) that kernel's inline asm clobbers.
;   #2 - used by the test_force_fp_* kernels; requests a frame pointer in all
;        functions.
620 attributes #0 = { nounwind }
621 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
622 attributes #2 = { nounwind "frame-pointer"="all" }