1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s
; Kernel with attribute group #0 and, as far as this view shows, no stack
; objects and no calls. The autogenerated checks expect every target
; (GFX803, GFX900, GFX1010, GFX1100) to emit nothing but s_endpgm.
7 define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
8 ; GFX803-LABEL: test_kern_empty:
9 ; GFX803: ; %bb.0: ; %entry
10 ; GFX803-NEXT: s_endpgm
12 ; GFX900-LABEL: test_kern_empty:
13 ; GFX900: ; %bb.0: ; %entry
14 ; GFX900-NEXT: s_endpgm
16 ; GFX1010-LABEL: test_kern_empty:
17 ; GFX1010: ; %bb.0: ; %entry
18 ; GFX1010-NEXT: s_endpgm
20 ; GFX1100-LABEL: test_kern_empty:
21 ; GFX1100: ; %bb.0: ; %entry
22 ; GFX1100-NEXT: s_endpgm
; Kernel whose only visible work is a volatile store of 0 to a private
; (addrspace(5)) i32 alloca; it makes no calls.
; Per the checks: GFX803/GFX900/GFX1010 add s7 into s[0:1] and store through
; the s[0:3] buffer resource at offset:4, while GFX1100 uses scratch_store_b32
; with no such setup.
; NOTE(review): s7 is presumably the scratch wave offset - confirm.
27 define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
28 ; GFX803-LABEL: test_kern_stack:
29 ; GFX803: ; %bb.0: ; %entry
30 ; GFX803-NEXT: s_add_u32 s0, s0, s7
31 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
32 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
33 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
34 ; GFX803-NEXT: s_waitcnt vmcnt(0)
35 ; GFX803-NEXT: s_endpgm
37 ; GFX900-LABEL: test_kern_stack:
38 ; GFX900: ; %bb.0: ; %entry
39 ; GFX900-NEXT: s_add_u32 s0, s0, s7
40 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
41 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
42 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
43 ; GFX900-NEXT: s_waitcnt vmcnt(0)
44 ; GFX900-NEXT: s_endpgm
46 ; GFX1010-LABEL: test_kern_stack:
47 ; GFX1010: ; %bb.0: ; %entry
48 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
49 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
50 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
51 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
52 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
53 ; GFX1010-NEXT: s_endpgm
55 ; GFX1100-LABEL: test_kern_stack:
56 ; GFX1100: ; %bb.0: ; %entry
57 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
58 ; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
59 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
60 ; GFX1100-NEXT: s_endpgm
; One private i32 slot, written with a volatile store so it cannot be removed.
62 %x = alloca i32, align 4, addrspace(5)
63 store volatile i32 0, ptr addrspace(5) %x, align 4
; Kernel whose only visible work is a tail call to @ex (no stack objects).
; Per the checks, every target writes 0 to s32 before the s_swappc_b64 call.
; GFX803/GFX900/GFX1010 additionally set up flat_scratch, add s17 into s[0:1]
; and pack v0 | (v1 << 10) | (v2 << 20) into v31; GFX1100 copies v0 to v31
; and shuffles the incoming SGPRs instead.
; NOTE(review): s32 is presumably the stack pointer register of the callable
; function ABI - confirm against the AMDGPU usage documentation.
67 define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
68 ; GFX803-LABEL: test_kern_call:
69 ; GFX803: ; %bb.0: ; %entry
70 ; GFX803-NEXT: s_add_i32 s12, s12, s17
71 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
72 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
73 ; GFX803-NEXT: s_add_u32 s0, s0, s17
74 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
75 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
76 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
77 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
78 ; GFX803-NEXT: s_mov_b32 s13, s15
79 ; GFX803-NEXT: s_mov_b32 s12, s14
80 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
81 ; GFX803-NEXT: s_mov_b32 s14, s16
82 ; GFX803-NEXT: s_mov_b32 s32, 0
83 ; GFX803-NEXT: s_getpc_b64 s[18:19]
84 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
85 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
86 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
87 ; GFX803-NEXT: s_endpgm
89 ; GFX900-LABEL: test_kern_call:
90 ; GFX900: ; %bb.0: ; %entry
91 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
92 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
93 ; GFX900-NEXT: s_add_u32 s0, s0, s17
94 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
95 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
96 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
97 ; GFX900-NEXT: s_mov_b32 s13, s15
98 ; GFX900-NEXT: s_mov_b32 s12, s14
99 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
100 ; GFX900-NEXT: s_mov_b32 s14, s16
101 ; GFX900-NEXT: s_mov_b32 s32, 0
102 ; GFX900-NEXT: s_getpc_b64 s[18:19]
103 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
104 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
105 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
106 ; GFX900-NEXT: s_endpgm
108 ; GFX1010-LABEL: test_kern_call:
109 ; GFX1010: ; %bb.0: ; %entry
110 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
111 ; GFX1010-NEXT: s_mov_b32 s32, 0
112 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
113 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
114 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
115 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
116 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
117 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
118 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
119 ; GFX1010-NEXT: s_mov_b32 s13, s15
120 ; GFX1010-NEXT: s_mov_b32 s12, s14
121 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
122 ; GFX1010-NEXT: s_mov_b32 s14, s16
123 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
124 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
125 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
126 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
127 ; GFX1010-NEXT: s_endpgm
129 ; GFX1100-LABEL: test_kern_call:
130 ; GFX1100: ; %bb.0: ; %entry
131 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
132 ; GFX1100-NEXT: s_mov_b32 s12, s13
133 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
134 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
135 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
136 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
137 ; GFX1100-NEXT: s_mov_b32 s13, s14
138 ; GFX1100-NEXT: s_mov_b32 s14, s15
139 ; GFX1100-NEXT: s_mov_b32 s32, 0
140 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
141 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
142 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
143 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
144 ; GFX1100-NEXT: s_endpgm
; The call to the hidden external function @ex is the only visible operation.
147 tail call void @ex() #0
; Kernel that both stores to a private (addrspace(5)) alloca and tail-calls
; @ex. Per the checks, s32 is set to a non-zero value before the call:
; 0x400 on GFX803/GFX900, 0x200 on GFX1010 and 16 on GFX1100, and the store
; to the alloca is emitted (and waited on) before the s_swappc_b64.
; NOTE(review): the s32 values look like a 16-byte frame scaled by the wave
; size (x64, x32, unscaled) - confirm.
151 define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
152 ; GFX803-LABEL: test_kern_stack_and_call:
153 ; GFX803: ; %bb.0: ; %entry
154 ; GFX803-NEXT: s_add_i32 s12, s12, s17
155 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
156 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
157 ; GFX803-NEXT: s_add_u32 s0, s0, s17
158 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
159 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
160 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
161 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
162 ; GFX803-NEXT: s_mov_b32 s13, s15
163 ; GFX803-NEXT: s_mov_b32 s12, s14
164 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
165 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
166 ; GFX803-NEXT: s_mov_b32 s14, s16
167 ; GFX803-NEXT: s_movk_i32 s32, 0x400
168 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
169 ; GFX803-NEXT: s_waitcnt vmcnt(0)
170 ; GFX803-NEXT: s_getpc_b64 s[18:19]
171 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
172 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
173 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
174 ; GFX803-NEXT: s_endpgm
176 ; GFX900-LABEL: test_kern_stack_and_call:
177 ; GFX900: ; %bb.0: ; %entry
178 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
179 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
180 ; GFX900-NEXT: s_add_u32 s0, s0, s17
181 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
182 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
183 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
184 ; GFX900-NEXT: s_mov_b32 s13, s15
185 ; GFX900-NEXT: s_mov_b32 s12, s14
186 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
187 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
188 ; GFX900-NEXT: s_mov_b32 s14, s16
189 ; GFX900-NEXT: s_movk_i32 s32, 0x400
190 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
191 ; GFX900-NEXT: s_waitcnt vmcnt(0)
192 ; GFX900-NEXT: s_getpc_b64 s[18:19]
193 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
194 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
195 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
196 ; GFX900-NEXT: s_endpgm
198 ; GFX1010-LABEL: test_kern_stack_and_call:
199 ; GFX1010: ; %bb.0: ; %entry
200 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
201 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
202 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
203 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
204 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
205 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
206 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
207 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
208 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
209 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
210 ; GFX1010-NEXT: s_mov_b32 s13, s15
211 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
212 ; GFX1010-NEXT: s_mov_b32 s12, s14
213 ; GFX1010-NEXT: s_mov_b32 s14, s16
214 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
215 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
216 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
217 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
218 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
219 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
220 ; GFX1010-NEXT: s_endpgm
222 ; GFX1100-LABEL: test_kern_stack_and_call:
223 ; GFX1100: ; %bb.0: ; %entry
224 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
225 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
226 ; GFX1100-NEXT: s_mov_b32 s12, s13
227 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
228 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
229 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
230 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
231 ; GFX1100-NEXT: s_mov_b32 s13, s14
232 ; GFX1100-NEXT: s_mov_b32 s14, s15
233 ; GFX1100-NEXT: s_mov_b32 s32, 16
234 ; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc
235 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
236 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
237 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
238 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
239 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
240 ; GFX1100-NEXT: s_endpgm
; Private stack slot written with a volatile store, then the external call.
243 %x = alloca i32, align 4, addrspace(5)
244 store volatile i32 0, ptr addrspace(5) %x, align 4
245 tail call void @ex() #0
; Kernel with attribute group #2 ("frame-pointer"="all") and, as far as this
; view shows, no stack objects and no calls. The checks expect every target to
; emit s_mov_b32 s33, 0 followed by s_endpgm.
; NOTE(review): s33 is presumably the frame pointer register - confirm.
249 define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
250 ; GFX803-LABEL: test_force_fp_kern_empty:
251 ; GFX803: ; %bb.0: ; %entry
252 ; GFX803-NEXT: s_mov_b32 s33, 0
253 ; GFX803-NEXT: s_endpgm
255 ; GFX900-LABEL: test_force_fp_kern_empty:
256 ; GFX900: ; %bb.0: ; %entry
257 ; GFX900-NEXT: s_mov_b32 s33, 0
258 ; GFX900-NEXT: s_endpgm
260 ; GFX1010-LABEL: test_force_fp_kern_empty:
261 ; GFX1010: ; %bb.0: ; %entry
262 ; GFX1010-NEXT: s_mov_b32 s33, 0
263 ; GFX1010-NEXT: s_endpgm
265 ; GFX1100-LABEL: test_force_fp_kern_empty:
266 ; GFX1100: ; %bb.0: ; %entry
267 ; GFX1100-NEXT: s_mov_b32 s33, 0
268 ; GFX1100-NEXT: s_endpgm
; Kernel with attribute group #2 ("frame-pointer"="all") whose only visible
; work is a volatile store of 0 to a private (addrspace(5)) i32 alloca.
; Per the checks, s33 is set to 0 and then used as the offset operand of the
; store (buffer_store_dword ... s33 offset:4, or scratch_store_b32 ... s33
; offset:4 on GFX1100).
274 define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
275 ; GFX803-LABEL: test_force_fp_kern_stack:
276 ; GFX803: ; %bb.0: ; %entry
277 ; GFX803-NEXT: s_add_u32 s0, s0, s7
278 ; GFX803-NEXT: s_mov_b32 s33, 0
279 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
280 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
281 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
282 ; GFX803-NEXT: s_waitcnt vmcnt(0)
283 ; GFX803-NEXT: s_endpgm
285 ; GFX900-LABEL: test_force_fp_kern_stack:
286 ; GFX900: ; %bb.0: ; %entry
287 ; GFX900-NEXT: s_add_u32 s0, s0, s7
288 ; GFX900-NEXT: s_mov_b32 s33, 0
289 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
290 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
291 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
292 ; GFX900-NEXT: s_waitcnt vmcnt(0)
293 ; GFX900-NEXT: s_endpgm
295 ; GFX1010-LABEL: test_force_fp_kern_stack:
296 ; GFX1010: ; %bb.0: ; %entry
297 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
298 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
299 ; GFX1010-NEXT: s_mov_b32 s33, 0
300 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
301 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
302 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
303 ; GFX1010-NEXT: s_endpgm
305 ; GFX1100-LABEL: test_force_fp_kern_stack:
306 ; GFX1100: ; %bb.0: ; %entry
307 ; GFX1100-NEXT: v_mov_b32_e32 v0, 0
308 ; GFX1100-NEXT: s_mov_b32 s33, 0
309 ; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc
310 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
311 ; GFX1100-NEXT: s_endpgm
; One private i32 slot, written with a volatile store so it cannot be removed.
313 %x = alloca i32, align 4, addrspace(5)
314 store volatile i32 0, ptr addrspace(5) %x, align 4
; Kernel with attribute group #2 ("frame-pointer"="all") whose only visible
; work is a tail call to @ex (no stack objects).
; Per the checks, every target zeroes both s32 and s33 before the
; s_swappc_b64 call.
; NOTE(review): s32/s33 are presumably the stack pointer and frame pointer
; registers - confirm against the AMDGPU usage documentation.
318 define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
319 ; GFX803-LABEL: test_force_fp_kern_call:
320 ; GFX803: ; %bb.0: ; %entry
321 ; GFX803-NEXT: s_add_i32 s12, s12, s17
322 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
323 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
324 ; GFX803-NEXT: s_add_u32 s0, s0, s17
325 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
326 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
327 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
328 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
329 ; GFX803-NEXT: s_mov_b32 s13, s15
330 ; GFX803-NEXT: s_mov_b32 s12, s14
331 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
332 ; GFX803-NEXT: s_mov_b32 s14, s16
333 ; GFX803-NEXT: s_mov_b32 s32, 0
334 ; GFX803-NEXT: s_mov_b32 s33, 0
335 ; GFX803-NEXT: s_getpc_b64 s[18:19]
336 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
337 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
338 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
339 ; GFX803-NEXT: s_endpgm
341 ; GFX900-LABEL: test_force_fp_kern_call:
342 ; GFX900: ; %bb.0: ; %entry
343 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
344 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
345 ; GFX900-NEXT: s_add_u32 s0, s0, s17
346 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
347 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
348 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
349 ; GFX900-NEXT: s_mov_b32 s13, s15
350 ; GFX900-NEXT: s_mov_b32 s12, s14
351 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
352 ; GFX900-NEXT: s_mov_b32 s14, s16
353 ; GFX900-NEXT: s_mov_b32 s32, 0
354 ; GFX900-NEXT: s_mov_b32 s33, 0
355 ; GFX900-NEXT: s_getpc_b64 s[18:19]
356 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
357 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
358 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
359 ; GFX900-NEXT: s_endpgm
361 ; GFX1010-LABEL: test_force_fp_kern_call:
362 ; GFX1010: ; %bb.0: ; %entry
363 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
364 ; GFX1010-NEXT: s_mov_b32 s32, 0
365 ; GFX1010-NEXT: s_mov_b32 s33, 0
366 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
367 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
368 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
369 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
370 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
371 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
372 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
373 ; GFX1010-NEXT: s_mov_b32 s13, s15
374 ; GFX1010-NEXT: s_mov_b32 s12, s14
375 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
376 ; GFX1010-NEXT: s_mov_b32 s14, s16
377 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
378 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
379 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
380 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
381 ; GFX1010-NEXT: s_endpgm
383 ; GFX1100-LABEL: test_force_fp_kern_call:
384 ; GFX1100: ; %bb.0: ; %entry
385 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
386 ; GFX1100-NEXT: s_mov_b32 s12, s13
387 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
388 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
389 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
390 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
391 ; GFX1100-NEXT: s_mov_b32 s13, s14
392 ; GFX1100-NEXT: s_mov_b32 s14, s15
393 ; GFX1100-NEXT: s_mov_b32 s32, 0
394 ; GFX1100-NEXT: s_mov_b32 s33, 0
395 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
396 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
397 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
398 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
399 ; GFX1100-NEXT: s_endpgm
; The call to the hidden external function @ex is the only visible operation.
420 tail call void @ex() #2
; Kernel with attribute group #2 ("frame-pointer"="all") that both stores to
; a private (addrspace(5)) alloca and tail-calls @ex.
; Per the checks, s33 is zeroed and used as the offset operand of the store,
; and s32 is set to 0x400 (GFX803/GFX900), 0x200 (GFX1010) or 16 (GFX1100)
; before the s_swappc_b64 call.
; NOTE(review): the s32 values look like a 16-byte frame scaled by the wave
; size (x64, x32, unscaled) - confirm.
424 define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
425 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
426 ; GFX803: ; %bb.0: ; %entry
427 ; GFX803-NEXT: s_add_i32 s12, s12, s17
428 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
429 ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
430 ; GFX803-NEXT: s_add_u32 s0, s0, s17
431 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
432 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
433 ; GFX803-NEXT: s_mov_b32 s33, 0
434 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
435 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
436 ; GFX803-NEXT: s_mov_b32 s13, s15
437 ; GFX803-NEXT: s_mov_b32 s12, s14
438 ; GFX803-NEXT: v_mov_b32_e32 v3, 0
439 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
440 ; GFX803-NEXT: s_mov_b32 s14, s16
441 ; GFX803-NEXT: s_movk_i32 s32, 0x400
442 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
443 ; GFX803-NEXT: s_waitcnt vmcnt(0)
444 ; GFX803-NEXT: s_getpc_b64 s[18:19]
445 ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
446 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
447 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
448 ; GFX803-NEXT: s_endpgm
450 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
451 ; GFX900: ; %bb.0: ; %entry
452 ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
453 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
454 ; GFX900-NEXT: s_add_u32 s0, s0, s17
455 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
456 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
457 ; GFX900-NEXT: s_mov_b32 s33, 0
458 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
459 ; GFX900-NEXT: s_mov_b32 s13, s15
460 ; GFX900-NEXT: s_mov_b32 s12, s14
461 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
462 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
463 ; GFX900-NEXT: s_mov_b32 s14, s16
464 ; GFX900-NEXT: s_movk_i32 s32, 0x400
465 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
466 ; GFX900-NEXT: s_waitcnt vmcnt(0)
467 ; GFX900-NEXT: s_getpc_b64 s[18:19]
468 ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
469 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
470 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
471 ; GFX900-NEXT: s_endpgm
473 ; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
474 ; GFX1010: ; %bb.0: ; %entry
475 ; GFX1010-NEXT: s_add_u32 s12, s12, s17
476 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
477 ; GFX1010-NEXT: s_mov_b32 s33, 0
478 ; GFX1010-NEXT: s_addc_u32 s13, s13, 0
479 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
480 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
481 ; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
482 ; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
483 ; GFX1010-NEXT: v_mov_b32_e32 v3, 0
484 ; GFX1010-NEXT: s_add_u32 s0, s0, s17
485 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
486 ; GFX1010-NEXT: s_mov_b32 s13, s15
487 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
488 ; GFX1010-NEXT: s_mov_b32 s12, s14
489 ; GFX1010-NEXT: s_mov_b32 s14, s16
490 ; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
491 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
492 ; GFX1010-NEXT: s_getpc_b64 s[18:19]
493 ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
494 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
495 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
496 ; GFX1010-NEXT: s_endpgm
498 ; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
499 ; GFX1100: ; %bb.0: ; %entry
500 ; GFX1100-NEXT: v_mov_b32_e32 v1, 0
501 ; GFX1100-NEXT: v_mov_b32_e32 v31, v0
502 ; GFX1100-NEXT: s_mov_b32 s33, 0
503 ; GFX1100-NEXT: s_mov_b32 s12, s13
504 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7]
505 ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5]
506 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1]
507 ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3]
508 ; GFX1100-NEXT: s_mov_b32 s13, s14
509 ; GFX1100-NEXT: s_mov_b32 s14, s15
510 ; GFX1100-NEXT: s_mov_b32 s32, 16
511 ; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc
512 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
513 ; GFX1100-NEXT: s_getpc_b64 s[16:17]
514 ; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4
515 ; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12
516 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17]
517 ; GFX1100-NEXT: s_endpgm
; Private stack slot written with a volatile store, then the external call.
519 %x = alloca i32, align 4, addrspace(5)
520 store volatile i32 0, ptr addrspace(5) %x, align 4
521 tail call void @ex() #2
; Kernel with attribute group #1 ("amdgpu-num-vgpr"="8"). It loads %a from a
; large private alloca, runs an empty inline asm that clobbers v0-v7, and then
; stores %a back. The checks expect %a to be spilled across the asm
; ("4-byte Folded Spill" / "4-byte Folded Reload") with the spill offset held
; in an SGPR rather than in the instruction's immediate offset field.
525 define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
526 ; GFX803-LABEL: test_sgpr_offset_kernel:
527 ; GFX803: ; %bb.0: ; %entry
528 ; GFX803-NEXT: s_add_u32 s0, s0, s7
529 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
530 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
531 ; GFX803-NEXT: s_waitcnt vmcnt(0)
532 ; GFX803-NEXT: s_mov_b32 s4, 0x40000
533 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
534 ; GFX803-NEXT: ;;#ASMSTART
535 ; GFX803-NEXT: ;;#ASMEND
536 ; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
537 ; GFX803-NEXT: s_waitcnt vmcnt(0)
538 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
539 ; GFX803-NEXT: s_waitcnt vmcnt(0)
540 ; GFX803-NEXT: s_endpgm
542 ; GFX900-LABEL: test_sgpr_offset_kernel:
543 ; GFX900: ; %bb.0: ; %entry
544 ; GFX900-NEXT: s_add_u32 s0, s0, s7
545 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
546 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
547 ; GFX900-NEXT: s_waitcnt vmcnt(0)
548 ; GFX900-NEXT: s_mov_b32 s4, 0x40000
549 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
550 ; GFX900-NEXT: ;;#ASMSTART
551 ; GFX900-NEXT: ;;#ASMEND
552 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
553 ; GFX900-NEXT: s_waitcnt vmcnt(0)
554 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
555 ; GFX900-NEXT: s_waitcnt vmcnt(0)
556 ; GFX900-NEXT: s_endpgm
558 ; GFX1010-LABEL: test_sgpr_offset_kernel:
559 ; GFX1010: ; %bb.0: ; %entry
560 ; GFX1010-NEXT: s_add_u32 s0, s0, s7
561 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
562 ; GFX1010-NEXT: s_mov_b32 s4, 0x20000
563 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
564 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
565 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
566 ; GFX1010-NEXT: ;;#ASMSTART
567 ; GFX1010-NEXT: ;;#ASMEND
568 ; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
569 ; GFX1010-NEXT: s_waitcnt vmcnt(0)
570 ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
571 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
572 ; GFX1010-NEXT: s_endpgm
574 ; GFX1100-LABEL: test_sgpr_offset_kernel:
575 ; GFX1100: ; %bb.0: ; %entry
576 ; GFX1100-NEXT: scratch_load_b32 v0, off, off offset:8 glc dlc
577 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
578 ; GFX1100-NEXT: s_movk_i32 s0, 0x1000
579 ; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
580 ; GFX1100-NEXT: ;;#ASMSTART
581 ; GFX1100-NEXT: ;;#ASMEND
582 ; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
583 ; GFX1100-NEXT: s_waitcnt vmcnt(0)
584 ; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:8 dlc
585 ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
586 ; GFX1100-NEXT: s_endpgm
588 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
589 ; fit in the instruction, and has to live in the SGPR offset.
590 %alloca = alloca i8, i32 4092, align 4, addrspace(5)
592 %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
593 ; 0x40000 / 64 = 4096 (for wave64)
; The GFX1010 assertions use 0x20000 (0x20000 / 32 = 4096, presumably wave32)
; and the GFX1100 assertions use the unscaled byte offset 0x1000. The exact
; SGPR materialization and the spill/reload instructions are pinned by the
; autogenerated per-target assertions at the top of this function.
596 %a = load volatile i32, ptr addrspace(5) %aptr
; Clobber v0-v7 so that, with only 8 VGPRs available (#1), %a cannot stay in a
; register across the asm.
599 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
601 %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
602 store volatile i32 %a, ptr addrspace(5) %outptr
; External callee used by the *_call kernels in this file; only declared here.
607 declare hidden void @ex() local_unnamed_addr #0
; #0: baseline attributes, used by the test_kern_* kernels and by @ex.
609 attributes #0 = { nounwind }
; #1: used by test_sgpr_offset_kernel; requests 8 VGPRs via "amdgpu-num-vgpr".
610 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
; #2: used by the test_force_fp_* kernels; sets "frame-pointer"="all".
611 attributes #2 = { nounwind "frame-pointer"="all" }