1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; External i32 constant in addrspace(4). The func_* tests in this file load
; their dynamic alloca element count from it.
6 @gv = external addrspace(4) constant i32
; Kernel variant, align 4: the element count %n is a kernel argument, the
; alloca of %n x i32 lives in addrspace(5), and 0 is stored through the result.
; The checks expect the size to be computed with s_lshl2_add_u32 ..., 15 and
; s_and_b32 ..., -16, shifted left by 6 (GFX9) or 5 (GFX10/GFX11), and then
; added to s32. No further masking of the resulting address is expected.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
8 define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
9 ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
11 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
12 ; GFX9-NEXT: s_add_u32 s0, s0, s15
13 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
14 ; GFX9-NEXT: s_movk_i32 s32, 0x400
15 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
17 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
18 ; GFX9-NEXT: s_and_b32 s4, s4, -16
19 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
20 ; GFX9-NEXT: s_add_u32 s4, s32, s4
21 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
22 ; GFX9-NEXT: s_mov_b32 s33, 0
23 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
26 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
28 ; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
29 ; GFX10-NEXT: s_add_u32 s0, s0, s15
30 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
31 ; GFX10-NEXT: s_movk_i32 s32, 0x200
32 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
33 ; GFX10-NEXT: s_mov_b32 s33, 0
34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
36 ; GFX10-NEXT: s_and_b32 s4, s4, -16
37 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
38 ; GFX10-NEXT: s_add_u32 s4, s32, s4
39 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
40 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
41 ; GFX10-NEXT: s_endpgm
43 ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
45 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
46 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
47 ; GFX11-NEXT: s_mov_b32 s32, 16
48 ; GFX11-NEXT: s_mov_b32 s33, 0
49 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
51 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
52 ; GFX11-NEXT: s_and_b32 s0, s0, -16
53 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
54 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
55 ; GFX11-NEXT: s_add_u32 s0, s32, s0
56 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
57 ; GFX11-NEXT: s_endpgm
; Dynamically sized alloca (%n elements of i32, align 4); the store keeps it live.
58 %alloca = alloca i32, i32 %n, align 4, addrspace(5)
59 store i32 0, ptr addrspace(5) %alloca
; Callable-function variant, align 4: the element count is loaded from @gv
; rather than passed as an argument. The alloca has no explicit alignment.
; The checks expect s33 to be saved to and restored from another SGPR, s32 to
; be bumped on entry (0x400 GFX9, 0x200 GFX10, 16 GFX11) and un-bumped before
; s_setpc_b64, with the allocation address formed by adding the scaled size
; to s32.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
63 define void @func_dynamic_stackalloc_sgpr_align4() {
64 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
66 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67 ; GFX9-NEXT: s_mov_b32 s6, s33
68 ; GFX9-NEXT: s_mov_b32 s33, s32
69 ; GFX9-NEXT: s_addk_i32 s32, 0x400
70 ; GFX9-NEXT: s_getpc_b64 s[4:5]
71 ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
72 ; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
73 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
74 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
75 ; GFX9-NEXT: s_mov_b32 s33, s6
76 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
77 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
78 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
79 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
80 ; GFX9-NEXT: s_and_b32 s4, s4, -16
81 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
82 ; GFX9-NEXT: s_add_u32 s4, s32, s4
83 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
84 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
85 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
86 ; GFX9-NEXT: s_waitcnt vmcnt(0)
87 ; GFX9-NEXT: s_setpc_b64 s[30:31]
89 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
91 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX10-NEXT: s_mov_b32 s6, s33
93 ; GFX10-NEXT: s_mov_b32 s33, s32
94 ; GFX10-NEXT: s_addk_i32 s32, 0x200
95 ; GFX10-NEXT: s_getpc_b64 s[4:5]
96 ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
97 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
98 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
99 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
100 ; GFX10-NEXT: s_mov_b32 s33, s6
101 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
103 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
105 ; GFX10-NEXT: s_and_b32 s4, s4, -16
106 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
107 ; GFX10-NEXT: s_add_u32 s4, s32, s4
108 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
109 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
110 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
111 ; GFX10-NEXT: s_setpc_b64 s[30:31]
113 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
115 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GFX11-NEXT: s_mov_b32 s2, s33
117 ; GFX11-NEXT: s_mov_b32 s33, s32
118 ; GFX11-NEXT: s_add_i32 s32, s32, 16
119 ; GFX11-NEXT: s_getpc_b64 s[0:1]
120 ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
121 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
122 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
123 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
124 ; GFX11-NEXT: s_mov_b32 s33, s2
125 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
127 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
129 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
130 ; GFX11-NEXT: s_and_b32 s0, s0, -16
131 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
132 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
133 ; GFX11-NEXT: s_add_u32 s0, s32, s0
134 ; GFX11-NEXT: s_add_i32 s32, s32, -16
135 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
136 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; The element count comes from the external constant @gv. The alloca carries no
; explicit align here (presumably defaulting to the i32 ABI alignment - confirm
; against the target data layout).
137 %n = load i32, ptr addrspace(4) @gv, align 4
138 %alloca = alloca i32, i32 %n, addrspace(5)
139 store i32 0, ptr addrspace(5) %alloca
; Kernel variant, align 16: same shape as the align4 kernel, but the alloca
; requests align 16. The expected instruction sequences contain no address
; masking after the add to s32, unlike the align32 kernel, which expects an
; extra s_and_b32 on the result.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
143 define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
144 ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
146 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
147 ; GFX9-NEXT: s_add_u32 s0, s0, s15
148 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
149 ; GFX9-NEXT: s_movk_i32 s32, 0x400
150 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
153 ; GFX9-NEXT: s_and_b32 s4, s4, -16
154 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
155 ; GFX9-NEXT: s_add_u32 s4, s32, s4
156 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
157 ; GFX9-NEXT: s_mov_b32 s33, 0
158 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
159 ; GFX9-NEXT: s_endpgm
161 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
163 ; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
164 ; GFX10-NEXT: s_add_u32 s0, s0, s15
165 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
166 ; GFX10-NEXT: s_movk_i32 s32, 0x200
167 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
168 ; GFX10-NEXT: s_mov_b32 s33, 0
169 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
171 ; GFX10-NEXT: s_and_b32 s4, s4, -16
172 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
173 ; GFX10-NEXT: s_add_u32 s4, s32, s4
174 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
175 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
176 ; GFX10-NEXT: s_endpgm
178 ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
180 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
181 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
182 ; GFX11-NEXT: s_mov_b32 s32, 16
183 ; GFX11-NEXT: s_mov_b32 s33, 0
184 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
186 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
187 ; GFX11-NEXT: s_and_b32 s0, s0, -16
188 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
189 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
190 ; GFX11-NEXT: s_add_u32 s0, s32, s0
191 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
192 ; GFX11-NEXT: s_endpgm
; Dynamically sized alloca (%n elements of i32, align 16); the store keeps it live.
193 %alloca = alloca i32, i32 %n, align 16, addrspace(5)
194 store i32 0, ptr addrspace(5) %alloca
; Callable-function variant, align 16: the element count is loaded from @gv
; and the alloca of that many i32s requests align 16, mirroring
; @kernel_dynamic_stackalloc_sgpr_align16.
; The `align 16` used to sit on the load of @gv instead of on the alloca, so
; the alloca under test carried no explicit alignment and the function did not
; exercise what its name says.
; NOTE(review): re-run utils/update_llc_test_checks.py after this change. The
; check lines are expected to stay as they are (the kernel align4 and align16
; variants already share the same expected sequences), but this has not been
; confirmed by running llc.
198 define void @func_dynamic_stackalloc_sgpr_align16() {
199 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
201 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202 ; GFX9-NEXT: s_mov_b32 s6, s33
203 ; GFX9-NEXT: s_mov_b32 s33, s32
204 ; GFX9-NEXT: s_addk_i32 s32, 0x400
205 ; GFX9-NEXT: s_getpc_b64 s[4:5]
206 ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
207 ; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
208 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
209 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
210 ; GFX9-NEXT: s_mov_b32 s33, s6
211 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
213 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
215 ; GFX9-NEXT: s_and_b32 s4, s4, -16
216 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
217 ; GFX9-NEXT: s_add_u32 s4, s32, s4
218 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
219 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
220 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00
221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
222 ; GFX9-NEXT: s_setpc_b64 s[30:31]
224 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
226 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227 ; GFX10-NEXT: s_mov_b32 s6, s33
228 ; GFX10-NEXT: s_mov_b32 s33, s32
229 ; GFX10-NEXT: s_addk_i32 s32, 0x200
230 ; GFX10-NEXT: s_getpc_b64 s[4:5]
231 ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
232 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
233 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
234 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
235 ; GFX10-NEXT: s_mov_b32 s33, s6
236 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
238 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
239 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
240 ; GFX10-NEXT: s_and_b32 s4, s4, -16
241 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
242 ; GFX10-NEXT: s_add_u32 s4, s32, s4
243 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00
244 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
245 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
246 ; GFX10-NEXT: s_setpc_b64 s[30:31]
248 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
250 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251 ; GFX11-NEXT: s_mov_b32 s2, s33
252 ; GFX11-NEXT: s_mov_b32 s33, s32
253 ; GFX11-NEXT: s_add_i32 s32, s32, 16
254 ; GFX11-NEXT: s_getpc_b64 s[0:1]
255 ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
256 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
257 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
258 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
259 ; GFX11-NEXT: s_mov_b32 s33, s2
260 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
261 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
262 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
264 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
265 ; GFX11-NEXT: s_and_b32 s0, s0, -16
266 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
267 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
268 ; GFX11-NEXT: s_add_u32 s0, s32, s0
269 ; GFX11-NEXT: s_add_i32 s32, s32, -16
270 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
271 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; The load of the element count uses align 4, as in the align4 variant; the
; 16-byte alignment under test belongs on the alloca itself.
272 %n = load i32, ptr addrspace(4) @gv, align 4
273 %alloca = alloca i32, i32 %n, align 16, addrspace(5)
274 store i32 0, ptr addrspace(5) %alloca
; Kernel variant, align 32: same shape as the align4/align16 kernels, but the
; alloca requests align 32. Compared with those variants the checks expect a
; larger initial s32 (0x800 GFX9, 0x400 GFX10, 32 GFX11) and one extra
; s_and_b32 on the address after the add to s32 (mask 0xfffff800 on GFX9,
; 0xfffffc00 on GFX10 and GFX11).
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
278 define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
279 ; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
281 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
282 ; GFX9-NEXT: s_add_u32 s0, s0, s15
283 ; GFX9-NEXT: s_addc_u32 s1, s1, 0
284 ; GFX9-NEXT: s_movk_i32 s32, 0x800
285 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
286 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
288 ; GFX9-NEXT: s_and_b32 s4, s4, -16
289 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
290 ; GFX9-NEXT: s_add_u32 s4, s32, s4
291 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
292 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
293 ; GFX9-NEXT: s_mov_b32 s33, 0
294 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
295 ; GFX9-NEXT: s_endpgm
297 ; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
299 ; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0
300 ; GFX10-NEXT: s_add_u32 s0, s0, s15
301 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
302 ; GFX10-NEXT: s_movk_i32 s32, 0x400
303 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
304 ; GFX10-NEXT: s_mov_b32 s33, 0
305 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
307 ; GFX10-NEXT: s_and_b32 s4, s4, -16
308 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
309 ; GFX10-NEXT: s_add_u32 s4, s32, s4
310 ; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00
311 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
312 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
313 ; GFX10-NEXT: s_endpgm
315 ; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
317 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
318 ; GFX11-NEXT: s_mov_b32 s32, 32
319 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
320 ; GFX11-NEXT: s_mov_b32 s33, 0
321 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
322 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
323 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
324 ; GFX11-NEXT: s_and_b32 s0, s0, -16
325 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
326 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
327 ; GFX11-NEXT: s_add_u32 s0, s32, s0
328 ; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00
329 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
330 ; GFX11-NEXT: s_endpgm
; Dynamically sized alloca (%n elements of i32, align 32); the store keeps it live.
331 %alloca = alloca i32, i32 %n, align 32, addrspace(5)
332 store i32 0, ptr addrspace(5) %alloca
; Callable-function variant, align 32: the element count is loaded from @gv
; and the alloca requests align 32. In addition to the extra s_and_b32 on the
; allocated address, the checks expect s33 to be set up from s32 with an
; add-and-mask sequence on entry (0x7c0 / 0xfffff800 on GFX9, 0x3e0 /
; 0xfffffc00 on GFX10, 31 / s_and_not1_b32 ..., 31 on GFX11) and a larger s32
; bump than in the align4/align16 function variants.
; NOTE(review): %out is not referenced by the instructions visible here; it
; looks unused - confirm before relying on it or removing it.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing by hand.
336 define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
337 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
339 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340 ; GFX9-NEXT: s_mov_b32 s6, s33
341 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
342 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
343 ; GFX9-NEXT: s_addk_i32 s32, 0x1000
344 ; GFX9-NEXT: s_getpc_b64 s[4:5]
345 ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
346 ; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
347 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
348 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
349 ; GFX9-NEXT: s_mov_b32 s33, s6
350 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
352 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
354 ; GFX9-NEXT: s_and_b32 s4, s4, -16
355 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6
356 ; GFX9-NEXT: s_add_u32 s4, s32, s4
357 ; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
358 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
359 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
360 ; GFX9-NEXT: s_addk_i32 s32, 0xf000
361 ; GFX9-NEXT: s_waitcnt vmcnt(0)
362 ; GFX9-NEXT: s_setpc_b64 s[30:31]
364 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32:
366 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367 ; GFX10-NEXT: s_mov_b32 s6, s33
368 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0
369 ; GFX10-NEXT: s_addk_i32 s32, 0x800
370 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00
371 ; GFX10-NEXT: s_getpc_b64 s[4:5]
372 ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
373 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
374 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
375 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
376 ; GFX10-NEXT: s_mov_b32 s33, s6
377 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
378 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
379 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
381 ; GFX10-NEXT: s_and_b32 s4, s4, -16
382 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5
383 ; GFX10-NEXT: s_add_u32 s4, s32, s4
384 ; GFX10-NEXT: s_addk_i32 s32, 0xf800
385 ; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00
386 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
387 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
388 ; GFX10-NEXT: s_setpc_b64 s[30:31]
390 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
392 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
393 ; GFX11-NEXT: s_mov_b32 s2, s33
394 ; GFX11-NEXT: s_add_i32 s33, s32, 31
395 ; GFX11-NEXT: s_add_i32 s32, s32, 64
396 ; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
397 ; GFX11-NEXT: s_getpc_b64 s[0:1]
398 ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4
399 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12
400 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
401 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
402 ; GFX11-NEXT: s_mov_b32 s33, s2
403 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
404 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
405 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
406 ; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15
407 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
408 ; GFX11-NEXT: s_and_b32 s0, s0, -16
409 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5
410 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
411 ; GFX11-NEXT: s_add_u32 s0, s32, s0
412 ; GFX11-NEXT: s_addk_i32 s32, 0xffc0
413 ; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00
414 ; GFX11-NEXT: scratch_store_b32 off, v0, s0
415 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; The element count comes from @gv; the alloca itself carries the align 32.
416 %n = load i32, ptr addrspace(4) @gv
417 %alloca = alloca i32, i32 %n, align 32, addrspace(5)
418 store i32 0, ptr addrspace(5) %alloca