1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Configurations covered: default and assumed (1024 bytes) dynamic stack
; object size with MUBUF scratch, each for code object v4 and v5, plus both
; sizes with flat scratch for code object v4.
2 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
3 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
4 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
5 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
6 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
7 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
9 ; FIXME: Generated test checks do not check metadata at the end of the
10 ; function, so this also includes manually added checks.
12 ; Test that we can select a statically sized alloca outside of the entry block.
15 ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
16 ; alignment less than the stack alignment.
; Kernel case, align 4: a [16 x i32] alloca in addrspace(5) that is created
; after the first conditional branch instead of at function entry. Both the
; MUBUF and the flat-scratch expectations below add 0x1000 to s32 at the point
; where the object is used and then write the result back to s32.
17 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 {
18 ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
19 ; MUBUF: ; %bb.0: ; %entry
20 ; MUBUF-NEXT: s_add_u32 s0, s0, s9
21 ; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
22 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
23 ; MUBUF-NEXT: s_movk_i32 s32, 0x400
24 ; MUBUF-NEXT: s_mov_b32 s33, 0
25 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
26 ; MUBUF-NEXT: s_cmp_lg_u32 s8, 0
27 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
28 ; MUBUF-NEXT: ; %bb.1: ; %bb.0
29 ; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
30 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
31 ; MUBUF-NEXT: ; %bb.2: ; %bb.1
32 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
33 ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
34 ; MUBUF-NEXT: s_mov_b32 s32, s6
35 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0
36 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
37 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
38 ; MUBUF-NEXT: s_add_i32 s6, s6, s7
39 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
40 ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
41 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
42 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
43 ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
44 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
45 ; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
46 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
47 ; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
48 ; MUBUF-NEXT: .LBB0_3: ; %bb.2
49 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0
50 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off
51 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
52 ; MUBUF-NEXT: s_endpgm
54 ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
55 ; FLATSCR: ; %bb.0: ; %entry
56 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
57 ; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
58 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
59 ; FLATSCR-NEXT: s_mov_b32 s32, 16
60 ; FLATSCR-NEXT: s_mov_b32 s33, 0
61 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
62 ; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0
63 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
64 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
65 ; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
66 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
67 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1
68 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
69 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
70 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
71 ; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
72 ; FLATSCR-NEXT: s_mov_b32 s32, s2
73 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
74 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3
75 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2
76 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
77 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
78 ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
79 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
80 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, s[0:1]
81 ; FLATSCR-NEXT: .LBB0_3: ; %bb.2
82 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
83 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
84 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
85 ; FLATSCR-NEXT: s_endpgm
; First condition: continue to %bb.0 when %arg.cond0 is zero, otherwise go
; straight to %bb.2.
88 %cond0 = icmp eq i32 %arg.cond0, 0
89 br i1 %cond0, label %bb.0, label %bb.2
; The alloca follows the terminator above, so it is not in the entry block.
92 %alloca = alloca [16 x i32], align 4, addrspace(5)
93 %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
; Second condition: the alloca is only used on the %bb.1 path.
94 %cond1 = icmp eq i32 %arg.cond1, 0
95 br i1 %cond1, label %bb.1, label %bb.2
98 ; Use the alloca outside of the defining block.
99 store i32 0, ptr addrspace(5) %alloca
100 store i32 1, ptr addrspace(5) %gep1
; Variable-index load from the alloca; the result is added to the work-item
; id and stored through %out.
101 %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
102 %load = load i32, ptr addrspace(5) %gep2
103 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104 %add = add i32 %load, %tid
105 store i32 %add, ptr addrspace(1) %out
; Volatile store of zero through an undef global pointer.
109 store volatile i32 0, ptr addrspace(1) undef
112 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
113 ; DEFAULTSIZE: ; ScratchSize: 4112
114 ; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
115 ; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
116 ; DEFAULTSIZE-V5: ; ScratchSize: 16
118 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
119 ; ASSUME1024: ; ScratchSize: 1040
; Kernel case, align 64: same shape as the align-4 kernel but with a single
; condition and an alloca aligned to 64. The expectations below additionally
; mask the bumped s32 value with 0xfffff000 before using it as the address.
121 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
122 ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
123 ; MUBUF: ; %bb.0: ; %entry
124 ; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
125 ; MUBUF-NEXT: s_add_u32 s0, s0, s9
126 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
127 ; MUBUF-NEXT: s_movk_i32 s32, 0x1000
128 ; MUBUF-NEXT: s_mov_b32 s33, 0
129 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
130 ; MUBUF-NEXT: s_cmp_lg_u32 s6, 0
131 ; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
132 ; MUBUF-NEXT: ; %bb.1: ; %bb.0
133 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
134 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
135 ; MUBUF-NEXT: s_lshl_b32 s7, s7, 2
136 ; MUBUF-NEXT: s_mov_b32 s32, s6
137 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0
138 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
139 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1
140 ; MUBUF-NEXT: s_add_i32 s6, s6, s7
141 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
142 ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
143 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6
144 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
145 ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
146 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
147 ; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
148 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
149 ; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
150 ; MUBUF-NEXT: .LBB1_2: ; %bb.1
151 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0
152 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off
153 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
154 ; MUBUF-NEXT: s_endpgm
156 ; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
157 ; FLATSCR: ; %bb.0: ; %entry
158 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
159 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
160 ; FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
161 ; FLATSCR-NEXT: s_mov_b32 s32, 64
162 ; FLATSCR-NEXT: s_mov_b32 s33, 0
163 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
164 ; FLATSCR-NEXT: s_cmp_lg_u32 s2, 0
165 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
166 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
167 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
168 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
169 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
170 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
171 ; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
172 ; FLATSCR-NEXT: s_mov_b32 s32, s2
173 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
174 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3
175 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2
176 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
177 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
178 ; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
179 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
180 ; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
181 ; FLATSCR-NEXT: .LBB1_2: ; %bb.1
182 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
183 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
184 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
185 ; FLATSCR-NEXT: s_endpgm
; Continue to %bb.0 when %arg.cond is zero, otherwise skip to %bb.1.
187 %cond = icmp eq i32 %arg.cond, 0
188 br i1 %cond, label %bb.0, label %bb.1
; The over-aligned alloca follows the terminator above, so it is not in the
; entry block; here it is defined and used on the same path.
191 %alloca = alloca [16 x i32], align 64, addrspace(5)
192 %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
193 store i32 0, ptr addrspace(5) %alloca
194 store i32 1, ptr addrspace(5) %gep1
; Variable-index load from the alloca; the result is added to the work-item
; id and stored through %out.
195 %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
196 %load = load i32, ptr addrspace(5) %gep2
197 %tid = call i32 @llvm.amdgcn.workitem.id.x()
198 %add = add i32 %load, %tid
199 store i32 %add, ptr addrspace(1) %out
; Volatile store of zero through an undef global pointer.
203 store volatile i32 0, ptr addrspace(1) undef
207 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
208 ; DEFAULTSIZE: ; ScratchSize: 4160
209 ; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
210 ; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
211 ; DEFAULTSIZE-V5: ; ScratchSize: 64
213 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
214 ; ASSUME1024: ; ScratchSize: 1088
; Callable-function case, align 4: same IR shape as the align-4 kernel, but in
; a non-kernel function. The expectations below compare the condition
; arguments in VGPRs (v2, v3) and guard the paths by saving and masking exec,
; and the prologue/epilogue adjust s32 by matching amounts.
217 define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
218 ; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
219 ; MUBUF: ; %bb.0: ; %entry
220 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
221 ; MUBUF-NEXT: s_mov_b32 s7, s33
222 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
223 ; MUBUF-NEXT: s_mov_b32 s33, s32
224 ; MUBUF-NEXT: s_addk_i32 s32, 0x400
225 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
226 ; MUBUF-NEXT: s_cbranch_execz .LBB2_3
227 ; MUBUF-NEXT: ; %bb.1: ; %bb.0
228 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
229 ; MUBUF-NEXT: s_and_b64 exec, exec, vcc
230 ; MUBUF-NEXT: s_cbranch_execz .LBB2_3
231 ; MUBUF-NEXT: ; %bb.2: ; %bb.1
232 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
233 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0
234 ; MUBUF-NEXT: v_mov_b32_e32 v3, s6
235 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
236 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
237 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
238 ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
239 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
240 ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
241 ; MUBUF-NEXT: s_mov_b32 s32, s6
242 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
243 ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
244 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off
245 ; MUBUF-NEXT: .LBB2_3: ; %bb.2
246 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
247 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0
248 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off
249 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
250 ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
251 ; MUBUF-NEXT: s_mov_b32 s33, s7
252 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
254 ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
255 ; FLATSCR: ; %bb.0: ; %entry
256 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; FLATSCR-NEXT: s_mov_b32 s3, s33
258 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
259 ; FLATSCR-NEXT: s_mov_b32 s33, s32
260 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16
261 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
262 ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
263 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
264 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
265 ; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
266 ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
267 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1
268 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
269 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
270 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
271 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
272 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
273 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off
274 ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
275 ; FLATSCR-NEXT: s_mov_b32 s32, s2
276 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
277 ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
278 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
279 ; FLATSCR-NEXT: .LBB2_3: ; %bb.2
280 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
281 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
282 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
283 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
284 ; FLATSCR-NEXT: s_add_i32 s32, s32, -16
285 ; FLATSCR-NEXT: s_mov_b32 s33, s3
286 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
; First condition: continue to %bb.0 when %arg.cond0 is zero, otherwise go
; straight to %bb.2.
289 %cond0 = icmp eq i32 %arg.cond0, 0
290 br i1 %cond0, label %bb.0, label %bb.2
; The alloca follows the terminator above, so it is not in the entry block.
293 %alloca = alloca [16 x i32], align 4, addrspace(5)
294 %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
; Second condition: the alloca is only used on the %bb.1 path.
295 %cond1 = icmp eq i32 %arg.cond1, 0
296 br i1 %cond1, label %bb.1, label %bb.2
299 ; Use the alloca outside of the defining block.
300 store i32 0, ptr addrspace(5) %alloca
301 store i32 1, ptr addrspace(5) %gep1
; Variable-index load from the alloca; the result is added to the work-item
; id and stored through %out.
302 %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
303 %load = load i32, ptr addrspace(5) %gep2
304 %tid = call i32 @llvm.amdgcn.workitem.id.x()
305 %add = add i32 %load, %tid
306 store i32 %add, ptr addrspace(1) %out
; Volatile store of zero through an undef global pointer.
310 store volatile i32 0, ptr addrspace(1) undef
; Callable-function case, align 64: single condition and an alloca aligned to
; 64 in a non-kernel function. The expectations below round s33 up in the
; prologue (add then mask), mask the bumped s32 value with 0xfffff000 at the
; point of use, and guard the path by saving and restoring exec.
314 define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
315 ; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
316 ; MUBUF: ; %bb.0: ; %entry
317 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318 ; MUBUF-NEXT: s_mov_b32 s7, s33
319 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0
320 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
321 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000
322 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000
323 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
324 ; MUBUF-NEXT: s_cbranch_execz .LBB3_2
325 ; MUBUF-NEXT: ; %bb.1: ; %bb.0
326 ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
327 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
328 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0
329 ; MUBUF-NEXT: v_mov_b32_e32 v4, s6
330 ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
331 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1
332 ; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
333 ; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
334 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
335 ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
336 ; MUBUF-NEXT: s_mov_b32 s32, s6
337 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
338 ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
339 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off
340 ; MUBUF-NEXT: .LBB3_2: ; %bb.1
341 ; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
342 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0
343 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off
344 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
345 ; MUBUF-NEXT: s_addk_i32 s32, 0xe000
346 ; MUBUF-NEXT: s_mov_b32 s33, s7
347 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
349 ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
350 ; FLATSCR: ; %bb.0: ; %entry
351 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352 ; FLATSCR-NEXT: s_mov_b32 s3, s33
353 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63
354 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
355 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63
356 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80
357 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
358 ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
359 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0
360 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
361 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
362 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
363 ; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
364 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
365 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
366 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off
367 ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
368 ; FLATSCR-NEXT: s_mov_b32 s32, s2
369 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
370 ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
371 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
372 ; FLATSCR-NEXT: .LBB3_2: ; %bb.1
373 ; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
374 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
375 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
376 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
377 ; FLATSCR-NEXT: s_addk_i32 s32, 0xff80
378 ; FLATSCR-NEXT: s_mov_b32 s33, s3
379 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
; Continue to %bb.0 when %arg.cond is zero, otherwise skip to %bb.1.
381 %cond = icmp eq i32 %arg.cond, 0
382 br i1 %cond, label %bb.0, label %bb.1
; The over-aligned alloca follows the terminator above, so it is not in the
; entry block; here it is defined and used on the same path.
385 %alloca = alloca [16 x i32], align 64, addrspace(5)
386 %gep1 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
387 store i32 0, ptr addrspace(5) %alloca
388 store i32 1, ptr addrspace(5) %gep1
; Variable-index load from the alloca; the result is added to the work-item
; id and stored through %out.
389 %gep2 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
390 %load = load i32, ptr addrspace(5) %gep2
391 %tid = call i32 @llvm.amdgcn.workitem.id.x()
392 %add = add i32 %load, %tid
393 store i32 %add, ptr addrspace(1) %out
; Volatile store of zero through an undef global pointer.
397 store volatile i32 0, ptr addrspace(1) undef
; Intrinsic called by every function in this file to obtain the value that is
; added to the loaded element.
401 declare i32 @llvm.amdgcn.workitem.id.x() #0
; #0 is attached to the intrinsic declaration above.
403 attributes #0 = { nounwind readnone speculatable }
; #1 is attached only to the align-4 kernel in this file.
404 attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; The version operand below is a placeholder; the sed command at the start of
; each test invocation replaces it with 400 or 500 before llc reads the file.
406 !llvm.module.flags = !{!0}
407 !0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION}