1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefix=MUBUF %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=FLATSCR %s
5 ; Make sure the correct frame offset is used with the local
8 ; %pin.low is allocated to offset 0.
10 ; %local.area is assigned to the local frame offset by the
11 ; LocalStackSlotAllocation pass at offset 4096.
13 ; The %load1 access to %gep.large.offset initially used the stack
14 ; pointer register and directly referenced the frame index. After
15 ; LocalStackSlotAllocation, it would no longer refer to a frame index
16 ; so eliminateFrameIndex would not adjust the access to use the
17 ; correct FP/BP offset.
19 define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
20 ; MUBUF-LABEL: local_stack_offset_uses_sp:
21 ; MUBUF: ; %bb.0: ; %entry
22 ; MUBUF-NEXT: s_add_u32 s0, s0, s15
23 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
24 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
25 ; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1
26 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0
27 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000
28 ; MUBUF-NEXT: s_mov_b32 s4, 0
29 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
30 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
31 ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop
32 ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
33 ; MUBUF-NEXT: v_add_u32_e32 v3, s4, v1
34 ; MUBUF-NEXT: s_add_i32 s4, s4, 1
35 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
36 ; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
37 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
38 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1
39 ; MUBUF-NEXT: ; %bb.2: ; %split
40 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
41 ; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1
42 ; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc
43 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
44 ; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc
45 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
46 ; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
47 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
48 ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc
49 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
50 ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
51 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0
52 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
53 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc
54 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
55 ; MUBUF-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5]
56 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
57 ; MUBUF-NEXT: s_endpgm
59 ; FLATSCR-LABEL: local_stack_offset_uses_sp:
60 ; FLATSCR: ; %bb.0: ; %entry
61 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11
62 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
63 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
64 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
65 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0
66 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
67 ; FLATSCR-NEXT: s_mov_b32 s0, 0
68 ; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop
69 ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
70 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000
71 ; FLATSCR-NEXT: s_add_i32 s0, s0, 1
72 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
73 ; FLATSCR-NEXT: scratch_store_byte off, v0, s1
74 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
75 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
76 ; FLATSCR-NEXT: ; %bb.2: ; %split
77 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
78 ; FLATSCR-NEXT: s_addk_i32 s0, 0x3000
79 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
80 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
81 ; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
82 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc
83 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
84 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
85 ; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
86 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
87 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
88 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
89 ; FLATSCR-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
90 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
91 ; FLATSCR-NEXT: s_endpgm
; NOTE(review): the CHECK lines above are autogenerated by
; update_llc_test_checks.py -- regenerate them instead of editing by hand.
; IR under test: a small, highly-aligned pinned object plus a large
; 4096-aligned local area; volatile loads at one large and one small
; offset into the area exercise frame-offset materialization.
93 %pin.low = alloca i32, align 8192, addrspace(5) ; 8192 alignment pins this object at the bottom of the frame
94 %local.area = alloca [1060 x i64], align 4096, addrspace(5) ; 8480-byte area (1060 x i64)
95 store volatile i32 0, ptr addrspace(5) %pin.low
96 call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true) ; volatile memset -> expanded loadstoreloop in the checks
97 %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050 ; byte offset 1050*8 = 8400 (0x20d0)
98 %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8 ; byte offset 8*8 = 64
99 %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
100 %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
101 %add0 = add i64 %load0, %load1
102 store volatile i64 %add0, ptr addrspace(1) %out
106 define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
107 ; MUBUF-LABEL: func_local_stack_offset_uses_sp:
108 ; MUBUF: ; %bb.0: ; %entry
109 ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110 ; MUBUF-NEXT: s_mov_b32 s5, s33
111 ; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0
112 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000
113 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
114 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3
115 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3
116 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0
117 ; MUBUF-NEXT: v_mov_b32_e32 v5, 0x2000
118 ; MUBUF-NEXT: s_mov_b32 s4, 0
119 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000
120 ; MUBUF-NEXT: buffer_store_dword v4, v5, s[0:3], s33 offen
121 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
122 ; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
123 ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
124 ; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3
125 ; MUBUF-NEXT: s_add_i32 s4, s4, 1
126 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
127 ; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen
128 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
129 ; MUBUF-NEXT: s_cbranch_scc1 .LBB1_1
130 ; MUBUF-NEXT: ; %bb.2: ; %split
131 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
132 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3
133 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3
134 ; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc
135 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
136 ; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc
137 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
138 ; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc
139 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
140 ; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc
141 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
142 ; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000
143 ; MUBUF-NEXT: s_mov_b32 s33, s5
144 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6
145 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
146 ; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
147 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
148 ; MUBUF-NEXT: s_setpc_b64 s[30:31]
150 ; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
151 ; FLATSCR: ; %bb.0: ; %entry
152 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; FLATSCR-NEXT: s_mov_b32 s2, s33
154 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
155 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
156 ; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000
157 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
158 ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x2000
159 ; FLATSCR-NEXT: scratch_store_dword off, v2, s0
160 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
161 ; FLATSCR-NEXT: s_mov_b32 s0, 0
162 ; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
163 ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
164 ; FLATSCR-NEXT: s_add_i32 s3, s33, 0x3000
165 ; FLATSCR-NEXT: s_add_i32 s1, s0, s3
166 ; FLATSCR-NEXT: s_add_i32 s0, s0, 1
167 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
168 ; FLATSCR-NEXT: scratch_store_byte off, v2, s1
169 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
170 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
171 ; FLATSCR-NEXT: ; %bb.2: ; %split
172 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
173 ; FLATSCR-NEXT: s_add_i32 s1, s33, 0x3000
174 ; FLATSCR-NEXT: s_add_i32 s0, s0, s1
175 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
176 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
177 ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
178 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
179 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
180 ; FLATSCR-NEXT: s_addk_i32 s32, 0x8000
181 ; FLATSCR-NEXT: s_mov_b32 s33, s2
182 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
183 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
184 ; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
185 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
186 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
; NOTE(review): the CHECK lines above are autogenerated by
; update_llc_test_checks.py -- regenerate them instead of editing by hand.
; IR under test: same over-aligned-alloca pattern as a callable
; (non-kernel) function; in the checks, frame accesses are based on s33
; after it is rounded up to the required alignment.
188 %pin.low = alloca i32, align 8192, addrspace(5) ; 8192 alignment forces frame over-alignment
189 %local.area = alloca [1060 x i64], align 4096, addrspace(5) ; 8480-byte area (1060 x i64)
190 store volatile i32 0, ptr addrspace(5) %pin.low
191 call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true) ; volatile memset -> expanded loadstoreloop in the checks
192 %gep.large.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 1050 ; byte offset 1050*8 = 8400 (0x20d0)
193 %gep.small.offset = getelementptr inbounds [1060 x i64], ptr addrspace(5) %local.area, i64 0, i64 8 ; byte offset 8*8 = 64
194 %load0 = load volatile i64, ptr addrspace(5) %gep.large.offset
195 %load1 = load volatile i64, ptr addrspace(5) %gep.small.offset
196 %add0 = add i64 %load0, %load1
197 store volatile i64 %add0, ptr addrspace(1) %out
201 define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out) {
202 ; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
203 ; MUBUF: ; %bb.0: ; %entry
204 ; MUBUF-NEXT: s_add_u32 s0, s0, s15
205 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0
206 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
207 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0
208 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000
209 ; MUBUF-NEXT: s_mov_b32 s4, 0
210 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
211 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
212 ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop
213 ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
214 ; MUBUF-NEXT: v_add_u32_e32 v2, s4, v0
215 ; MUBUF-NEXT: s_add_i32 s4, s4, 1
216 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
217 ; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen
218 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
219 ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
220 ; MUBUF-NEXT: ; %bb.2: ; %split
221 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
222 ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d4, v0
223 ; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc
224 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
225 ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d0, v0
226 ; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c0, v0
227 ; MUBUF-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen glc
228 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
229 ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c4, v0
230 ; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc
231 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
232 ; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc
233 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
234 ; MUBUF-NEXT: v_or_b32_e32 v1, 0x12cc, v0
235 ; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c8, v0
236 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
237 ; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
238 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
239 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
240 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
241 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
242 ; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
243 ; MUBUF-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen glc
244 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
245 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
246 ; MUBUF-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:4 glc
247 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
248 ; MUBUF-NEXT: v_mov_b32_e32 v11, 0x4000
249 ; MUBUF-NEXT: buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
250 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
251 ; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000
252 ; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc
253 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
254 ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
255 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
256 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
257 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
258 ; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
259 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
260 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
261 ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8
262 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
263 ; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
264 ; MUBUF-NEXT: v_mov_b32_e32 v12, 0
265 ; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
266 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
267 ; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
268 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
269 ; MUBUF-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5]
270 ; MUBUF-NEXT: s_waitcnt vmcnt(0)
271 ; MUBUF-NEXT: s_endpgm
273 ; FLATSCR-LABEL: local_stack_offset_uses_sp_flat:
274 ; FLATSCR: ; %bb.0: ; %entry
275 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s11
276 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
277 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
278 ; FLATSCR-NEXT: s_mov_b32 s0, 0
279 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024
280 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
281 ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop
282 ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
283 ; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000
284 ; FLATSCR-NEXT: s_add_i32 s0, s0, 1
285 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
286 ; FLATSCR-NEXT: scratch_store_byte off, v0, s1
287 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
288 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
289 ; FLATSCR-NEXT: ; %bb.2: ; %split
290 ; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
291 ; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
292 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
293 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
294 ; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
295 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
296 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
297 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc
298 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
299 ; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc
300 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
301 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
302 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0
303 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
304 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
305 ; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
306 ; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
307 ; FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, v8, v10
308 ; FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v11, vcc
309 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
310 ; FLATSCR-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16
311 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
312 ; FLATSCR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
313 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
314 ; FLATSCR-NEXT: s_endpgm
; NOTE(review): the CHECK lines above are autogenerated by
; update_llc_test_checks.py -- regenerate them instead of editing by hand.
; IR under test: <3 x i64> loads from a large-offset GEP and from the
; base of an 8192-aligned local area. The element allocation appears to
; be padded to 32 bytes (0x12c0 = 150*32 in the checks) -- TODO confirm
; against the target data layout.
316 %pin.low = alloca i32, align 1024, addrspace(5)
317 %local.area = alloca [160 x <3 x i64>], align 8192, addrspace(5)
318 store volatile i32 0, ptr addrspace(5) %pin.low
319 call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %local.area, i8 0, i32 8480, i1 true) ; volatile memset -> expanded loadstoreloop in the checks
320 %gep.large.offset = getelementptr inbounds [160 x <3 x i64>], ptr addrspace(5) %local.area, i64 0, i64 150 ; byte offset 150*32 = 4800 (0x12c0)
321 %load0 = load volatile <3 x i64>, ptr addrspace(5) %gep.large.offset
322 %load1 = load volatile <3 x i64>, ptr addrspace(5) %local.area
323 %add0 = add <3 x i64> %load0, %load1
324 store volatile <3 x i64> %add0, ptr addrspace(1) %out
; Intrinsic used by the tests above to bulk-initialize the local areas;
; the volatile flag (i1 true) forces the expanded byte-store loop seen in
; the checks instead of a library call.
328 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
330 attributes #0 = { argmemonly nounwind willreturn writeonly }