1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s
; test_loop: the entry block bypasses the loop when %n == -1; otherwise
; %for.body adds 1.0 to the float at %ptr[%indvar + 32] and bumps %indvar.
; The expected code is checked for both llc invocations at the top of the
; file (default optimization level and -O0).
5 define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
6 ; GCN-LABEL: test_loop:
7 ; GCN: ; %bb.0: ; %entry
8 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xa
9 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
10 ; GCN-NEXT: s_cmp_eq_u32 s2, -1
11 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3
12 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader
13 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
14 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
15 ; GCN-NEXT: s_addk_i32 s0, 0x80
16 ; GCN-NEXT: s_and_b64 vcc, exec, -1
17 ; GCN-NEXT: s_mov_b32 m0, -1
18 ; GCN-NEXT: .LBB0_2: ; %for.body
19 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
20 ; GCN-NEXT: v_mov_b32_e32 v0, s0
21 ; GCN-NEXT: ds_read_b32 v1, v0
22 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
23 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
24 ; GCN-NEXT: ds_write_b32 v0, v1
25 ; GCN-NEXT: s_add_i32 s0, s0, 4
26 ; GCN-NEXT: s_mov_b64 vcc, vcc
27 ; GCN-NEXT: s_cbranch_vccnz .LBB0_2
28 ; GCN-NEXT: .LBB0_3: ; %for.exit
31 ; GCN_DBG-LABEL: test_loop:
32 ; GCN_DBG: ; %bb.0: ; %entry
33 ; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
34 ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
35 ; GCN_DBG-NEXT: s_mov_b32 s14, -1
36 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
37 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
38 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
39 ; GCN_DBG-NEXT: ; implicit-def: $vgpr0
40 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
41 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
42 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
43 ; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
44 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
45 ; GCN_DBG-NEXT: s_mov_b32 s2, -1
46 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
47 ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2
48 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
49 ; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec
50 ; GCN_DBG-NEXT: s_mov_b64 exec, -1
51 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
52 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
53 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
54 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
55 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
56 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
57 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
58 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
59 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
60 ; GCN_DBG-NEXT: s_endpgm
61 ; GCN_DBG-NEXT: .LBB0_2: ; %for.body
62 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
63 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
64 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
65 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
66 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
67 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
68 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
69 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
70 ; GCN_DBG-NEXT: s_mov_b32 s1, 2
71 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
72 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
73 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
74 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
75 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
76 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
77 ; GCN_DBG-NEXT: ds_read_b32 v1, v1
78 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
79 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
80 ; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
81 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
82 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
83 ; GCN_DBG-NEXT: ds_write_b32 v1, v2
84 ; GCN_DBG-NEXT: s_mov_b32 s1, 1
85 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
86 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
87 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
88 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
89 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
90 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
91 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
92 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
93 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
94 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
95 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
96 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
97 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
98 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
99 ; GCN_DBG-NEXT: s_endpgm
; Guard: when %n == -1 control goes straight to %for.exit and the loop body
; never runs.
101 %cmp = icmp eq i32 %n, -1
102 br i1 %cmp, label %for.exit, label %for.body
; %indvar is 0 when arriving from %entry and %inc when arriving from the
; back-edge out of %for.body.
108 %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
; The element index is biased by 32 floats; 32 * 4 bytes is the 0x80 byte
; offset that shows up in the expected instructions.
109 %tmp = add i32 %indvar, 32
110 %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
111 %vecload = load float, ptr addrspace(3) %arrayidx, align 4
112 %add = fadd float %vecload, 1.0
; NOTE(review): the store claims align 8 while the load of the same address
; claims align 4 -- confirm the stronger alignment is intentional.
113 store float %add, ptr addrspace(3) %arrayidx, align 8
114 %inc = add i32 %indvar, 1
; loop_const_true: %for.body adds 1.0 to the float at %ptr[%indvar + 32] and
; its back-edge condition is the constant true, so the branch always targets
; %for.body again. The optimized checks expect vcc = exec & -1 feeding
; s_cbranch_vccnz back to the loop header; the -O0 checks still expect a
; separate %for.exit block reached by a conditional branch on vcc = exec & 0.
118 define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind {
119 ; GCN-LABEL: loop_const_true:
120 ; GCN: ; %bb.0: ; %entry
121 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
122 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
123 ; GCN-NEXT: s_addk_i32 s0, 0x80
124 ; GCN-NEXT: s_and_b64 vcc, exec, -1
125 ; GCN-NEXT: s_mov_b32 m0, -1
126 ; GCN-NEXT: .LBB1_1: ; %for.body
127 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
128 ; GCN-NEXT: v_mov_b32_e32 v0, s0
129 ; GCN-NEXT: ds_read_b32 v1, v0
130 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
131 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
132 ; GCN-NEXT: ds_write_b32 v0, v1
133 ; GCN-NEXT: s_add_i32 s0, s0, 4
134 ; GCN-NEXT: s_mov_b64 vcc, vcc
135 ; GCN-NEXT: s_cbranch_vccnz .LBB1_1
136 ; GCN-NEXT: ; %bb.2: ; %DummyReturnBlock
139 ; GCN_DBG-LABEL: loop_const_true:
140 ; GCN_DBG: ; %bb.0: ; %entry
141 ; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
142 ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
143 ; GCN_DBG-NEXT: s_mov_b32 s14, -1
144 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
145 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
146 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
147 ; GCN_DBG-NEXT: ; implicit-def: $vgpr0
148 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
149 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
150 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
151 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
152 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
153 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
154 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
155 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
156 ; GCN_DBG-NEXT: s_branch .LBB1_2
157 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
158 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
159 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
160 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
161 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
162 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
163 ; GCN_DBG-NEXT: s_endpgm
164 ; GCN_DBG-NEXT: .LBB1_2: ; %for.body
165 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
166 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
167 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
168 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
169 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
170 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
171 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
172 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
173 ; GCN_DBG-NEXT: s_mov_b32 s1, 2
174 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
175 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
176 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
177 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
178 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
179 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
180 ; GCN_DBG-NEXT: ds_read_b32 v1, v1
181 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
182 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
183 ; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
184 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
185 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
186 ; GCN_DBG-NEXT: ds_write_b32 v1, v2
187 ; GCN_DBG-NEXT: s_mov_b32 s1, 1
188 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
189 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0
190 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
191 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
192 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
193 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
194 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
195 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
196 ; GCN_DBG-NEXT: s_branch .LBB1_2
; %indvar is 0 when arriving from %entry and %inc on the back-edge.
204 %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
; Index bias of 32 floats (32 * 4 bytes = the 0x80 offset in the checks).
205 %tmp = add i32 %indvar, 32
206 %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
207 %vecload = load float, ptr addrspace(3) %arrayidx, align 4
208 %add = fadd float %vecload, 1.0
; NOTE(review): store alignment (8) differs from the load alignment (4) on
; the same address -- confirm this is intentional.
209 store float %add, ptr addrspace(3) %arrayidx, align 8
210 %inc = add i32 %indvar, 1
; Constant-true condition: the %for.exit successor is listed but never chosen.
211 br i1 true, label %for.body, label %for.exit
; loop_const_false: %for.body adds 1.0 to the float at %ptr[%indvar + 32] and
; its back-edge condition is the constant false, so the branch always leaves
; for %for.exit. The optimized checks expect straight-line code with the
; 128-byte offset folded into ds_read_b32/ds_write_b32 and no loop label; the
; -O0 checks still expect the loop blocks, with vcc = exec & -1 selecting the
; exit.
214 define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind {
215 ; GCN-LABEL: loop_const_false:
216 ; GCN: ; %bb.0: ; %entry
217 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
218 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
219 ; GCN-NEXT: v_mov_b32_e32 v0, s0
220 ; GCN-NEXT: s_mov_b32 m0, -1
221 ; GCN-NEXT: ds_read_b32 v1, v0 offset:128
222 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
223 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
224 ; GCN-NEXT: ds_write_b32 v0, v1 offset:128
227 ; GCN_DBG-LABEL: loop_const_false:
228 ; GCN_DBG: ; %bb.0: ; %entry
229 ; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
230 ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
231 ; GCN_DBG-NEXT: s_mov_b32 s14, -1
232 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
233 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
234 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
235 ; GCN_DBG-NEXT: ; implicit-def: $vgpr0
236 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
237 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
238 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
239 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
240 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
241 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
242 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
243 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
244 ; GCN_DBG-NEXT: s_branch .LBB2_2
245 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
246 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
247 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
248 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
249 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
250 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
251 ; GCN_DBG-NEXT: s_endpgm
252 ; GCN_DBG-NEXT: .LBB2_2: ; %for.body
253 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
254 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
255 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
256 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
257 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
258 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
259 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
260 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
261 ; GCN_DBG-NEXT: s_mov_b32 s1, 2
262 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
263 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
264 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
265 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
266 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
267 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
268 ; GCN_DBG-NEXT: ds_read_b32 v1, v1
269 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
270 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
271 ; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
272 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
273 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
274 ; GCN_DBG-NEXT: ds_write_b32 v1, v2
275 ; GCN_DBG-NEXT: s_mov_b32 s1, 1
276 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
277 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
278 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
279 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
280 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
281 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
282 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
283 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
284 ; GCN_DBG-NEXT: s_branch .LBB2_2
291 ; XXX - Should there be an S_ENDPGM?
; %indvar is 0 when arriving from %entry and %inc on the back-edge.
293 %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
; Index bias of 32 floats (32 * 4 bytes = the 128-byte offset in the checks).
294 %tmp = add i32 %indvar, 32
295 %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
296 %vecload = load float, ptr addrspace(3) %arrayidx, align 4
297 %add = fadd float %vecload, 1.0
; NOTE(review): store alignment (8) differs from the load alignment (4) on
; the same address -- confirm this is intentional.
298 store float %add, ptr addrspace(3) %arrayidx, align 8
299 %inc = add i32 %indvar, 1
; Constant-false condition: the %for.body successor is listed but never chosen.
300 br i1 false, label %for.body, label %for.exit
; loop_const_undef: %for.body adds 1.0 to the float at %ptr[%indvar + 32] and
; its back-edge condition is undef. The optimized checks expect straight-line
; code with the 128-byte offset folded into ds_read_b32/ds_write_b32 and no
; loop label; the -O0 checks expect the loop blocks ending in s_cbranch_scc1
; with no compare emitted inside the loop block.
303 define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind {
304 ; GCN-LABEL: loop_const_undef:
305 ; GCN: ; %bb.0: ; %entry
306 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
307 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
308 ; GCN-NEXT: v_mov_b32_e32 v0, s0
309 ; GCN-NEXT: s_mov_b32 m0, -1
310 ; GCN-NEXT: ds_read_b32 v1, v0 offset:128
311 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
312 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
313 ; GCN-NEXT: ds_write_b32 v0, v1 offset:128
316 ; GCN_DBG-LABEL: loop_const_undef:
317 ; GCN_DBG: ; %bb.0: ; %entry
318 ; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
319 ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
320 ; GCN_DBG-NEXT: s_mov_b32 s14, -1
321 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
322 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
323 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
324 ; GCN_DBG-NEXT: ; implicit-def: $vgpr0
325 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
326 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
327 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
328 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
329 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
330 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
331 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
332 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
333 ; GCN_DBG-NEXT: s_branch .LBB3_2
334 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
335 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
336 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
337 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
338 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
339 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
340 ; GCN_DBG-NEXT: s_endpgm
341 ; GCN_DBG-NEXT: .LBB3_2: ; %for.body
342 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
343 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
344 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
345 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
346 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
347 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
348 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
349 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
350 ; GCN_DBG-NEXT: s_mov_b32 s1, 2
351 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
352 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
353 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
354 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
355 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
356 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
357 ; GCN_DBG-NEXT: ds_read_b32 v1, v1
358 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
359 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
360 ; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
361 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
362 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
363 ; GCN_DBG-NEXT: ds_write_b32 v1, v2
364 ; GCN_DBG-NEXT: s_mov_b32 s1, 1
365 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
366 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
367 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
368 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
369 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
370 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
371 ; GCN_DBG-NEXT: s_branch .LBB3_2
378 ; XXX - Should there be an s_endpgm?
; %indvar is 0 when arriving from %entry and %inc on the back-edge.
380 %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
; Index bias of 32 floats (32 * 4 bytes = the 128-byte offset in the checks).
381 %tmp = add i32 %indvar, 32
382 %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
383 %vecload = load float, ptr addrspace(3) %arrayidx, align 4
384 %add = fadd float %vecload, 1.0
; NOTE(review): store alignment (8) differs from the load alignment (4) on
; the same address -- confirm this is intentional.
385 store float %add, ptr addrspace(3) %arrayidx, align 8
386 %inc = add i32 %indvar, 1
; The branch condition is undef, so either successor may be selected.
387 br i1 undef, label %for.body, label %for.exit
; loop_arg_0: the back-edge condition %cond comes from a volatile i1 load of
; the null addrspace(3) pointer. %for.body adds 1.0 to the float at
; %ptr[%indvar + 32] and repeats while %cond is true. Both sets of checks
; expect the ds_read_u8 for %cond before the loop label, with the inverted
; condition kept in an SGPR pair and tested through vcc on every iteration.
390 define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
391 ; GCN-LABEL: loop_arg_0:
392 ; GCN: ; %bb.0: ; %entry
393 ; GCN-NEXT: v_mov_b32_e32 v0, 0
394 ; GCN-NEXT: s_mov_b32 m0, -1
395 ; GCN-NEXT: ds_read_u8 v0, v0
396 ; GCN-NEXT: s_load_dword s4, s[0:1], 0x9
397 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
398 ; GCN-NEXT: v_readfirstlane_b32 s0, v0
399 ; GCN-NEXT: s_bitcmp1_b32 s0, 0
400 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
401 ; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1
402 ; GCN-NEXT: s_add_i32 s0, s4, 0x80
403 ; GCN-NEXT: s_and_b64 vcc, exec, s[2:3]
404 ; GCN-NEXT: .LBB4_1: ; %for.body
405 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
406 ; GCN-NEXT: v_mov_b32_e32 v0, s0
407 ; GCN-NEXT: ds_read_b32 v1, v0
408 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
409 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
410 ; GCN-NEXT: ds_write_b32 v0, v1
411 ; GCN-NEXT: s_add_i32 s0, s0, 4
412 ; GCN-NEXT: s_mov_b64 vcc, vcc
413 ; GCN-NEXT: s_cbranch_vccz .LBB4_1
414 ; GCN-NEXT: ; %bb.2: ; %for.exit
417 ; GCN_DBG-LABEL: loop_arg_0:
418 ; GCN_DBG: ; %bb.0: ; %entry
419 ; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
420 ; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
421 ; GCN_DBG-NEXT: s_mov_b32 s14, -1
422 ; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
423 ; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
424 ; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
425 ; GCN_DBG-NEXT: ; implicit-def: $vgpr0
426 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
427 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
428 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
429 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0
430 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
431 ; GCN_DBG-NEXT: ds_read_u8 v1, v1
432 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
433 ; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1
434 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0
435 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1
436 ; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0
437 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
438 ; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
439 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
440 ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2
441 ; GCN_DBG-NEXT: s_mov_b32 s0, 0
442 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
443 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
444 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
445 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
446 ; GCN_DBG-NEXT: s_branch .LBB4_2
447 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
448 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
449 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
450 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
451 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
452 ; GCN_DBG-NEXT: ; kill: killed $vgpr0
453 ; GCN_DBG-NEXT: s_endpgm
454 ; GCN_DBG-NEXT: .LBB4_2: ; %for.body
455 ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
456 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
457 ; GCN_DBG-NEXT: s_waitcnt expcnt(0)
458 ; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
459 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
460 ; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
461 ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3
462 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1
463 ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2
464 ; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0
465 ; GCN_DBG-NEXT: s_mov_b32 s1, 2
466 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
467 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
468 ; GCN_DBG-NEXT: s_mov_b32 s4, 0x80
469 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
470 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
471 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
472 ; GCN_DBG-NEXT: ds_read_b32 v1, v1
473 ; GCN_DBG-NEXT: s_mov_b32 s4, 1.0
474 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
475 ; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4
476 ; GCN_DBG-NEXT: s_mov_b32 m0, -1
477 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
478 ; GCN_DBG-NEXT: ds_write_b32 v1, v2
479 ; GCN_DBG-NEXT: s_mov_b32 s1, 1
480 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
481 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
482 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
483 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
484 ; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
485 ; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
486 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
487 ; GCN_DBG-NEXT: s_branch .LBB4_2
; The loop condition is a volatile i1 load from the null addrspace(3) pointer;
; the checks expect it as a ds_read_u8 from address 0.
489 %cond = load volatile i1, ptr addrspace(3) null
; %indvar is 0 when arriving from %entry and %inc on the back-edge.
496 %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
; Index bias of 32 floats (32 * 4 bytes = the 0x80 offset in the checks).
497 %tmp = add i32 %indvar, 32
498 %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
499 %vecload = load float, ptr addrspace(3) %arrayidx, align 4
500 %add = fadd float %vecload, 1.0
; NOTE(review): store alignment (8) differs from the load alignment (4) on
; the same address -- confirm this is intentional.
501 store float %add, ptr addrspace(3) %arrayidx, align 8
502 %inc = add i32 %indvar, 1
; Keep looping while %cond is true; otherwise fall out to %for.exit.
503 br i1 %cond, label %for.body, label %for.exit