1 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
6 ; Used to emit an always 4 byte instruction. Inline asm always assumes
7 ; each instruction is the maximum size.
8 declare void @llvm.amdgcn.s.sleep(i32) #0
10 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Short-branch case: the checks below show a plain s_cbranch_scc1 to .LBB0_2
; with no s_getpc_b64/s_setpc_b64 sequence, i.e. the forward branch still fits
; in the (artificially shrunk, -amdgpu-s-branch-bits=4) branch offset field.
; NOTE(review): this extraction is elided — the inline-asm payload, basic-block
; labels, and function terminator are not visible in this chunk.
13 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
14 ; GCN-LABEL: uniform_conditional_max_short_forward_branch:
16 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
17 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
18 ; GCN-NEXT: s_cmp_eq_u32 s2, 0
19 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2
20 ; GCN-NEXT: ; %bb.1: ; %bb2
21 ; GCN-NEXT: ;;#ASMSTART
27 ; GCN-NEXT: .LBB0_2: ; %bb3
28 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
29 ; GCN-NEXT: s_mov_b32 s7, 0xf000
30 ; GCN-NEXT: s_mov_b32 s6, -1
31 ; GCN-NEXT: v_mov_b32_e32 v0, s2
32 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
33 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
34 ; GCN-NEXT: s_waitcnt vmcnt(0)
; IR body (elided in this extraction): compare, conditional branch, inline asm
; filler, sleep intrinsic, and the volatile store checked above.
37 %cmp = icmp eq i32 %cnd, 0
38 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
42 call void asm sideeffect
46 call void @llvm.amdgcn.s.sleep(i32 0)
50 store volatile i32 %cnd, ptr addrspace(1) %arg
; Long-branch case: the distance to %bb3 exceeds the shrunk branch range, so
; the checks expect the relaxed form — s_cbranch_scc0 falls through into an
; s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 indirect-jump sequence
; built from the (.LBB1_2-.Lpost_getpc0) label difference.
; NOTE(review): elided extraction — inline-asm payload and terminator missing.
54 define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
55 ; GCN-LABEL: uniform_conditional_min_long_forward_branch:
56 ; GCN: ; %bb.0: ; %bb0
57 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
58 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
59 ; GCN-NEXT: s_cmp_eq_u32 s2, 0
60 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1
61 ; GCN-NEXT: .LBB1_3: ; %bb0
62 ; GCN-NEXT: s_getpc_b64 s[8:9]
63 ; GCN-NEXT: .Lpost_getpc0:
64 ; GCN-NEXT: s_add_u32 s8, s8, (.LBB1_2-.Lpost_getpc0)&4294967295
65 ; GCN-NEXT: s_addc_u32 s9, s9, (.LBB1_2-.Lpost_getpc0)>>32
66 ; GCN-NEXT: s_setpc_b64 s[8:9]
67 ; GCN-NEXT: .LBB1_1: ; %bb2
68 ; GCN-NEXT: ;;#ASMSTART
74 ; GCN-NEXT: .LBB1_2: ; %bb3
75 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
76 ; GCN-NEXT: s_mov_b32 s7, 0xf000
77 ; GCN-NEXT: s_mov_b32 s6, -1
78 ; GCN-NEXT: v_mov_b32_e32 v0, s2
79 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
80 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
81 ; GCN-NEXT: s_waitcnt vmcnt(0)
; IR body (elided): compare, conditional branch, inline asm filler, store.
84 %cmp = icmp eq i32 %cnd, 0
85 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
89 call void asm sideeffect
97 store volatile i32 %cnd, ptr addrspace(1) %arg
101 define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
102 ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
103 ; GCN: ; %bb.0: ; %bb0
104 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
105 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
106 ; GCN-NEXT: v_cmp_eq_f32_e64 s[4:5], s2, 0
107 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
108 ; GCN-NEXT: s_cbranch_vccz .LBB2_1
109 ; GCN-NEXT: .LBB2_3: ; %bb0
110 ; GCN-NEXT: s_getpc_b64 s[8:9]
111 ; GCN-NEXT: .Lpost_getpc1:
112 ; GCN-NEXT: s_add_u32 s8, s8, (.LBB2_2-.Lpost_getpc1)&4294967295
113 ; GCN-NEXT: s_addc_u32 s9, s9, (.LBB2_2-.Lpost_getpc1)>>32
114 ; GCN-NEXT: s_setpc_b64 s[8:9]
115 ; GCN-NEXT: .LBB2_1: ; %bb2
116 ; GCN-NEXT: ;;#ASMSTART
117 ; GCN-NEXT: ; 32 bytes
118 ; GCN-NEXT: v_nop_e64
119 ; GCN-NEXT: v_nop_e64
120 ; GCN-NEXT: v_nop_e64
121 ; GCN-NEXT: v_nop_e64
122 ; GCN-NEXT: ;;#ASMEND
123 ; GCN-NEXT: .LBB2_2: ; %bb3
124 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
125 ; GCN-NEXT: s_mov_b32 s7, 0xf000
126 ; GCN-NEXT: s_mov_b32 s6, -1
127 ; GCN-NEXT: v_mov_b32_e32 v0, s2
128 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
129 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
130 ; GCN-NEXT: s_waitcnt vmcnt(0)
133 %cmp = fcmp oeq float %cnd, 0.0
134 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
137 call void asm sideeffect " ; 32 bytes
145 store volatile float %cnd, ptr addrspace(1) %arg
; Divergent-branch case: the per-lane condition (v_cmp_ne_u32 on a loaded
; value) goes through s_and_saveexec_b64 / s_cbranch_execnz, and the too-far
; side is again relaxed into the s_getpc_b64 + s_setpc_b64 sequence, with exec
; restored (s_or_b64 exec, exec, s[0:1]) at the join block .LBB3_2.
; NOTE(review): elided extraction — IR block labels and terminator missing.
149 define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
150 ; GCN-LABEL: min_long_forward_vbranch:
151 ; GCN: ; %bb.0: ; %bb
152 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
153 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
154 ; GCN-NEXT: v_mov_b32_e32 v1, 0
155 ; GCN-NEXT: s_mov_b32 s3, 0xf000
156 ; GCN-NEXT: s_mov_b32 s2, 0
157 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
158 ; GCN-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
159 ; GCN-NEXT: s_waitcnt vmcnt(0)
160 ; GCN-NEXT: v_mov_b32_e32 v1, s1
161 ; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
162 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
163 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
164 ; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
165 ; GCN-NEXT: s_cbranch_execnz .LBB3_1
166 ; GCN-NEXT: .LBB3_3: ; %bb
167 ; GCN-NEXT: s_getpc_b64 s[4:5]
168 ; GCN-NEXT: .Lpost_getpc2:
169 ; GCN-NEXT: s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295
170 ; GCN-NEXT: s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32
171 ; GCN-NEXT: s_setpc_b64 s[4:5]
172 ; GCN-NEXT: .LBB3_1: ; %bb2
173 ; GCN-NEXT: ;;#ASMSTART
174 ; GCN-NEXT: ; 32 bytes
175 ; GCN-NEXT: v_nop_e64
176 ; GCN-NEXT: v_nop_e64
177 ; GCN-NEXT: v_nop_e64
178 ; GCN-NEXT: v_nop_e64
179 ; GCN-NEXT: ;;#ASMEND
180 ; GCN-NEXT: .LBB3_2: ; %bb3
181 ; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
182 ; GCN-NEXT: s_mov_b32 s0, s2
183 ; GCN-NEXT: s_mov_b32 s1, s2
184 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
185 ; GCN-NEXT: s_waitcnt vmcnt(0)
; IR body (elided): per-lane GEP + volatile load makes the compare divergent.
188 %tid = call i32 @llvm.amdgcn.workitem.id.x()
189 %tid.ext = zext i32 %tid to i64
190 %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
191 %load = load volatile i32, ptr addrspace(1) %gep
192 %cmp = icmp eq i32 %load, 0
193 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
196 call void asm sideeffect " ; 32 bytes
204 store volatile i32 %load, ptr addrspace(1) %gep
; Backward-branch case: the loop back-edge from %bb2 to .LBB4_1 is too far for
; the shrunk branch field, so the checks expect s_cbranch_scc0 to skip over a
; relaxed backward jump (s_getpc_b64 + negative .LBB4_1-.Lpost_getpc3 offset).
; NOTE(review): elided extraction — loop-body asm payload and terminator missing.
208 define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
209 ; GCN-LABEL: long_backward_sbranch:
210 ; GCN: ; %bb.0: ; %bb
211 ; GCN-NEXT: s_mov_b32 s0, 0
212 ; GCN-NEXT: .LBB4_1: ; %bb2
213 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
214 ; GCN-NEXT: s_add_i32 s0, s0, 1
215 ; GCN-NEXT: s_cmp_lt_i32 s0, 10
216 ; GCN-NEXT: ;;#ASMSTART
217 ; GCN-NEXT: v_nop_e64
218 ; GCN-NEXT: v_nop_e64
219 ; GCN-NEXT: v_nop_e64
220 ; GCN-NEXT: ;;#ASMEND
221 ; GCN-NEXT: s_cbranch_scc0 .LBB4_2
222 ; GCN-NEXT: .LBB4_3: ; %bb2
223 ; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1
224 ; GCN-NEXT: s_getpc_b64 s[2:3]
225 ; GCN-NEXT: .Lpost_getpc3:
226 ; GCN-NEXT: s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295
227 ; GCN-NEXT: s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32
228 ; GCN-NEXT: s_setpc_b64 s[2:3]
229 ; GCN-NEXT: .LBB4_2: ; %bb3
; IR loop body (elided): counted loop 0..9 with asm filler padding the distance.
236 %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
238 call void asm sideeffect
242 %inc = add nsw i32 %loop.idx, 1 ; add cost 4
243 %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
244 br i1 %cmp, label %bb2, label %bb3 ; -
250 ; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
251 ; expansion of the conditional branch from %bb to %bb3).
; Combined case: both a conditional branch (out of %bb0) and an unconditional
; branch (out of %bb3) are out of range, so the checks expect two independent
; s_getpc_b64/s_setpc_b64 expansions (.Lpost_getpc5/6) plus the structurizer's
; Flow block driving s_cbranch_vccnz via the s[2:3] mask.
; NOTE(review): elided extraction — inline-asm payload, %bb3/%bb4 IR bodies,
; and the function terminator are not visible in this chunk.
253 define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
254 ; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
255 ; GCN: ; %bb.0: ; %bb0
256 ; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
257 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
258 ; GCN-NEXT: s_cmp_eq_u32 s2, 0
259 ; GCN-NEXT: s_mov_b64 s[2:3], -1
260 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1
261 ; GCN-NEXT: .LBB5_7: ; %bb0
262 ; GCN-NEXT: s_getpc_b64 s[4:5]
263 ; GCN-NEXT: .Lpost_getpc5:
264 ; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_4-.Lpost_getpc5)&4294967295
265 ; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_4-.Lpost_getpc5)>>32
266 ; GCN-NEXT: s_setpc_b64 s[4:5]
267 ; GCN-NEXT: .LBB5_1: ; %Flow
268 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
269 ; GCN-NEXT: s_cbranch_vccnz .LBB5_3
270 ; GCN-NEXT: .LBB5_2: ; %bb2
271 ; GCN-NEXT: s_mov_b32 s3, 0xf000
272 ; GCN-NEXT: s_mov_b32 s2, -1
273 ; GCN-NEXT: v_mov_b32_e32 v0, 17
274 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
275 ; GCN-NEXT: s_waitcnt vmcnt(0)
276 ; GCN-NEXT: .LBB5_3: ; %bb4
277 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
278 ; GCN-NEXT: s_mov_b32 s3, 0xf000
279 ; GCN-NEXT: s_mov_b32 s2, -1
280 ; GCN-NEXT: s_waitcnt expcnt(0)
281 ; GCN-NEXT: v_mov_b32_e32 v0, 63
282 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
283 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
284 ; GCN-NEXT: s_waitcnt vmcnt(0)
286 ; GCN-NEXT: .LBB5_4: ; %bb3
287 ; GCN-NEXT: ;;#ASMSTART
288 ; GCN-NEXT: v_nop_e64
289 ; GCN-NEXT: v_nop_e64
290 ; GCN-NEXT: v_nop_e64
291 ; GCN-NEXT: v_nop_e64
292 ; GCN-NEXT: ;;#ASMEND
293 ; GCN-NEXT: s_mov_b64 vcc, exec
294 ; GCN-NEXT: s_cbranch_execnz .LBB5_5
295 ; GCN-NEXT: .LBB5_9: ; %bb3
296 ; GCN-NEXT: s_getpc_b64 s[4:5]
297 ; GCN-NEXT: .Lpost_getpc6:
298 ; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc6)&4294967295
299 ; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc6)>>32
300 ; GCN-NEXT: s_setpc_b64 s[4:5]
301 ; GCN-NEXT: .LBB5_5: ; %bb3
302 ; GCN-NEXT: s_getpc_b64 s[4:5]
303 ; GCN-NEXT: .Lpost_getpc4:
304 ; GCN-NEXT: s_add_u32 s4, s4, (.LBB5_3-.Lpost_getpc4)&4294967295
305 ; GCN-NEXT: s_addc_u32 s5, s5, (.LBB5_3-.Lpost_getpc4)>>32
306 ; GCN-NEXT: s_setpc_b64 s[4:5]
; IR body (elided): branch on %arg1, store 17 to undef in %bb2, asm filler in
; %bb3, store 63 to %arg in %bb4.
308 %tmp = icmp ne i32 %arg1, 0
309 br i1 %tmp, label %bb2, label %bb3
312 store volatile i32 17, ptr addrspace(1) undef
317 call void asm sideeffect
325 store volatile i32 63, ptr addrspace(1) %arg
; Attribute groups referenced by the kernels and declarations above.
329 attributes #0 = { nounwind }
330 attributes #1 = { nounwind readnone }