; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
; IR: %2 = extractvalue { i1, i64 } %1, 0
; IR: %3 = extractvalue { i1, i64 } %1, 1
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
; IR: %7 = extractvalue { i1, i64 } %6, 0
; IR: %8 = extractvalue { i1, i64 } %6, 1
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: br label %Flow{{$}}

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
; IR: %13 = extractvalue { i1, i64 } %12, 0
; IR: %14 = extractvalue { i1, i64 } %12, 1
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: store volatile i32 17, i32 addrspace(3)* undef

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)

; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:

; GCN-DAG: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: v_cmp_lt_i32_e32 vcc, 1,
; GCN-DAG: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_and_saveexec_b64

; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

; GCN-NEXT: s_or_saveexec_b64

; FIXME: Why is this compare essentially repeated?

; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]

; GCN-DAG: ds_write_b32
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]

; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock

define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)

; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock

; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable

; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2

; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1

; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
; IR: %10 = xor i1 %uniform.cond0, true

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)

define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %14)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]

; GCN: s_or_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 2.0
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %0)

; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %14)

define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0: ; preds = %Flow2
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: indirect.exit1:
; IR: %load = load volatile i32, i32 addrspace(1)* undef
; IR: store volatile i32 %load, i32 addrspace(1)* undef
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %14)

define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

indirect.exit1: ; preds = %LeafBlock, %LeafBlock1
  %load = load volatile i32, i32 addrspace(1)* undef
  store volatile i32 %load, i32 addrspace(1)* undef
  br label %exit1

exit1: ; preds = %indirect.exit1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  switch i32 %tmp16, label %exit1
    [ i32 1, label %LeafBlock
      i32 2, label %LeafBlock1
      i32 3, label %exit0 ]

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region: ; preds = %entry
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0: ; preds = %divergent.multi.exit.region
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1: ; preds = %divergent.multi.exit.region
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret: ; preds = %entry
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region: ; preds = %entry
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if: ; preds = %divergent.multi.exit.region
  %vgpr0 = load volatile float, float addrspace(1)* undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then: ; preds = %divergent.if
  %vgpr1 = load volatile float, float addrspace(1)* undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif: ; preds = %divergent.then, %divergent.if
  store volatile i32 38, i32 addrspace(1)* undef
  br label %divergent.ret0

divergent.ret0: ; preds = %divergent.endif, %divergent.then
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1: ; preds = %divergent.multi.exit.region
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret: ; preds = %entry
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %8, label %uniform.if, label %Flow2

; IR: Flow: ; preds = %uniform.then, %uniform.if
; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
; IR: br i1 %11, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %6)

define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if: ; preds = %uniform.multi.exit.region
  %sgpr0 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then: ; preds = %uniform.if
  %sgpr1 = load volatile i32, i32 addrspace(4)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif: ; preds = %uniform.then, %uniform.if
  store volatile i32 38, i32 addrspace(1)* undef
  br label %uniform.ret0

uniform.ret0: ; preds = %uniform.endif, %uniform.then
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

uniform.ret1: ; preds = %uniform.multi.exit.region
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

divergent.ret: ; preds = %entry
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64

define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [
    i32 0, label %bb1
    i32 1, label %bb2
  ]

bb1: ; preds = %bb
  unreachable

bb2: ; preds = %bb
  unreachable

bb3: ; preds = %bb
  switch i32 undef, label %bb5 [
    i32 0, label %bb4
  ]

bb4: ; preds = %bb3
  unreachable

bb5: ; preds = %bb3
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }