; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by the later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: %1 = extractvalue { i1, i64 } %0, 0
; IR: %2 = extractvalue { i1, i64 } %0, 1
; IR: br i1 %1, label %LeafBlock1, label %Flow

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: %6 = extractvalue { i1, i64 } %5, 0
; IR: %7 = extractvalue { i1, i64 } %5, 1
; IR: br i1 %6, label %LeafBlock, label %Flow1

; IR: br label %Flow{{$}}

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: %10 = extractvalue { i1, i64 } %9, 0
; IR: %11 = extractvalue { i1, i64 } %9, 1
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR: store volatile i32 17, ptr addrspace(3) undef

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)

; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:

; GCN-DAG: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: v_cmp_lt_i32_e32 vcc, 1,
; GCN-DAG: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_and_saveexec_b64

; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

; GCN-NEXT: s_andn2_saveexec_b64

; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
; GCN-DAG: v_cmp_ne_u32_e64 [[INV:s\[[0-9]+:[0-9]+\]]], 1,
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
; GCN-DAG: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], [[INV]], exec
; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]

; GCN-DAG: ds_write_b32
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]

; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock

define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)

; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock

; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable

; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: br i1 %6, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: br label %Flow1

; IR: %uniform.cond0 = icmp ne i32 %arg3, 2

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %divergent.cond1.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR: store volatile i32 17, ptr addrspace(3) undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: br i1 %1, label %LeafBlock1, label %Flow

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(

; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %17)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %12)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, ptr addrspace(1) undef
  ret float 1.000000e+00

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, ptr addrspace(3) undef
  ret float 2.000000e+00
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:.LBB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:

; GCN: s_or_b64 exec, exec
; GCN: v_mov_b32_e32 v0, s6
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^.LBB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, ptr addrspace(1) undef
  ret float 1.000000e+00

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, ptr addrspace(3) undef
  ret float 2.000000e+00
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR-NEXT: store volatile i32 9, ptr addrspace(1) undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0: ; preds = %Flow2
; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: indirect.exit1:
; IR: %load = load volatile i32, ptr addrspace(1) undef
; IR: store volatile i32 %load, ptr addrspace(1) undef
; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

indirect.exit1: ; preds = %LeafBlock, %LeafBlock1
  %load = load volatile i32, ptr addrspace(1) undef
  store volatile i32 %load, ptr addrspace(1) undef
  br label %exit1

exit1: ; preds = %indirect.exit1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  switch i32 %tmp16, label %exit1 [
    i32 1, label %LeafBlock
    i32 2, label %LeafBlock1
    i32 3, label %exit0
  ]

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

divergent.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

uniform.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, ptr addrspace(1) undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, ptr addrspace(1) undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, ptr addrspace(1) undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, ptr addrspace(1) undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

divergent.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

uniform.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %6, label %uniform.if, label %Flow2

; IR: Flow: ; preds = %uniform.then, %uniform.if
; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
; IR: br i1 %7, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %5)

define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, ptr addrspace(4) undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, ptr addrspace(4) undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, ptr addrspace(1) undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, ptr addrspace(1) undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

uniform.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

divergent.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64

define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [

  switch i32 undef, label %bb5 [

; Test that there is an extra export inserted after the normal export,
; if the normal export is inside a uniformly reached block and there is
; an infinite loop in the pixel shader.

; IR-LABEL: @uniformly_reached_export

; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]

; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]

; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true)

; IR: DummyReturnBlock:

define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
.entry:
  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
  br i1 %tmp26, label %loop, label %bb27

loop: ; preds = %loop, %.entry
  br label %loop

bb27: ; preds = %.entry
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }