llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

   1 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
   2 ; RUN: llc -march=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s
   3
   4 ; Add an extra verifier runs. There were some cases where invalid IR
   5 ; was produced but happened to be fixed by the later passes.
   6
   7 ; Make sure divergent control flow with multiple exits from a region
   8 ; is properly handled. UnifyFunctionExitNodes should be run before
   9 ; StructurizeCFG.
  10
  11 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
  12 ; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
  13 ; IR: %1 = extractvalue { i1, i64 } %0, 0
  14 ; IR: %2 = extractvalue { i1, i64 } %0, 1
  15 ; IR: br i1 %1, label %LeafBlock1, label %Flow
  16
  17 ; IR: Flow:
  18 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
  19 ; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
  20 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
  21 ; IR: %6 = extractvalue { i1, i64 } %5, 0
  22 ; IR: %7 = extractvalue { i1, i64 } %5, 1
  23 ; IR: br i1 %6, label %LeafBlock, label %Flow1
  24
  25 ; IR: LeafBlock:
  26 ; IR: br label %Flow1
  27
  28 ; IR: LeafBlock1:
  29 ; IR: br label %Flow{{$}}
  30
  31 ; IR:  Flow2:
  32 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
  33 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
  34 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
  35 ; IR: %10 = extractvalue { i1, i64 } %9, 0
  36 ; IR: %11 = extractvalue { i1, i64 } %9, 1
  37 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
  38
  39 ; IR: exit0:
  40 ; IR: store volatile i32 9, i32 addrspace(1)* undef
  41 ; IR: br label %UnifiedReturnBlock
  42
  43 ; IR: Flow1:
  44 ; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
  45 ; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
  46 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
  47 ; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
  48 ; IR: %15 = extractvalue { i1, i64 } %14, 0
  49 ; IR: %16 = extractvalue { i1, i64 } %14, 1
  50 ; IR: br i1 %15, label %exit1, label %Flow2
  51
  52 ; IR: exit1:
  53 ; IR: store volatile i32 17, i32 addrspace(3)* undef
  54 ; IR:  br label %Flow2
  55
  56 ; IR: UnifiedReturnBlock:
  57 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
  58 ; IR: ret void
  59
  60
  61 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
  62
  63 ; GCN-DAG:  s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
  64 ; GCN-DAG:  v_cmp_lt_i32_e32    vcc, 1,
  65 ; GCN-DAG:  s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
  66 ; GCN-DAG:  s_and_saveexec_b64
  67 ; GCN-DAG:  s_xor_b64
  68
  69 ; GCN: ; %LeafBlock1
  70 ; GCN-NEXT: s_mov_b64           [[EXIT0]], exec
  71 ; GCN-NEXT: v_cmp_ne_u32_e32    vcc, 2,
  72 ; GCN-NEXT: s_and_b64           [[EXIT1]], vcc, exec
  73
  74 ; GCN: ; %Flow
  75 ; GCN-NEXT: s_or_saveexec_b64
  76 ; GCN-NEXT: s_xor_b64
  77
  78 ; FIXME: Why is this compare essentially repeated?
  79 ; GCN: ; %LeafBlock
  80 ; GCN-DAG:  v_cmp_eq_u32_e32    vcc, 1,
  81 ; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
  82 ; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
  83 ; GCN-DAG:  s_andn2_b64         [[EXIT1]], [[EXIT1]], exec
  84 ; GCN-DAG:  s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
  85 ; GCN-DAG:  s_and_b64           [[TMP1]], [[TMP1]], exec
  86 ; GCN-DAG:  s_or_b64            [[EXIT0]], [[EXIT0]], [[TMP0]]
  87 ; GCN-DAG:  s_or_b64            [[EXIT1]], [[EXIT1]], [[TMP1]]
  88
  89 ; GCN: ; %Flow4
  90 ; GCN-NEXT: s_or_b64            exec, exec,
  91 ; GCN-NEXT: s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
  92 ; GCN-NEXT: s_xor_b64
  93
  94 ; GCN: ; %exit1
  95 ; GCN-DAG:  ds_write_b32
  96 ; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
  97
  98 ; GCN: ; %Flow5
  99 ; GCN-NEXT: s_or_b64            exec, exec,
 100 ; GCN-NEXT; s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
 101
 102 ; GCN: ; %exit0
 103 ; GCN:      buffer_store_dword
 104
 105 ; GCN: ; %UnifiedReturnBlock
 106 ; GCN-NEXT: s_endpgm
 107 define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 108 entry:
 109   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 110   %tmp1 = add i32 0, %tmp
 111   %tmp2 = zext i32 %tmp1 to i64
 112   %tmp3 = add i64 0, %tmp2
 113   %tmp4 = shl i64 %tmp3, 32
 114   %tmp5 = ashr exact i64 %tmp4, 32
 115   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 116   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 117   %tmp8 = sext i32 %tmp7 to i64
 118   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 119   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 120   %tmp13 = zext i32 %tmp10 to i64
 121   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 122   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 123   %Pivot = icmp slt i32 %tmp16, 2
 124   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 125
 126 LeafBlock:                                        ; preds = %entry
 127   %SwitchLeaf = icmp eq i32 %tmp16, 1
 128   br i1 %SwitchLeaf, label %exit0, label %exit1
 129
 130 LeafBlock1:                                       ; preds = %entry
 131   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 132   br i1 %SwitchLeaf2, label %exit0, label %exit1
 133
 134 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 135   store volatile i32 9, i32 addrspace(1)* undef
 136   ret void
 137
 138 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 139   store volatile i32 17, i32 addrspace(3)* undef
 140   ret void
 141 }
 142
 143 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
 144 ; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
 145
 146 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 147
 148 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 149 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 150 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 151 ; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock
 152
 153
 154 ; IR: UnifiedUnreachableBlock:
 155 ; IR-NEXT: unreachable
 156
 157
 158 ; FIXME: Probably should insert an s_endpgm anyway.
 159 ; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
 160 ; GCN: ; %UnifiedUnreachableBlock
 161 ; GCN-NEXT: .Lfunc_end
 162 define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 163 entry:
 164   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 165   %tmp1 = add i32 0, %tmp
 166   %tmp2 = zext i32 %tmp1 to i64
 167   %tmp3 = add i64 0, %tmp2
 168   %tmp4 = shl i64 %tmp3, 32
 169   %tmp5 = ashr exact i64 %tmp4, 32
 170   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 171   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 172   %tmp8 = sext i32 %tmp7 to i64
 173   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 174   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 175   %tmp13 = zext i32 %tmp10 to i64
 176   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 177   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 178   %Pivot = icmp slt i32 %tmp16, 2
 179   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 180
 181 LeafBlock:                                        ; preds = %entry
 182   %SwitchLeaf = icmp eq i32 %tmp16, 1
 183   br i1 %SwitchLeaf, label %exit0, label %exit1
 184
 185 LeafBlock1:                                       ; preds = %entry
 186   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 187   br i1 %SwitchLeaf2, label %exit0, label %exit1
 188
 189 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 190   store volatile i32 9, i32 addrspace(1)* undef
 191   unreachable
 192
 193 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 194   store volatile i32 17, i32 addrspace(3)* undef
 195   unreachable
 196 }
 197
 198 ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
 199 ; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
 200 ; IR: llvm.amdgcn.if
 201 ; IR: br i1
 202
 203 ; IR: {{^}}Flow:
 204 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 205 ; IR: %4 = phi i1 [ %uniform.cond0.inv, %LeafBlock1 ], [ false, %entry ]
 206 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 207 ; IR: br i1 %6, label %LeafBlock, label %Flow1
 208
 209 ; IR: {{^}}LeafBlock:
 210 ; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
 211 ; IR: %divergent.cond1.inv = xor i1 %divergent.cond1, true
 212 ; IR: br label %Flow1
 213
 214 ; IR: LeafBlock1:
 215 ; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
 216 ; IR: %uniform.cond0.inv = xor i1 %uniform.cond0, true
 217 ; IR: br label %Flow
 218
 219 ; IR: Flow2:
 220 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 221 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 222 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 223 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 224
 225 ; IR: exit0:
 226 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 227 ; IR: br label %UnifiedReturnBlock
 228
 229 ; IR: {{^}}Flow1:
 230 ; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
 231 ; IR: %13 = phi i1 [ %divergent.cond1.inv, %LeafBlock ], [ %4, %Flow ]
 232 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
 233 ; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
 234 ; IR: %15 = extractvalue { i1, i64 } %14, 0
 235 ; IR: %16 = extractvalue { i1, i64 } %14, 1
 236 ; IR: br i1 %15, label %exit1, label %Flow2
 237
 238 ; IR: exit1:
 239 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 240 ; IR: br label %Flow2
 241
 242 ; IR: UnifiedReturnBlock:
 243 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 244 ; IR: ret void
 245 define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 246 entry:
 247   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 248   %tmp1 = add i32 0, %tmp
 249   %tmp2 = zext i32 %tmp1 to i64
 250   %tmp3 = add i64 0, %tmp2
 251   %tmp4 = shl i64 %tmp3, 32
 252   %tmp5 = ashr exact i64 %tmp4, 32
 253   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 254   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 255   %tmp8 = sext i32 %tmp7 to i64
 256   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 257   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 258   %tmp13 = zext i32 %tmp10 to i64
 259   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 260   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 261   %divergent.cond0 = icmp slt i32 %tmp16, 2
 262   br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
 263
 264 LeafBlock:                                        ; preds = %entry
 265   %divergent.cond1 = icmp eq i32 %tmp16, 1
 266   br i1 %divergent.cond1, label %exit0, label %exit1
 267
 268 LeafBlock1:                                       ; preds = %entry
 269   %uniform.cond0 = icmp eq i32 %arg3, 2
 270   br i1 %uniform.cond0, label %exit0, label %exit1
 271
 272 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 273   store volatile i32 9, i32 addrspace(1)* undef
 274   ret void
 275
 276 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 277   store volatile i32 17, i32 addrspace(3)* undef
 278   ret void
 279 }
 280
 281 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
 282 ; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
 283 ; IR: br i1 %1, label %LeafBlock1, label %Flow
 284
 285 ; IR: Flow:
 286 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 287 ; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
 288 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 289
 290 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 291 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 292 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 293
 294 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 295 entry:
 296   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 297   %tmp1 = add i32 0, %tmp
 298   %tmp2 = zext i32 %tmp1 to i64
 299   %tmp3 = add i64 0, %tmp2
 300   %tmp4 = shl i64 %tmp3, 32
 301   %tmp5 = ashr exact i64 %tmp4, 32
 302   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 303   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 304   %tmp8 = sext i32 %tmp7 to i64
 305   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 306   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 307   %tmp13 = zext i32 %tmp10 to i64
 308   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 309   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 310   %Pivot = icmp slt i32 %tmp16, 2
 311   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 312
 313 LeafBlock:                                        ; preds = %entry
 314   %SwitchLeaf = icmp eq i32 %arg3, 1
 315   br i1 %SwitchLeaf, label %exit0, label %exit1
 316
 317 LeafBlock1:                                       ; preds = %entry
 318   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 319   br i1 %SwitchLeaf2, label %exit0, label %exit1
 320
 321 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 322   store volatile i32 9, i32 addrspace(1)* undef
 323   ret void
 324
 325 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 326   store volatile i32 17, i32 addrspace(3)* undef
 327   ret void
 328 }
 329
 330 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 331 ; IR: Flow2:
 332 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 333 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 334
 335 ; IR: UnifiedReturnBlock:
 336 ; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ]
 337 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 338 ; IR: ret float %UnifiedRetVal
 339 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 340 entry:
 341   %Pivot = icmp slt i32 %vgpr, 2
 342   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 343
 344 LeafBlock:                                        ; preds = %entry
 345   %SwitchLeaf = icmp eq i32 %vgpr, 1
 346   br i1 %SwitchLeaf, label %exit0, label %exit1
 347
 348 LeafBlock1:                                       ; preds = %entry
 349   %SwitchLeaf2 = icmp eq i32 %vgpr, 2
 350   br i1 %SwitchLeaf2, label %exit0, label %exit1
 351
 352 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 353   store i32 9, i32 addrspace(1)* undef
 354   ret float 1.0
 355
 356 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 357   store i32 17, i32 addrspace(3)* undef
 358   ret float 2.0
 359 }
 360
 361 ; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
 362
 363 ; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
 364 ; GCN: s_cmp_gt_i32 s0, 1
 365 ; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
 366
 367 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
 368
 369 ; GCN: {{^}}[[FLOW]]:
 370
 371 ; GCN: s_or_b64 exec, exec
 372 ; GCN: v_mov_b32_e32 v0, 2.0
 373 ; GCN-NOT: s_and_b64 exec, exec
 374 ; GCN: v_mov_b32_e32 v0, 1.0
 375
 376 ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
 377 ; GCN-NEXT: s_or_b64 exec, exec
 378 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 379 ; GCN-NEXT: ; return
 380
 381 define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
 382 entry:
 383   %uniform.cond = icmp slt i32 %sgpr, 2
 384   br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
 385
 386 LeafBlock:                                        ; preds = %entry
 387   %divergent.cond0 = icmp eq i32 %vgpr, 3
 388   br i1 %divergent.cond0, label %exit0, label %exit1
 389
 390 LeafBlock1:                                       ; preds = %entry
 391   %divergent.cond1 = icmp eq i32 %vgpr, 7
 392   br i1 %divergent.cond1, label %exit0, label %exit1
 393
 394 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 395   store i32 9, i32 addrspace(1)* undef
 396   ret float 1.0
 397
 398 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 399   store i32 17, i32 addrspace(3)* undef
 400   ret float 2.0
 401 }
 402
 403 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
 404 ; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot.inv)
 405
 406 ; IR: Flow:
 407 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 408 ; IR: %4 = phi i1 [ %SwitchLeaf2.inv, %LeafBlock1 ], [ false, %entry ]
 409 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
 410
 411 ; IR: Flow2:
 412 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 413 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
 414 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
 415 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 416
 417 ; IR: exit0:
 418 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 419 ; IR-NEXT: br label %UnifiedReturnBlock
 420
 421 ; IR: Flow1:
 422 ; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
 423 ; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
 424 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
 425 ; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
 426 ; IR: %15 = extractvalue { i1, i64 } %14, 0
 427 ; IR: %16 = extractvalue { i1, i64 } %14, 1
 428 ; IR: br i1 %15, label %exit1, label %Flow2
 429
 430 ; IR: exit1:
 431 ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
 432 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
 433 ; IR-NEXT: br label %Flow2
 434
 435 ; IR: UnifiedReturnBlock:
 436 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 437 ; IR-NEXT: ret void
 438 define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 439 entry:
 440   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 441   %tmp1 = add i32 0, %tmp
 442   %tmp2 = zext i32 %tmp1 to i64
 443   %tmp3 = add i64 0, %tmp2
 444   %tmp4 = shl i64 %tmp3, 32
 445   %tmp5 = ashr exact i64 %tmp4, 32
 446   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 447   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 448   %tmp8 = sext i32 %tmp7 to i64
 449   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 450   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 451   %tmp13 = zext i32 %tmp10 to i64
 452   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 453   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 454   %Pivot = icmp slt i32 %tmp16, 2
 455   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 456
 457 LeafBlock:                                        ; preds = %entry
 458   %SwitchLeaf = icmp eq i32 %tmp16, 1
 459   br i1 %SwitchLeaf, label %exit0, label %exit1
 460
 461 LeafBlock1:                                       ; preds = %entry
 462   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 463   br i1 %SwitchLeaf2, label %exit0, label %exit1
 464
 465 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 466   store volatile i32 17, i32 addrspace(3)* undef
 467   ret void
 468
 469 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 470   store volatile i32 9, i32 addrspace(1)* undef
 471   unreachable
 472 }
 473
 474 ; The non-uniformity of the branch to the exiting blocks requires
 475 ; looking at transitive predecessors.
 476
 477 ; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
 478
 479 ; IR: exit0:                                            ; preds = %Flow2
 480 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 481 ; IR-NEXT: br label %UnifiedReturnBlock
 482
 483
 484 ; IR: indirect.exit1:
 485 ; IR: %load = load volatile i32, i32 addrspace(1)* undef
 486 ; IR: store volatile i32 %load, i32 addrspace(1)* undef
 487 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 488 ; IR: call void @llvm.amdgcn.unreachable()
 489 ; IR-NEXT: br label %Flow2
 490
 491 ; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
 492 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)
 493 ; IR-NEXT: ret void
 494 define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 495 entry:
 496   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 497   %tmp1 = add i32 0, %tmp
 498   %tmp2 = zext i32 %tmp1 to i64
 499   %tmp3 = add i64 0, %tmp2
 500   %tmp4 = shl i64 %tmp3, 32
 501   %tmp5 = ashr exact i64 %tmp4, 32
 502   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 503   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 504   %tmp8 = sext i32 %tmp7 to i64
 505   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 506   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 507   %tmp13 = zext i32 %tmp10 to i64
 508   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 509   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 510   %Pivot = icmp slt i32 %tmp16, 2
 511   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 512
 513 LeafBlock:                                        ; preds = %entry
 514   %SwitchLeaf = icmp eq i32 %tmp16, 1
 515   br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
 516
 517 LeafBlock1:                                       ; preds = %entry
 518   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 519   br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
 520
 521 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 522   store volatile i32 17, i32 addrspace(3)* undef
 523   ret void
 524
 525 indirect.exit1:
 526   %load = load volatile i32, i32 addrspace(1)* undef
 527   store volatile i32 %load, i32 addrspace(1)* undef
 528   br label %exit1
 529
 530 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 531   store volatile i32 9, i32 addrspace(1)* undef
 532   unreachable
 533 }
 534
 535 ; IR-LABEL: @multi_divergent_region_exit_ret_switch(
 536 define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 537 entry:
 538   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 539   %tmp1 = add i32 0, %tmp
 540   %tmp2 = zext i32 %tmp1 to i64
 541   %tmp3 = add i64 0, %tmp2
 542   %tmp4 = shl i64 %tmp3, 32
 543   %tmp5 = ashr exact i64 %tmp4, 32
 544   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 545   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 546   %tmp8 = sext i32 %tmp7 to i64
 547   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 548   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 549   %tmp13 = zext i32 %tmp10 to i64
 550   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 551   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 552   switch i32 %tmp16, label %exit1
 553     [ i32 1, label %LeafBlock
 554       i32 2, label %LeafBlock1
 555       i32 3, label %exit0 ]
 556
 557 LeafBlock:                                        ; preds = %entry
 558   %SwitchLeaf = icmp eq i32 %tmp16, 1
 559   br i1 %SwitchLeaf, label %exit0, label %exit1
 560
 561 LeafBlock1:                                       ; preds = %entry
 562   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 563   br i1 %SwitchLeaf2, label %exit0, label %exit1
 564
 565 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 566   store volatile i32 17, i32 addrspace(3)* undef
 567   ret void
 568
 569 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 570   store volatile i32 9, i32 addrspace(1)* undef
 571   unreachable
 572 }
 573
 574 ; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
 575 define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
 576 entry:
 577   %uniform.cond0 = icmp eq i32 %arg0, 4
 578   br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
 579
 580 divergent.multi.exit.region:
 581   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 582   %divergent.cond0 = icmp eq i32 %id.x, 0
 583   br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
 584
 585 divergent.ret0:
 586   store volatile i32 11, i32 addrspace(3)* undef
 587   ret void
 588
 589 divergent.ret1:
 590   store volatile i32 42, i32 addrspace(3)* undef
 591   ret void
 592
 593 uniform.ret:
 594   store volatile i32 9, i32 addrspace(1)* undef
 595   ret void
 596 }
 597
 598 ; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
 599 define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
 600 entry:
 601   %uniform.cond0 = icmp eq i32 %arg0, 4
 602   br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
 603
 604 divergent.multi.exit.region:
 605   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 606   %divergent.cond0 = icmp eq i32 %id.x, 0
 607   br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
 608
 609 divergent.if:
 610   %vgpr0 = load volatile float, float addrspace(1)* undef
 611   %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
 612   br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
 613
 614 divergent.then:
 615   %vgpr1 = load volatile float, float addrspace(1)* undef
 616   %divergent.cond2 = fcmp olt float %vgpr1, 4.0
 617   store volatile i32 33, i32 addrspace(1)* undef
 618   br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
 619
 620 divergent.endif:
 621   store volatile i32 38, i32 addrspace(1)* undef
 622   br label %divergent.ret0
 623
 624 divergent.ret0:
 625   store volatile i32 11, i32 addrspace(3)* undef
 626   ret void
 627
 628 divergent.ret1:
 629   store volatile i32 42, i32 addrspace(3)* undef
 630   ret void
 631
 632 uniform.ret:
 633   store volatile i32 9, i32 addrspace(1)* undef
 634   ret void
 635 }
 636
 637 ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
 638 ; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
 639 ; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
 640 ; IR: br i1 %6, label %uniform.if, label %Flow2
 641
 642 ; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
 643 ; IR: %7 = phi i1 [ %uniform.cond2.inv, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
 644 ; IR: br i1 %7, label %uniform.endif, label %uniform.ret0
 645
 646 ; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
 647 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %5)
 648 ; IR-NEXT: ret void
 649 define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
 650 entry:
 651   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 652   %divergent.cond0 = icmp eq i32 %id.x, 0
 653   br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
 654
 655 uniform.multi.exit.region:
 656   %uniform.cond0 = icmp eq i32 %arg0, 4
 657   br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
 658
 659 uniform.if:
 660   %sgpr0 = load volatile i32, i32 addrspace(4)* undef
 661   %uniform.cond1 = icmp slt i32 %sgpr0, 1
 662   br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
 663
 664 uniform.then:
 665   %sgpr1 = load volatile i32, i32 addrspace(4)* undef
 666   %uniform.cond2 = icmp sge i32 %sgpr1, 4
 667   store volatile i32 33, i32 addrspace(1)* undef
 668   br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
 669
 670 uniform.endif:
 671   store volatile i32 38, i32 addrspace(1)* undef
 672   br label %uniform.ret0
 673
 674 uniform.ret0:
 675   store volatile i32 11, i32 addrspace(3)* undef
 676   ret void
 677
 678 uniform.ret1:
 679   store volatile i32 42, i32 addrspace(3)* undef
 680   ret void
 681
 682 divergent.ret:
 683   store volatile i32 9, i32 addrspace(1)* undef
 684   ret void
 685 }
 686
 687 ; IR-LABEL: @multi_divergent_unreachable_exit(
 688 ; IR: UnifiedUnreachableBlock:
 689 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
 690 ; IR-NEXT: br label %UnifiedReturnBlock
 691
 692 ; IR: UnifiedReturnBlock:
 693 ; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64
 694 ; IR-NEXT: ret void
 695 define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
 696 bb:
 697   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
 698   switch i32 %tmp, label %bb3 [
 699     i32 2, label %bb1
 700     i32 0, label %bb2
 701   ]
 702
 703 bb1:                                              ; preds = %bb
 704   unreachable
 705
 706 bb2:                                              ; preds = %bb
 707   unreachable
 708
 709 bb3:                                              ; preds = %bb
 710   switch i32 undef, label %bb5 [
 711     i32 2, label %bb4
 712   ]
 713
 714 bb4:                                              ; preds = %bb3
 715   ret void
 716
 717 bb5:                                              ; preds = %bb3
 718   unreachable
 719 }
 720
 721 ; Test that there is an extra export inserted after the normal export,
 722 ; if the normal export is inside a uniformly reached block and there is
 723 ; an infinite loop in the pixel shader.
 724
 725 ; IR-LABEL: @uniformly_reached_export
 726 ; IR-NEXT: .entry:
 727 ; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]
 728
 729 ; IR: [[LOOP]]:
 730 ; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]
 731
 732 ; IR: [[EXP]]:
 733 ; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
 734 ; IR-NEXT: ret void
 735
 736 ; IR: DummyReturnBlock:
 737 ; IR-NEXT: ret void
 738
 739 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
 740 .entry:
 741   %tmp26 = fcmp olt float %tmp25, 0.000000e+00
 742   br i1 %tmp26, label %loop, label %bb27
 743
 744 loop:                                               ; preds = %loop, %.entry
 745   br label %loop
 746
 747 bb27:                                             ; preds = %.entry
 748   call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
 749   ret void
 750 }
 751
 752 declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
 753 declare i32 @llvm.amdgcn.workitem.id.x() #1
 754
 755 attributes #0 = { nounwind }
 756 attributes #1 = { nounwind readnone }