; RUN: opt -mtriple=amdgcn-- -mcpu=gfx600 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -S -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by the later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: %1 = extractvalue { i1, i64 } %0, 0
; IR: %2 = extractvalue { i1, i64 } %0, 1
; IR: br i1 %1, label %LeafBlock1, label %Flow

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: %6 = extractvalue { i1, i64 } %5, 0
; IR: %7 = extractvalue { i1, i64 } %5, 1
; IR: br i1 %6, label %LeafBlock, label %Flow1

; IR: br label %Flow{{$}}

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: %10 = extractvalue { i1, i64 } %9, 0
; IR: %11 = extractvalue { i1, i64 } %9, 1
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR: store volatile i32 17, ptr addrspace(3) undef

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)

; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:

; GCN-DAG: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: v_cmp_lt_i32_e32 vcc, 1,
; GCN-DAG: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_and_saveexec_b64

; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

; GCN-NEXT: s_andn2_saveexec_b64

; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
; GCN-DAG: v_cmp_ne_u32_e64 [[INV:s\[[0-9]+:[0-9]+\]]], 1,
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
; GCN-DAG: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], [[INV]], exec
; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]

; GCN-DAG: ds_write_b32
; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec

; GCN-NEXT: s_or_b64 exec, exec,
; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]

; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock

define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)

; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock

; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable

; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
; IR: br i1 %6, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: br label %Flow1

; IR: %uniform.cond0 = icmp ne i32 %arg3, 2

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %divergent.cond1.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR: store volatile i32 17, ptr addrspace(3) undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
; IR: br i1 %1, label %LeafBlock1, label %Flow

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(

; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %17)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %12)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, ptr addrspace(1) undef
  ret float 1.000000e+00

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, ptr addrspace(3) undef
  ret float 2.000000e+00
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:.LBB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:

; GCN: s_or_b64 exec, exec
; GCN: v_mov_b32_e32 v0, s6
; GCN-NOT: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^.LBB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, ptr addrspace(1) undef
  ret float 1.000000e+00

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, ptr addrspace(3) undef
  ret float 2.000000e+00
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)

; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)

; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock

; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
; IR: %15 = extractvalue { i1, i64 } %14, 0
; IR: %16 = extractvalue { i1, i64 } %14, 1
; IR: br i1 %15, label %exit1, label %Flow2

; IR-NEXT: store volatile i32 9, ptr addrspace(1) undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0: ; preds = %Flow2
; IR-NEXT: store volatile i32 17, ptr addrspace(3) undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: indirect.exit1:
; IR: %load = load volatile i32, ptr addrspace(1) undef
; IR: store volatile i32 %load, ptr addrspace(1) undef
; IR: store volatile i32 9, ptr addrspace(1) undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %11)

define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

indirect.exit1: ; preds = %LeafBlock, %LeafBlock1
  %load = load volatile i32, ptr addrspace(1) undef
  store volatile i32 %load, ptr addrspace(1) undef
  br label %exit1

exit1: ; preds = %indirect.exit1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %tmp5
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp8
  %tmp10 = load i32, ptr addrspace(1) %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp13
  %tmp16 = load i32, ptr addrspace(1) %tmp14, align 16
  switch i32 %tmp16, label %exit1 [
    i32 1, label %LeafBlock
    i32 2, label %LeafBlock1
    i32 3, label %exit0
  ]

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, ptr addrspace(3) undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, ptr addrspace(1) undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

divergent.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

uniform.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, ptr addrspace(1) undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, ptr addrspace(1) undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, ptr addrspace(1) undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, ptr addrspace(1) undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

divergent.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

uniform.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %6, label %uniform.if, label %Flow2

; IR: Flow: ; preds = %uniform.then, %uniform.if
; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1.inv, %uniform.if ]
; IR: br i1 %7, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 %5)

define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, ptr addrspace(4) undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, ptr addrspace(4) undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, ptr addrspace(1) undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, ptr addrspace(1) undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, ptr addrspace(3) undef
  ret void

uniform.ret1:
  store volatile i32 42, ptr addrspace(3) undef
  ret void

divergent.ret:
  store volatile i32 9, ptr addrspace(1) undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64

define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [

  switch i32 undef, label %bb5 [

; Test that there is an extra export inserted after the normal export,
; if the normal export is inside a uniformly reached block and there is
; an infinite loop in the pixel shader.

; IR-LABEL: @uniformly_reached_export

; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]

; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]

; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true)

; IR: DummyReturnBlock:

define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
.entry:
  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
  br i1 %tmp26, label %loop, label %bb27

loop: ; preds = %loop, %.entry
  br label %loop

bb27: ; preds = %.entry
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }