test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

   1 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
   2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   3
   4 ; Add an extra verifier runs. There were some cases where invalid IR
   5 ; was produced but happened to be fixed by the later passes.
   6
   7 ; Make sure divergent control flow with multiple exits from a region
   8 ; is properly handled. UnifyFunctionExitNodes should be run before
   9 ; StructurizeCFG.
  10
  11 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
  12 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
  13 ; IR: %2 = extractvalue { i1, i64 } %1, 0
  14 ; IR: %3 = extractvalue { i1, i64 } %1, 1
  15 ; IR: br i1 %2, label %LeafBlock1, label %Flow
  16
  17 ; IR: Flow:
  18 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
  19 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
  20 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
  21 ; IR: %7 = extractvalue { i1, i64 } %6, 0
  22 ; IR: %8 = extractvalue { i1, i64 } %6, 1
  23 ; IR: br i1 %7, label %LeafBlock, label %Flow1
  24
  25 ; IR: LeafBlock:
  26 ; IR: br label %Flow1
  27
  28 ; IR: LeafBlock1:
  29 ; IR: br label %Flow{{$}}
  30
  31 ; IR:  Flow2:
  32 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
  33 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
  34 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
  35 ; IR: %13 = extractvalue { i1, i64 } %12, 0
  36 ; IR: %14 = extractvalue { i1, i64 } %12, 1
  37 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
  38
  39 ; IR: exit0:
  40 ; IR: store volatile i32 9, i32 addrspace(1)* undef
  41 ; IR: br label %UnifiedReturnBlock
  42
  43 ; IR: Flow1:
  44 ; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
  45 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
  46 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
  47 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
  48 ; IR: %18 = extractvalue { i1, i64 } %17, 0
  49 ; IR: %19 = extractvalue { i1, i64 } %17, 1
  50 ; IR: br i1 %18, label %exit1, label %Flow2
  51
  52 ; IR: exit1:
  53 ; IR: store volatile i32 17, i32 addrspace(3)* undef
  54 ; IR:  br label %Flow2
  55
  56 ; IR: UnifiedReturnBlock:
  57 ; IR: call void @llvm.amdgcn.end.cf(i64 %14)
  58 ; IR: ret void
  59
  60
  61 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
  62 ; GCN: v_cmp_lt_i32_e32 vcc, 1
  63 ; GCN: s_and_saveexec_b64
  64 ; GCN: s_xor_b64
  65
  66
  67 ; FIXME: Why is this compare essentially repeated?
  68 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
  69 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
  70 ; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
  71 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
  72
  73 ; GCN: ; %Flow1
  74 ; GCN-NEXT: s_or_b64 exec, exec
  75 ; GCN: v_cmp_ne_u32_e32 vcc, 0
  76
  77 ; GCN: ; %exit1
  78 ; GCN: ds_write_b32
  79
  80 ; GCN: %Flow2
  81 ; GCN-NEXT: s_or_b64 exec, exec
  82 ; GCN: v_cmp_ne_u32_e32 vcc, 0
  83 ; GCN-NEXT: s_and_saveexec_b64
  84
  85 ; GCN: ; %exit0
  86 ; GCN: buffer_store_dword
  87
  88 ; GCN: ; %UnifiedReturnBlock
  89 ; GCN-NEXT: s_endpgm
  90 define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
  91 entry:
  92   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  93   %tmp1 = add i32 0, %tmp
  94   %tmp2 = zext i32 %tmp1 to i64
  95   %tmp3 = add i64 0, %tmp2
  96   %tmp4 = shl i64 %tmp3, 32
  97   %tmp5 = ashr exact i64 %tmp4, 32
  98   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  99   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 100   %tmp8 = sext i32 %tmp7 to i64
 101   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 102   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 103   %tmp13 = zext i32 %tmp10 to i64
 104   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 105   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 106   %Pivot = icmp slt i32 %tmp16, 2
 107   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 108
 109 LeafBlock:                                        ; preds = %entry
 110   %SwitchLeaf = icmp eq i32 %tmp16, 1
 111   br i1 %SwitchLeaf, label %exit0, label %exit1
 112
 113 LeafBlock1:                                       ; preds = %entry
 114   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 115   br i1 %SwitchLeaf2, label %exit0, label %exit1
 116
 117 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 118   store volatile i32 9, i32 addrspace(1)* undef
 119   ret void
 120
 121 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 122   store volatile i32 17, i32 addrspace(3)* undef
 123   ret void
 124 }
 125
 126 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
 127 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
 128
 129 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
 130
 131 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
 132 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 133 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
 134 ; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
 135
 136
 137 ; IR: UnifiedUnreachableBlock:
 138 ; IR-NEXT: unreachable
 139
 140
 141 ; FIXME: Probably should insert an s_endpgm anyway.
 142 ; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
 143 ; GCN: ; %UnifiedUnreachableBlock
 144 ; GCN-NEXT: .Lfunc_end
 145 define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 146 entry:
 147   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 148   %tmp1 = add i32 0, %tmp
 149   %tmp2 = zext i32 %tmp1 to i64
 150   %tmp3 = add i64 0, %tmp2
 151   %tmp4 = shl i64 %tmp3, 32
 152   %tmp5 = ashr exact i64 %tmp4, 32
 153   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 154   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 155   %tmp8 = sext i32 %tmp7 to i64
 156   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 157   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 158   %tmp13 = zext i32 %tmp10 to i64
 159   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 160   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 161   %Pivot = icmp slt i32 %tmp16, 2
 162   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 163
 164 LeafBlock:                                        ; preds = %entry
 165   %SwitchLeaf = icmp eq i32 %tmp16, 1
 166   br i1 %SwitchLeaf, label %exit0, label %exit1
 167
 168 LeafBlock1:                                       ; preds = %entry
 169   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 170   br i1 %SwitchLeaf2, label %exit0, label %exit1
 171
 172 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 173   store volatile i32 9, i32 addrspace(1)* undef
 174   unreachable
 175
 176 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 177   store volatile i32 17, i32 addrspace(3)* undef
 178   unreachable
 179 }
 180
 181 ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
 182 ; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
 183 ; IR: llvm.amdgcn.if
 184 ; IR: br i1
 185
 186 ; IR: {{^}}Flow:
 187 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 188 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
 189 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
 190 ; IR: br i1 %7, label %LeafBlock, label %Flow1
 191
 192 ; IR: {{^}}LeafBlock:
 193 ; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
 194 ; IR: %9 = xor i1 %divergent.cond1, true
 195 ; IR: br label %Flow1
 196
 197 ; IR: LeafBlock1:
 198 ; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
 199 ; IR: %10 = xor i1 %uniform.cond0, true
 200 ; IR: br label %Flow
 201
 202 ; IR: Flow2:
 203 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
 204 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 205 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
 206 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
 207
 208 ; IR: exit0:
 209 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 210 ; IR: br label %UnifiedReturnBlock
 211
 212 ; IR: {{^}}Flow1:
 213 ; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
 214 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
 215 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
 216 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
 217 ; IR: %18 = extractvalue { i1, i64 } %17, 0
 218 ; IR: %19 = extractvalue { i1, i64 } %17, 1
 219 ; IR: br i1 %18, label %exit1, label %Flow2
 220
 221 ; IR: exit1:
 222 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 223 ; IR: br label %Flow2
 224
 225 ; IR: UnifiedReturnBlock:
 226 ; IR: call void @llvm.amdgcn.end.cf(i64 %14)
 227 ; IR: ret void
 228 define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 229 entry:
 230   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 231   %tmp1 = add i32 0, %tmp
 232   %tmp2 = zext i32 %tmp1 to i64
 233   %tmp3 = add i64 0, %tmp2
 234   %tmp4 = shl i64 %tmp3, 32
 235   %tmp5 = ashr exact i64 %tmp4, 32
 236   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 237   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 238   %tmp8 = sext i32 %tmp7 to i64
 239   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 240   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 241   %tmp13 = zext i32 %tmp10 to i64
 242   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 243   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 244   %divergent.cond0 = icmp slt i32 %tmp16, 2
 245   br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
 246
 247 LeafBlock:                                        ; preds = %entry
 248   %divergent.cond1 = icmp eq i32 %tmp16, 1
 249   br i1 %divergent.cond1, label %exit0, label %exit1
 250
 251 LeafBlock1:                                       ; preds = %entry
 252   %uniform.cond0 = icmp eq i32 %arg3, 2
 253   br i1 %uniform.cond0, label %exit0, label %exit1
 254
 255 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 256   store volatile i32 9, i32 addrspace(1)* undef
 257   ret void
 258
 259 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 260   store volatile i32 17, i32 addrspace(3)* undef
 261   ret void
 262 }
 263
 264 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
 265 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
 266 ; IR: br i1 %2, label %LeafBlock1, label %Flow
 267
 268 ; IR: Flow:
 269 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 270 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
 271 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
 272
 273 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
 274 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 275 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
 276
 277 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 278 entry:
 279   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 280   %tmp1 = add i32 0, %tmp
 281   %tmp2 = zext i32 %tmp1 to i64
 282   %tmp3 = add i64 0, %tmp2
 283   %tmp4 = shl i64 %tmp3, 32
 284   %tmp5 = ashr exact i64 %tmp4, 32
 285   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 286   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 287   %tmp8 = sext i32 %tmp7 to i64
 288   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 289   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 290   %tmp13 = zext i32 %tmp10 to i64
 291   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 292   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 293   %Pivot = icmp slt i32 %tmp16, 2
 294   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 295
 296 LeafBlock:                                        ; preds = %entry
 297   %SwitchLeaf = icmp eq i32 %arg3, 1
 298   br i1 %SwitchLeaf, label %exit0, label %exit1
 299
 300 LeafBlock1:                                       ; preds = %entry
 301   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 302   br i1 %SwitchLeaf2, label %exit0, label %exit1
 303
 304 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 305   store volatile i32 9, i32 addrspace(1)* undef
 306   ret void
 307
 308 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 309   store volatile i32 17, i32 addrspace(3)* undef
 310   ret void
 311 }
 312
 313 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 314 ; IR: Flow2:
 315 ; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
 316 ; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
 317 ; IR: call void @llvm.amdgcn.end.cf(i64 %20)
 318
 319 ; IR: UnifiedReturnBlock:
 320 ; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
 321 ; IR: call void @llvm.amdgcn.end.cf(i64 %15)
 322 ; IR: ret float %UnifiedRetVal
 323 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 324 entry:
 325   %Pivot = icmp slt i32 %vgpr, 2
 326   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 327
 328 LeafBlock:                                        ; preds = %entry
 329   %SwitchLeaf = icmp eq i32 %vgpr, 1
 330   br i1 %SwitchLeaf, label %exit0, label %exit1
 331
 332 LeafBlock1:                                       ; preds = %entry
 333   %SwitchLeaf2 = icmp eq i32 %vgpr, 2
 334   br i1 %SwitchLeaf2, label %exit0, label %exit1
 335
 336 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 337   store i32 9, i32 addrspace(1)* undef
 338   ret float 1.0
 339
 340 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 341   store i32 17, i32 addrspace(3)* undef
 342   ret float 2.0
 343 }
 344
 345 ; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
 346
 347 ; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
 348 ; GCN: s_cmp_gt_i32 s0, 1
 349 ; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
 350
 351 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
 352
 353 ; GCN: {{^}}[[FLOW]]:
 354 ; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
 355
 356 ; GCN: v_mov_b32_e32 v0, 2.0
 357 ; GCN: s_or_b64 exec, exec
 358 ; GCN: s_and_b64 exec, exec
 359 ; GCN: v_mov_b32_e32 v0, 1.0
 360
 361 ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
 362 ; GCN-NEXT: s_or_b64 exec, exec
 363 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 364 ; GCN-NEXT: ; return
 365
 366 define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
 367 entry:
 368   %uniform.cond = icmp slt i32 %sgpr, 2
 369   br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
 370
 371 LeafBlock:                                        ; preds = %entry
 372   %divergent.cond0 = icmp eq i32 %vgpr, 3
 373   br i1 %divergent.cond0, label %exit0, label %exit1
 374
 375 LeafBlock1:                                       ; preds = %entry
 376   %divergent.cond1 = icmp eq i32 %vgpr, 7
 377   br i1 %divergent.cond1, label %exit0, label %exit1
 378
 379 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 380   store i32 9, i32 addrspace(1)* undef
 381   ret float 1.0
 382
 383 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 384   store i32 17, i32 addrspace(3)* undef
 385   ret float 2.0
 386 }
 387
 388 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
 389 ; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
 390
 391 ; IR: Flow:
 392 ; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 393 ; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
 394 ; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
 395
 396 ; IR: Flow2:
 397 ; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
 398 ; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 399 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
 400 ; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
 401
 402 ; IR: exit0:
 403 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 404 ; IR-NEXT: br label %UnifiedReturnBlock
 405
 406 ; IR: Flow1:
 407 ; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
 408 ; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
 409 ; IR: call void @llvm.amdgcn.end.cf(i64 %8)
 410 ; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
 411 ; IR: %18 = extractvalue { i1, i64 } %17, 0
 412 ; IR: %19 = extractvalue { i1, i64 } %17, 1
 413 ; IR: br i1 %18, label %exit1, label %Flow2
 414
 415 ; IR: exit1:
 416 ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
 417 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
 418 ; IR-NEXT: br label %Flow2
 419
 420 ; IR: UnifiedReturnBlock:
 421 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
 422 ; IR-NEXT: ret void
 423 define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 424 entry:
 425   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 426   %tmp1 = add i32 0, %tmp
 427   %tmp2 = zext i32 %tmp1 to i64
 428   %tmp3 = add i64 0, %tmp2
 429   %tmp4 = shl i64 %tmp3, 32
 430   %tmp5 = ashr exact i64 %tmp4, 32
 431   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 432   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 433   %tmp8 = sext i32 %tmp7 to i64
 434   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 435   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 436   %tmp13 = zext i32 %tmp10 to i64
 437   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 438   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 439   %Pivot = icmp slt i32 %tmp16, 2
 440   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 441
 442 LeafBlock:                                        ; preds = %entry
 443   %SwitchLeaf = icmp eq i32 %tmp16, 1
 444   br i1 %SwitchLeaf, label %exit0, label %exit1
 445
 446 LeafBlock1:                                       ; preds = %entry
 447   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 448   br i1 %SwitchLeaf2, label %exit0, label %exit1
 449
 450 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 451   store volatile i32 17, i32 addrspace(3)* undef
 452   ret void
 453
 454 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 455   store volatile i32 9, i32 addrspace(1)* undef
 456   unreachable
 457 }
 458
 459 ; The non-uniformity of the branch to the exiting blocks requires
 460 ; looking at transitive predecessors.
 461
 462 ; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
 463
 464 ; IR: exit0:                                            ; preds = %Flow2
 465 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 466 ; IR-NEXT: br label %UnifiedReturnBlock
 467
 468
 469 ; IR: indirect.exit1:
 470 ; IR: %load = load volatile i32, i32 addrspace(1)* undef
 471 ; IR: store volatile i32 %load, i32 addrspace(1)* undef
 472 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 473 ; IR: call void @llvm.amdgcn.unreachable()
 474 ; IR-NEXT: br label %Flow2
 475
 476 ; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
 477 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
 478 ; IR-NEXT: ret void
 479 define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 480 entry:
 481   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 482   %tmp1 = add i32 0, %tmp
 483   %tmp2 = zext i32 %tmp1 to i64
 484   %tmp3 = add i64 0, %tmp2
 485   %tmp4 = shl i64 %tmp3, 32
 486   %tmp5 = ashr exact i64 %tmp4, 32
 487   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 488   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 489   %tmp8 = sext i32 %tmp7 to i64
 490   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 491   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 492   %tmp13 = zext i32 %tmp10 to i64
 493   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 494   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 495   %Pivot = icmp slt i32 %tmp16, 2
 496   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 497
 498 LeafBlock:                                        ; preds = %entry
 499   %SwitchLeaf = icmp eq i32 %tmp16, 1
 500   br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
 501
 502 LeafBlock1:                                       ; preds = %entry
 503   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 504   br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
 505
 506 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 507   store volatile i32 17, i32 addrspace(3)* undef
 508   ret void
 509
 510 indirect.exit1:
 511   %load = load volatile i32, i32 addrspace(1)* undef
 512   store volatile i32 %load, i32 addrspace(1)* undef
 513   br label %exit1
 514
 515 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 516   store volatile i32 9, i32 addrspace(1)* undef
 517   unreachable
 518 }
 519
 520 ; IR-LABEL: @multi_divergent_region_exit_ret_switch(
 521 define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 522 entry:
 523   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 524   %tmp1 = add i32 0, %tmp
 525   %tmp2 = zext i32 %tmp1 to i64
 526   %tmp3 = add i64 0, %tmp2
 527   %tmp4 = shl i64 %tmp3, 32
 528   %tmp5 = ashr exact i64 %tmp4, 32
 529   %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
 530   %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
 531   %tmp8 = sext i32 %tmp7 to i64
 532   %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
 533   %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
 534   %tmp13 = zext i32 %tmp10 to i64
 535   %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
 536   %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
 537   switch i32 %tmp16, label %exit1
 538     [ i32 1, label %LeafBlock
 539       i32 2, label %LeafBlock1
 540       i32 3, label %exit0 ]
 541
 542 LeafBlock:                                        ; preds = %entry
 543   %SwitchLeaf = icmp eq i32 %tmp16, 1
 544   br i1 %SwitchLeaf, label %exit0, label %exit1
 545
 546 LeafBlock1:                                       ; preds = %entry
 547   %SwitchLeaf2 = icmp eq i32 %tmp16, 2
 548   br i1 %SwitchLeaf2, label %exit0, label %exit1
 549
 550 exit0:                                     ; preds = %LeafBlock, %LeafBlock1
 551   store volatile i32 17, i32 addrspace(3)* undef
 552   ret void
 553
 554 exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 555   store volatile i32 9, i32 addrspace(1)* undef
 556   unreachable
 557 }
 558
 559 ; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
 560 define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
 561 entry:
 562   %uniform.cond0 = icmp eq i32 %arg0, 4
 563   br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
 564
 565 divergent.multi.exit.region:
 566   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 567   %divergent.cond0 = icmp eq i32 %id.x, 0
 568   br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
 569
 570 divergent.ret0:
 571   store volatile i32 11, i32 addrspace(3)* undef
 572   ret void
 573
 574 divergent.ret1:
 575   store volatile i32 42, i32 addrspace(3)* undef
 576   ret void
 577
 578 uniform.ret:
 579   store volatile i32 9, i32 addrspace(1)* undef
 580   ret void
 581 }
 582
 583 ; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
 584 define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
 585 entry:
 586   %uniform.cond0 = icmp eq i32 %arg0, 4
 587   br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
 588
 589 divergent.multi.exit.region:
 590   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 591   %divergent.cond0 = icmp eq i32 %id.x, 0
 592   br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
 593
 594 divergent.if:
 595   %vgpr0 = load volatile float, float addrspace(1)* undef
 596   %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
 597   br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
 598
 599 divergent.then:
 600   %vgpr1 = load volatile float, float addrspace(1)* undef
 601   %divergent.cond2 = fcmp olt float %vgpr1, 4.0
 602   store volatile i32 33, i32 addrspace(1)* undef
 603   br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
 604
 605 divergent.endif:
 606   store volatile i32 38, i32 addrspace(1)* undef
 607   br label %divergent.ret0
 608
 609 divergent.ret0:
 610   store volatile i32 11, i32 addrspace(3)* undef
 611   ret void
 612
 613 divergent.ret1:
 614   store volatile i32 42, i32 addrspace(3)* undef
 615   ret void
 616
 617 uniform.ret:
 618   store volatile i32 9, i32 addrspace(1)* undef
 619   ret void
 620 }
 621
 622 ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
 623 ; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
 624 ; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
 625 ; IR: br i1 %8, label %uniform.if, label %Flow2
 626
 627 ; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
 628 ; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
 629 ; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
 630
 631 ; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
 632 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
 633 ; IR-NEXT: ret void
 634 define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
 635 entry:
 636   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 637   %divergent.cond0 = icmp eq i32 %id.x, 0
 638   br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
 639
 640 uniform.multi.exit.region:
 641   %uniform.cond0 = icmp eq i32 %arg0, 4
 642   br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
 643
 644 uniform.if:
 645   %sgpr0 = load volatile i32, i32 addrspace(2)* undef
 646   %uniform.cond1 = icmp slt i32 %sgpr0, 1
 647   br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
 648
 649 uniform.then:
 650   %sgpr1 = load volatile i32, i32 addrspace(2)* undef
 651   %uniform.cond2 = icmp sge i32 %sgpr1, 4
 652   store volatile i32 33, i32 addrspace(1)* undef
 653   br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
 654
 655 uniform.endif:
 656   store volatile i32 38, i32 addrspace(1)* undef
 657   br label %uniform.ret0
 658
 659 uniform.ret0:
 660   store volatile i32 11, i32 addrspace(3)* undef
 661   ret void
 662
 663 uniform.ret1:
 664   store volatile i32 42, i32 addrspace(3)* undef
 665   ret void
 666
 667 divergent.ret:
 668   store volatile i32 9, i32 addrspace(1)* undef
 669   ret void
 670 }
 671
 672 ; IR-LABEL: @multi_divergent_unreachable_exit(
 673 ; IR: UnifiedUnreachableBlock:
 674 ; IR-NEXT: call void @llvm.amdgcn.unreachable()
 675 ; IR-NEXT: br label %UnifiedReturnBlock
 676
 677 ; IR: UnifiedReturnBlock:
 678 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
 679 ; IR-NEXT: ret void
 680 define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
 681 bb:
 682   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
 683   switch i32 %tmp, label %bb3 [
 684     i32 2, label %bb1
 685     i32 0, label %bb2
 686   ]
 687
 688 bb1:                                              ; preds = %bb
 689   unreachable
 690
 691 bb2:                                              ; preds = %bb
 692   unreachable
 693
 694 bb3:                                              ; preds = %bb
 695   switch i32 undef, label %bb5 [
 696     i32 2, label %bb4
 697   ]
 698
 699 bb4:                                              ; preds = %bb3
 700   ret void
 701
 702 bb5:                                              ; preds = %bb3
 703   unreachable
 704 }
 705
 706 declare i32 @llvm.amdgcn.workitem.id.x() #1
 707
 708 attributes #0 = { nounwind }
 709 attributes #1 = { nounwind readnone }