llvm/test/CodeGen/AMDGPU/branch-relaxation.ll

   1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   2
   3
   4 ; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
   5 ;        See PR33579.
   6 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
   7 ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
   8
   9 ; OBJ:       Relocations [
  10 ; OBJ-NEXT: ]
  11
  12 ; Restrict maximum branch to between +7 and -8 dwords
  13
  14 ; s_sleep is used to emit an instruction that is always 4 bytes. The size
  15 ; estimate for inline asm always assumes each instruction is the maximum size.
  16 declare void @llvm.amdgcn.s.sleep(i32) #0
  17
  18 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; work-item X id; min_long_forward_vbranch and uniform_inside_divergent derive their branch conditions from it
  19
  20
  21 ; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
  22 ; GCN: s_load_dword [[CND:s[0-9]+]]
  23 ; GCN: s_cmp_eq_u32 [[CND]], 0
  24 ; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]
  25
  26
  27 ; GCN-NEXT: ; %bb.1: ; %bb2
  28 ; GCN-NEXT: ;;#ASMSTART
  29 ; GCN-NEXT: v_nop_e64
  30 ; GCN-NEXT: v_nop_e64
  31 ; GCN-NEXT: v_nop_e64
  32 ; GCN-NEXT: ;;#ASMEND
  33 ; GCN-NEXT: s_sleep 0
  34
  35 ; GCN-NEXT: [[BB3]]: ; %bb3
  36 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
  37 ; GCN: buffer_store_dword [[V_CND]]
  38 ; GCN: s_endpgm
  39 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
       ; Branches on a kernel-argument compare (%cnd == 0) and skips %bb2 when
       ; it holds. Per the test name this is the largest skipped block that is
       ; still expected to use a short branch: the checks above match a plain
       ; s_cbranch_scc1 to %bb3 with no s_getpc_b64/s_setpc_b64 sequence.
  40 bb:
  41   %cmp = icmp eq i32 %cnd, 0
  42   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
  43
  44 bb2:
  45 ; 24 bytes
  46   call void asm sideeffect
  47    "v_nop_e64
  48     v_nop_e64
  49     v_nop_e64", ""() #0
       ; One s_sleep on top of the three nops; matched above as "s_sleep 0".
  50   call void @llvm.amdgcn.s.sleep(i32 0)
  51   br label %bb3
  52
  53 bb3:
       ; Volatile store of %cnd; matched above as v_mov_b32 + buffer_store_dword.
  54   store volatile i32 %cnd, i32 addrspace(1)* %arg
  55   ret void
  56 }
  57
  58 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
  59 ; GCN: s_load_dword [[CND:s[0-9]+]]
  60 ; GCN: s_cmp_eq_u32 [[CND]], 0
  61 ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]
  62
  63 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0
  64 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
  65 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
  66 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
  67 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32
  68 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
  69
  70 ; GCN-NEXT: [[LONGBB]]:
  71 ; GCN-NEXT: ;;#ASMSTART
  72 ; GCN: v_nop_e64
  73 ; GCN: v_nop_e64
  74 ; GCN: v_nop_e64
  75 ; GCN: v_nop_e64
  76 ; GCN-NEXT: ;;#ASMEND
  77
  78 ; GCN-NEXT: [[ENDBB]]:
  79 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
  80 ; GCN: buffer_store_dword [[V_CND]]
  81 ; GCN: s_endpgm
  82 define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
       ; Same shape as the short-branch test, but %bb2 holds four v_nop_e64
       ; and no s_sleep. The checks above expect the compare to feed an
       ; s_cbranch_scc0 around a long-branch block made of
       ; s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64.
  83 bb0:
  84   %cmp = icmp eq i32 %cnd, 0
  85   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
  86
  87 bb2:
  88 ; 32 bytes
  89   call void asm sideeffect
  90    "v_nop_e64
  91     v_nop_e64
  92     v_nop_e64
  93     v_nop_e64", ""() #0
  94   br label %bb3
  95
  96 bb3:
       ; Volatile store of %cnd; matched above as v_mov_b32 + buffer_store_dword.
  97   store volatile i32 %cnd, i32 addrspace(1)* %arg
  98   ret void
  99 }
 100
 101 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
 102 ; GCN: s_load_dword [[CND:s[0-9]+]]
 103
 104 ; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0
 105 ; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]]
 106 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
 107
 108 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0
 109 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 110 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 111 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
 112 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32
 113 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 114
 115 ; GCN-NEXT: [[LONGBB]]:
 116 ; GCN: v_nop_e64
 117 ; GCN: v_nop_e64
 118 ; GCN: v_nop_e64
 119 ; GCN: v_nop_e64
 120
 121 ; GCN: [[ENDBB]]:
 122 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 123 ; GCN: buffer_store_dword [[V_CND]]
 124 ; GCN: s_endpgm
 125 define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
       ; Floating-point variant of the long forward branch test: the condition
       ; is an fcmp on a kernel argument, which the checks above match as
       ; v_cmp_eq_f32_e64 + "s_and_b64 vcc, exec, ..." feeding s_cbranch_vccz,
       ; followed by the s_getpc_b64 / s_setpc_b64 long-branch sequence.
 126 bb0:
 127   %cmp = fcmp oeq float %cnd, 0.0
 128   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
 129
 130 bb2:
       ; Four v_nop_e64; the "32 bytes" note lives inside the asm string itself.
 131   call void asm sideeffect " ; 32 bytes
 132     v_nop_e64
 133     v_nop_e64
 134     v_nop_e64
 135     v_nop_e64", ""() #0
 136   br label %bb3
 137
 138 bb3:
       ; Volatile store of %cnd; matched above as v_mov_b32 + buffer_store_dword.
 139   store volatile float %cnd, float addrspace(1)* %arg
 140   ret void
 141 }
 142
 143 ; GCN-LABEL: {{^}}min_long_forward_vbranch:
 144
 145 ; GCN: buffer_load_dword
 146 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 147 ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
 148
 149 ; GCN: v_nop_e64
 150 ; GCN: v_nop_e64
 151 ; GCN: v_nop_e64
 152 ; GCN: v_nop_e64
 153
 154 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 155 ; GCN: buffer_store_dword
 156 ; GCN: s_endpgm
 157 define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
       ; The branch condition comes from a value loaded at a per-work-item
       ; address (indexed by workitem.id.x). The checks above match
       ; v_cmp_ne_u32 + s_and_saveexec_b64 before the four nops and
       ; "s_or_b64 exec, exec, ..." after them.
 158 bb:
 159   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 160   %tid.ext = zext i32 %tid to i64
 161   %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
 162   %load = load volatile i32, i32 addrspace(1)* %gep
 163   %cmp = icmp eq i32 %load, 0
 164   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
 165
 166 bb2:
       ; Four v_nop_e64; the "32 bytes" note lives inside the asm string itself.
 167   call void asm sideeffect " ; 32 bytes
 168     v_nop_e64
 169     v_nop_e64
 170     v_nop_e64
 171     v_nop_e64", ""() #0
 172   br label %bb3
 173
 174 bb3:
       ; Writes the loaded value back to the same per-work-item address.
 175   store volatile i32 %load, i32 addrspace(1)* %gep
 176   ret void
 177 }
 178
 179 ; GCN-LABEL: {{^}}long_backward_sbranch:
 180 ; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}
 181
 182 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
 183 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 184 ; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
 185 ; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10
 186
 187 ; GCN-NEXT: ;;#ASMSTART
 188 ; GCN-NEXT: v_nop_e64
 189 ; GCN-NEXT: v_nop_e64
 190 ; GCN-NEXT: v_nop_e64
 191 ; GCN-NEXT: ;;#ASMEND
 192
 193 ; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]
 194
 195 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb2
 196 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
 197
 198 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 199 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 200 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOPBB]]-[[POST_GETPC]])&4294967295
 201 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOPBB]]-[[POST_GETPC]])>>32
 202 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 203
 204 ; GCN-NEXT: [[ENDBB]]:
 205 ; GCN-NEXT: s_endpgm
 206 define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
       ; Counted loop (%inc < 10) whose body holds three v_nop_e64. The checks
       ; above expect the exit to be taken with s_cbranch_scc0 and the back
       ; edge to %bb2 to be emitted as an s_getpc_b64 / s_add_u32 /
       ; s_addc_u32 / s_setpc_b64 sequence targeting the loop header label.
 207 bb:
 208   br label %bb2
 209
 210 bb2:
 211   %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
 212    ; 24 bytes
 213   call void asm sideeffect
 214    "v_nop_e64
 215     v_nop_e64
 216     v_nop_e64", ""() #0
 217   %inc = add nsw i32 %loop.idx, 1 ; add cost 4
 218   %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
 219   br i1 %cmp, label %bb2, label %bb3 ; -
 220
 221 bb3:
 222   ret void
 223 }
 224
 225 ; Requires expansion of unconditional branch from %bb2 to %bb4 (and
 226 ; expansion of conditional branch from %bb0 to %bb3).
 227
 228 ; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
 229 ; GCN: s_cmp_eq_u32
 230 ; GCN: s_cbranch_scc{{[0-1]}} [[BB2:BB[0-9]+_[0-9]+]]
 231
 232 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0
 233 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
 234 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 235 ; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB3:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
 236 ; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB3]]-[[POST_GETPC]])>>32
 237 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
 238
 239 ; GCN: [[BB2]]: ; %bb3
 240 ; GCN: v_nop_e64
 241 ; GCN: v_nop_e64
 242 ; GCN: v_nop_e64
 243 ; GCN: v_nop_e64
 244 ; GCN: ;;#ASMEND
 245
 246 ; GCN: [[BB3]]:
 247 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
 248 ; GCN: buffer_store_dword [[BB2_K]]
 249
 250 ; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
 251 ; GCN: buffer_store_dword [[BB4_K]]
 252 ; GCN: s_endpgm
 253 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
 254 define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
       ; Two-way uniform branch where one arm (%bb3) carries four v_nop_e64.
       ; The checks above expect the conditional branch out of %bb0 to be
       ; followed by an s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64
       ; long-branch sequence.
       ;
       ; The FileCheck variable names do not track the IR block names: BB2
       ; binds the label annotated %bb3, and BB3 binds the label that is
       ; followed by the store of 17. BB3 is bound once, on the s_add_u32
       ; line, and reused on the s_addc_u32 line so that both halves of the
       ; 64-bit offset are required to name the same label.
 255 bb0:
 256   %tmp = icmp ne i32 %arg1, 0
 257   br i1 %tmp, label %bb2, label %bb3
 258
 259 bb2:
 260   store volatile i32 17, i32 addrspace(1)* undef
 261   br label %bb4
 262
 263 bb3:
 264   ; 32 byte asm
 265   call void asm sideeffect
 266    "v_nop_e64
 267     v_nop_e64
 268     v_nop_e64
 269     v_nop_e64", ""() #0
 270   br label %bb4
 271
 272 bb4:
 273   store volatile i32 63, i32 addrspace(1)* %arg
 274   ret void
 275 }
 276
 277 ; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
 278 ; GCN-NEXT: ; %bb.0: ; %entry
 279
 280 ; GCN-NEXT: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
 281 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 282 ; GCN-NEXT: ;;#ASMSTART
 283 ; GCN-NEXT: v_nop_e64
 284 ; GCN-NEXT: v_nop_e64
 285 ; GCN-NEXT: v_nop_e64
 286 ; GCN-NEXT: v_nop_e64
 287 ; GCN-NEXT: ;;#ASMEND
 288
 289 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %loop
 290 ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
 291
 292 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 293 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 294 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOP]]-[[POST_GETPC]])&4294967295
 295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOP]]-[[POST_GETPC]])>>32
 296 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 297 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
 298 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
       ; Self-loop with no exit whose body is four v_nop_e64. The checks above
       ; expect the unconditional back edge to be emitted as an s_getpc_b64 /
       ; s_add_u32 / s_addc_u32 / s_setpc_b64 sequence targeting the loop
       ; header label, immediately followed by the function-end label.
       ; The loop label pattern accepts a multi-digit function index, like
       ; the other label patterns in this file.
 299 entry:
 300   br label %loop
 301
 302 loop:
 303   ; 32 byte asm
 304   call void asm sideeffect
 305    "v_nop_e64
 306     v_nop_e64
 307     v_nop_e64
 308     v_nop_e64", ""() #0
 309   br label %loop
 310 }
 311
 312 ; Expansion of branch from %bb1 to %bb3 introduces need to expand
 313 ; branch from %bb0 to %bb2
 314
 315 ; GCN-LABEL: {{^}}expand_requires_expand:
 316 ; GCN-NEXT: ; %bb.0: ; %bb0
 317 ; GCN: s_load_dword
 318 ; GCN: {{s|v}}_cmp_lt_i32
 319 ; GCN: s_cbranch
 320
 321 ; GCN: s_load_dword
 322 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 323 ; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64
 324 ; GCN: s_cbranch_vccz [[BB2:BB[0-9]+_[0-9]+]]
 325
 326 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}:
 327 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
 328 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 329 ; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], ([[BB3:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
 330 ; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], ([[BB3]]-[[POST_GETPC]])>>32
 331 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}
 332
 333 ; GCN-NEXT: [[BB2]]: ; %bb2
 334 ; GCN-NEXT: ;;#ASMSTART
 335 ; GCN-NEXT: v_nop_e64
 336 ; GCN-NEXT: v_nop_e64
 337 ; GCN-NEXT: v_nop_e64
 338 ; GCN-NEXT: v_nop_e64
 339 ; GCN-NEXT: ;;#ASMEND
 340
 341 ; GCN-NEXT: [[BB3]]: ; %bb3
 342 ; GCN-NEXT: ;;#ASMSTART
 343 ; GCN-NEXT: v_nop_e64
 344 ; GCN-NEXT: ;;#ASMEND
 345 ; GCN-NEXT: ;;#ASMSTART
 346 ; GCN-NEXT: v_nop_e64
 347 ; GCN-NEXT: ;;#ASMEND
 348 ; GCN-NEXT: s_endpgm
 349 define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
       ; %bb0 and %bb1 both branch conditionally into %bb2 (four v_nop_e64);
       ; %bb1 may also skip it and go straight to %bb3. The checks above
       ; expect the branch that skips %bb2 to be expanded into an
       ; s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence whose
       ; target label is the one annotated %bb3.
       ;
       ; BB3 is bound once, on the s_add_u32 line, and reused on the
       ; s_addc_u32 line and on the %bb3 label line, so all three must name
       ; the same label.
 350 bb0:
       ; The result of this call is not referenced anywhere in the function.
 351   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 352   %cmp0 = icmp slt i32 %cond0, 0
 353   br i1 %cmp0, label %bb2, label %bb1
 354
 355 bb1:
 356   %val = load volatile i32, i32 addrspace(4)* undef
 357   %cmp1 = icmp eq i32 %val, 3
 358   br i1 %cmp1, label %bb3, label %bb2
 359
 360 bb2:
 361   call void asm sideeffect
 362    "v_nop_e64
 363     v_nop_e64
 364     v_nop_e64
 365     v_nop_e64", ""() #0
 366   br label %bb3
 367
 368 bb3:
 369 ; These NOPs prevent tail-duplication-based outlining
 370 ; from firing, which defeats the need to expand the branches and this test.
 371   call void asm sideeffect
 372    "v_nop_e64", ""() #0
 373   call void asm sideeffect
 374    "v_nop_e64", ""() #0
 375   ret void
 376 }
 377
 378 ; Requires expanding of required skip branch.
 379
 380 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 381 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 382 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 383 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 384
 385 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %entry
 386 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 387 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 388 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[BB2:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
 389 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[BB2]]-[[POST_GETPC]])>>32
 390 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 391
 392 ; GCN-NEXT: [[IF]]: ; %if
 393 ; GCN: buffer_store_dword
 394 ; GCN: s_cmp_lg_u32
 395 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 396
 397 ; GCN-NEXT: ; %bb.2: ; %if_uniform
 398 ; GCN: buffer_store_dword
 399
 400 ; GCN-NEXT: [[ENDIF]]: ; %endif
 401 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 402 ; GCN-NEXT: s_sleep 5
 403 ; GCN-NEXT: s_endpgm
 404 define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
       ; Outer branch on a per-work-item condition (%tid < 16) wrapping an
       ; inner branch on a kernel argument (%cond == 0). The checks above
       ; expect s_and_saveexec_b64 + s_cbranch_execnz into %if, with the
       ; fall-through path emitted as an s_getpc_b64 / s_add_u32 /
       ; s_addc_u32 / s_setpc_b64 long-branch sequence.
       ;
       ; BB2 is bound once, on the s_add_u32 line, and reused on the
       ; s_addc_u32 line so both halves of the offset must name the same label.
 405 entry:
 406   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 407   %d_cmp = icmp ult i32 %tid, 16
 408   br i1 %d_cmp, label %if, label %endif
 409
 410 if:
 411   store i32 0, i32 addrspace(1)* %out
 412   %u_cmp = icmp eq i32 %cond, 0
 413   br i1 %u_cmp, label %if_uniform, label %endif
 414
 415 if_uniform:
 416   store i32 1, i32 addrspace(1)* %out
 417   br label %endif
 418
 419 endif:
 420   ; layout can remove the split branch if it can copy the return block.
 421   ; This call makes the return block long enough that it doesn't get copied.
 422   call void @llvm.amdgcn.s.sleep(i32 5);
 423   ret void
 424 }
 425
 426 ; si_mask_branch
 427
 428 ; GCN-LABEL: {{^}}analyze_mask_branch:
 429 ; GCN: v_cmp_nlt_f32_e32 vcc
 430 ; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
 431 ; GCN-NEXT: s_xor_b64  [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]
 432
 433 ; GCN: BB{{[0-9]+_[0-9]+}}: ; %Flow
 434 ; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
 435 ; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]
 436
 437 ; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}}
 438 ; GCN: ;;#ASMSTART
 439 ; GCN: v_nop_e64
 440 ; GCN: v_nop_e64
 441 ; GCN: v_nop_e64
 442 ; GCN: v_nop_e64
 443 ; GCN: v_nop_e64
 444 ; GCN: v_nop_e64
 445 ; GCN: ;;#ASMEND
 446 ; GCN: s_cbranch_{{vccz|vccnz}} [[RET:BB[0-9]+_[0-9]+]]
 447
 448 ; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %loop
 449 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
 450 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 451 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 452 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOP_BODY]]-[[POST_GETPC]])&4294967295
 453 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOP_BODY]]-[[POST_GETPC]])>>32
 454 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 455
 456 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
 457 ; GCN-NEXT: s_endpgm
 458 define amdgpu_kernel void @analyze_mask_branch() #0 {
       ; The entry condition is computed from a value produced by inline asm
       ; (the "=v" output), and the checks above match it as v_cmp_nlt_f32 +
       ; s_and_saveexec_b64 / s_xor_b64. %loop and %loop_body together hold
       ; six v_nop_e64; the checks expect the loop's back edge to be emitted
       ; as an s_getpc_b64 / s_add_u32 / s_addc_u32 / s_setpc_b64 sequence
       ; targeting the label annotated %loop.
 459 entry:
 460   %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
 461   %cmp0 = fcmp ogt float %reg, 0.000000e+00
 462   br i1 %cmp0, label %loop, label %ret
 463
 464 loop:
       ; %phi is 1.0 on entry and 0.0 on every trip coming from %loop_body.
 465   %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
 466   call void asm sideeffect
 467     "v_nop_e64
 468      v_nop_e64", ""() #0
 469   %cmp1 = fcmp olt float %phi, 8.0
 470   br i1 %cmp1, label %loop_body, label %ret
 471
 472 loop_body:
 473   call void asm sideeffect
 474   "v_nop_e64
 475    v_nop_e64
 476    v_nop_e64
 477    v_nop_e64", ""() #0
 478   br label %loop
 479
 480 ret:
 481   store volatile i32 7, i32 addrspace(1)* undef
 482   ret void
 483 }
 484
 485 ; GCN-LABEL: {{^}}long_branch_hang:
 486 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
 487 ; GCN: s_cbranch_scc{{[0-1]}} [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 488 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 489
 490 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 491 ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}}
 492 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295
 493 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LONG_BR_DEST0]]-[[POST_GETPC]])>>32
 494 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 495 ; GCN-NEXT: [[LONG_BR_0]]:
 496
 497 ; GCN: [[LONG_BR_DEST0]]:
 498
 499 ; GCN-DAG: v_cmp_lt_i32
 500 ; GCN-DAG: v_cmp_ge_i32
 501
 502 ; GCN: s_cbranch_vccz
 503 ; GCN: s_setpc_b64
 504
 505 ; GCN: s_endpgm
 506 define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
       ; NOTE(review): the name suggests a regression test for a hang during
       ; branch expansion - not confirmable from this file.
       ; Several argument compares feed a small diamond-shaped CFG in which
       ; only %bb13 carries padding (four v_nop_e64). The checks above expect
       ; the first s_cbranch_scc to be followed by an s_getpc_b64 /
       ; s_add_u32 / s_addc_u32 / s_setpc_b64 sequence, and a later
       ; s_cbranch_vccz to be followed by another s_setpc_b64.
 507 bb:
 508   %tmp = icmp slt i32 %arg2, 9
 509   %tmp6 = icmp eq i32 %arg1, 0
 510   %tmp7 = icmp sgt i32 %arg4, 0
 511   %tmp8 = icmp sgt i32 %arg4, 5
 512   br i1 %tmp8, label %bb9, label %bb13
 513
 514 bb9:                                              ; preds = %bb
       ; %tmp10 is computed but not referenced again in this function.
 515   %tmp10 = and i1 %tmp7, %tmp
 516   %tmp11 = icmp slt i32 %arg3, %arg4
 517   %tmp12 = or i1 %tmp11, %tmp7
 518   br i1 %tmp12, label %bb19, label %bb14
 519
 520 bb13:                                             ; preds = %bb
 521   call void asm sideeffect
 522   "v_nop_e64
 523    v_nop_e64
 524    v_nop_e64
 525    v_nop_e64", ""() #0
 526   br i1 %tmp6, label %bb19, label %bb14
 527
 528 bb14:                                             ; preds = %bb13, %bb9
 529   %tmp15 = icmp slt i32 %arg3, %arg4
 530   %tmp16 = or i1 %tmp15, %tmp
 531   %tmp17 = and i1 %tmp6, %tmp16
 532   %tmp18 = zext i1 %tmp17 to i32
 533   br label %bb19
 534
 535 bb19:                                             ; preds = %bb14, %bb13, %bb9
       ; Only the %bb14 path supplies a defined value; the other two are undef.
 536   %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
 537   %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
 538   store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
 539   ret void
 540 }
 541
 542 attributes #0 = { nounwind } ; used by most kernels, the inline asm call sites and the s_sleep declaration
 543 attributes #1 = { nounwind readnone } ; used by the workitem.id.x declaration