llvm/test/CodeGen/AMDGPU/early-if-convert.ll

   1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   2 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   3
   4 ; FIXME: This leaves behind a now-unnecessary AND with exec.
   5
   6 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
   7 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
   8 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
   9 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  10 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
  11 ; GCN: buffer_store_dword [[RESULT]]
     ; Triangle CFG: %entry branches on (%v == 1.0) either through %if, which
     ; computes %v + %v, or straight to %endif, where a phi merges %v and the sum.
     ; The checks above expect a compare, an add and a v_cndmask feeding the store.
  12 define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  13 entry:
  14   %v = load float, float addrspace(1)* %in
  15   %cc = fcmp oeq float %v, 1.000000e+00
  16   br i1 %cc, label %if, label %endif
  17
  18 if:
  19   %u = fadd float %v, %v
  20   br label %endif
  21
  22 endif:
  23   %r = phi float [ %v, %entry ], [ %u, %if ]
  24   store float %r, float addrspace(1)* %out
  25   ret void
  26 }
  27
  28 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
  29 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
  30 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
  31 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  32 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
     ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
  33 ; GCN: buffer_store_dword [[RESULT]]
     ; Diamond CFG: %if computes %v + %v, %else computes %v * %v, and the phi in
     ; %endif picks between them. The select check defines RESULT for this
     ; function; without it the store check would reuse the register captured
     ; by an earlier test, because FileCheck variables persist across labels.
     ; The compare is the inverted (neq) form, so the select takes the %else
     ; value (MUL) when vcc is set and the %if value (ADD) otherwise.
  34 define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  35 entry:
  36   %v = load float, float addrspace(1)* %in
  37   %cc = fcmp oeq float %v, 1.000000e+00
  38   br i1 %cc, label %if, label %else
  39
  40 if:
  41   %u0 = fadd float %v, %v
  42   br label %endif
  43
  44 else:
  45   %u1 = fmul float %v, %v
  46   br label %endif
  47
  48 endif:
  49   %r = phi float [ %u0, %if ], [ %u1, %else ]
  50   store float %r, float addrspace(1)* %out
  51   ret void
  52 }
  53
  54 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
  55 ; GCN: ; clobber vcc
  56 ; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
  57 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
  58 ; GCN: s_mov_b64 vcc, [[CMP]]
  59 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
     ; Triangle whose %if block holds inline asm that clobbers vcc, followed by an
     ; integer add. The checks above expect the compare result to be written to an
     ; SGPR pair and copied back into vcc (s_mov_b64) before the v_cndmask uses it.
  60 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
  61 entry:
  62   %v = load i32, i32 addrspace(1)* %in
  63   %cc = fcmp oeq float %k, 1.000000e+00
  64   br i1 %cc, label %if, label %endif
  65
  66 if:
  67   call void asm "; clobber $0", "~{vcc}"() #0
  68   %u = add i32 %v, %v
  69   br label %endif
  70
  71 endif:
  72   %r = phi i32 [ %v, %entry ], [ %u, %if ]
  73   store i32 %r, i32 addrspace(1)* %out
  74   ret void
  75 }
  76
  77 ; Longest chain of cheap instructions to convert
  78 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
  79 ; GCN: v_mul_f32
  80 ; GCN: v_mul_f32
  81 ; GCN: v_mul_f32
  82 ; GCN: v_mul_f32
  83 ; GCN: v_mul_f32
  84 ; GCN: v_mul_f32
  85 ; GCN: v_mul_f32
  86 ; GCN: v_mul_f32
  87 ; GCN: v_mul_f32
  88 ; GCN: v_cndmask_b32_e32
     ; The %if block is a dependent chain of nine fmuls (%u.0 .. %u.8). The checks
     ; above expect nine v_mul_f32 followed by a v_cndmask_b32_e32 select.
  89 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  90 entry:
  91   %v = load float, float addrspace(1)* %in
  92   %cc = fcmp oeq float %v, 1.000000e+00
  93   br i1 %cc, label %if, label %endif
  94
  95 if:
  96   %u.0 = fmul float %v, %v
  97   %u.1 = fmul float %v, %u.0
  98   %u.2 = fmul float %v, %u.1
  99   %u.3 = fmul float %v, %u.2
 100   %u.4 = fmul float %v, %u.3
 101   %u.5 = fmul float %v, %u.4
 102   %u.6 = fmul float %v, %u.5
 103   %u.7 = fmul float %v, %u.6
 104   %u.8 = fmul float %v, %u.7
 105   br label %endif
 106
 107 endif:
 108   %r = phi float [ %v, %entry ], [ %u.8, %if ]
 109   store float %r, float addrspace(1)* %out
 110   ret void
 111 }
 112
 113 ; Short chain of cheap instructions to not convert
 114 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
 115 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 116
 117 ; GCN: v_mul_f32
 118 ; GCN: v_mul_f32
 119 ; GCN: v_mul_f32
 120 ; GCN: v_mul_f32
 121 ; GCN: v_mul_f32
 122 ; GCN: v_mul_f32
 123 ; GCN: v_mul_f32
 124 ; GCN: v_mul_f32
 125 ; GCN: v_mul_f32
 126 ; GCN: v_mul_f32
 127
 128 ; GCN: [[ENDIF]]:
 129 ; GCN: buffer_store_dword
     ; The %if block is a dependent chain of ten fmuls (%u.0 .. %u.9), one more
     ; than test_vccnz_ifcvt_triangle_max_cheap. The checks above expect the
     ; s_cbranch_vccnz to remain, with the ten v_mul_f32 before the branch target.
 130 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 131 entry:
 132   %v = load float, float addrspace(1)* %in
 133   %cc = fcmp oeq float %v, 1.000000e+00
 134   br i1 %cc, label %if, label %endif
 135
 136 if:
 137   %u.0 = fmul float %v, %v
 138   %u.1 = fmul float %v, %u.0
 139   %u.2 = fmul float %v, %u.1
 140   %u.3 = fmul float %v, %u.2
 141   %u.4 = fmul float %v, %u.3
 142   %u.5 = fmul float %v, %u.4
 143   %u.6 = fmul float %v, %u.5
 144   %u.7 = fmul float %v, %u.6
 145   %u.8 = fmul float %v, %u.7
 146   %u.9 = fmul float %v, %u.8
 147   br label %endif
 148
 149 endif:
 150   %r = phi float [ %v, %entry ], [ %u.9, %if ]
 151   store float %r, float addrspace(1)* %out
 152   ret void
 153 }
 154
 155 ; Should still branch over fdiv expansion
 156 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
 157 ; GCN: v_cmp_neq_f32_e32
 158 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 159
 160 ; GCN: v_div_scale_f32
 161
 162 ; GCN: [[ENDIF]]:
 163 ; GCN: buffer_store_dword
     ; The %if block holds a single fdiv. The checks above expect the branch to
     ; remain, with v_div_scale_f32 between the s_cbranch_vccnz and its target.
 164 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 165 entry:
 166   %v = load float, float addrspace(1)* %in
 167   %cc = fcmp oeq float %v, 1.000000e+00
 168   br i1 %cc, label %if, label %endif
 169
 170 if:
 171   %u = fdiv float %v, %v
 172   br label %endif
 173
 174 endif:
 175   %r = phi float [ %v, %entry ], [ %u, %if ]
 176   store float %r, float addrspace(1)* %out
 177   ret void
 178 }
 179
 180 ; vcc branch with SGPR inputs
 181 ; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
 182 ; GCN: v_cmp_neq_f32_e64
 183 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 184
 185 ; GCN: s_add_i32
 186
 187 ; GCN: [[ENDIF]]:
 188 ; GCN: buffer_store_dword
     ; The condition is a float compare on the kernel argument %cnd, while the
     ; selected values are an i32 loaded from addrspace(4) and its doubling.
     ; The checks above expect the branch to remain around the s_add_i32.
 189 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
 190 entry:
 191   %v = load i32, i32 addrspace(4)* %in
 192   %cc = fcmp oeq float %cnd, 1.000000e+00
 193   br i1 %cc, label %if, label %endif
 194
 195 if:
 196   %u = add i32 %v, %v
 197   br label %endif
 198
 199 endif:
 200   %r = phi i32 [ %v, %entry ], [ %u, %if ]
 201   store i32 %r, i32 addrspace(1)* %out
 202   ret void
 203
 204 }
 205
 206 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 207 ; GCN: v_cndmask_b32
     ; Same triangle as test_vccnz_ifcvt_triangle, except that %v is loaded from
     ; addrspace(4) instead of addrspace(1). The check above only requires that a
     ; v_cndmask_b32 select appears.
 208 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
 209 entry:
 210   %v = load float, float addrspace(4)* %in
 211   %cc = fcmp oeq float %v, 1.000000e+00
 212   br i1 %cc, label %if, label %endif
 213
 214 if:
 215   %u = fadd float %v, %v
 216   br label %endif
 217
 218 endif:
 219   %r = phi float [ %v, %entry ], [ %u, %if ]
 220   store float %r, float addrspace(1)* %out
 221   ret void
 222 }
 223
 224 ; Due to broken cost heuristic, this is not if converted like
 225 ; test_vccnz_ifcvt_triangle_constant_load even though it should be.
 226
 227 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 228 ; GCN: v_cndmask_b32
     ; Here %v is a float kernel argument rather than a loaded value.
     ; NOTE(review): the check above expects a v_cndmask_b32, the same select form
     ; checked in test_vccnz_ifcvt_triangle_constant_load, which appears to
     ; contradict the "not if converted" remark above. One of the two may be
     ; stale -- confirm against current llc output.
 229 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
 230 entry:
 231   %cc = fcmp oeq float %v, 1.000000e+00
 232   br i1 %cc, label %if, label %endif
 233
 234 if:
 235   %u = fadd float %v, %v
 236   br label %endif
 237
 238 endif:
 239   %r = phi float [ %v, %entry ], [ %u, %if ]
 240   store float %r, float addrspace(1)* %out
 241   ret void
 242 }
 243
 244 ; Scalar branch and scalar inputs
 245 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
 246 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 247 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 248 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 249 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
     ; The condition is an integer compare of the kernel argument %cond with 1, and
     ; the merged value is consumed by inline asm through an "s" constraint.
     ; The checks above expect an s_cselect_b32 directly after the s_cmp_lg_u32.
 250 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
 251 entry:
 252   %v = load i32, i32 addrspace(4)* %in
 253   %cc = icmp eq i32 %cond, 1
 254   br i1 %cc, label %if, label %endif
 255
 256 if:
 257   %u = add i32 %v, %v
 258   br label %endif
 259
 260 endif:
 261   %r = phi i32 [ %v, %entry ], [ %u, %if ]
 262   call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
 263   ret void
 264 }
 265
 266 ; FIXME: Should be able to use VALU compare and select
 267 ; Scalar branch but VGPR select operands
 268 ; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
 269 ; GCN: s_cmp_lg_u32
 270 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 271
 272 ; GCN: v_add_f32_e32
 273
 274 ; GCN: [[ENDIF]]:
 275 ; GCN: buffer_store_dword
     ; The condition is an integer compare of the kernel argument %cond with 1,
     ; while the merged values are a float loaded from addrspace(1) and its sum
     ; with itself. The checks above expect the s_cbranch_scc1 to remain around
     ; the v_add_f32_e32.
 276 define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
 277 entry:
 278   %v = load float, float addrspace(1)* %in
 279   %cc = icmp eq i32 %cond, 1
 280   br i1 %cc, label %if, label %endif
 281
 282 if:
 283   %u = fadd float %v, %v
 284   br label %endif
 285
 286 endif:
 287   %r = phi float [ %v, %entry ], [ %u, %if ]
 288   store float %r, float addrspace(1)* %out
 289   ret void
 290 }
 291
 292 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
 293 ; GCN: s_add_u32
 294 ; GCN: s_addc_u32
 295 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 296 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
     ; i64 variant of the scalar triangle: the %if block doubles a 64-bit value
     ; loaded from addrspace(4). The checks above expect an add/add-with-carry pair
     ; and a single s_cselect_b64 directly after the compare.
 297 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
 298 entry:
 299   %v = load i64, i64 addrspace(4)* %in
 300   %cc = icmp eq i32 %cond, 1
 301   br i1 %cc, label %if, label %endif
 302
 303 if:
 304   %u = add i64 %v, %v
 305   br label %endif
 306
 307 endif:
 308   %r = phi i64 [ %v, %entry ], [ %u, %if ]
 309   call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
 310   ret void
 311 }
 312
 313 ; TODO: Can do s_cselect_b64; s_cselect_b32
 314 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
 315 ; GCN: s_add_i32
 316 ; GCN: s_add_i32
 317 ; GCN: s_add_i32
 318 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 319 ; GCN-NEXT: s_cselect_b32 s
 320 ; GCN-NEXT: s_cselect_b32 s
 321 ; GCN-NEXT: s_cselect_b32 s
     ; <3 x i32> variant of the scalar triangle. The merged vector is widened to
     ; <4 x i32> with a shufflevector (second operand undef) before being passed
     ; to the inline asm. The checks above expect three s_add_i32 and three
     ; consecutive s_cselect_b32 directly after the compare.
 322 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
 323 entry:
 324   %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
 325   %cc = icmp eq i32 %cond, 1
 326   br i1 %cc, label %if, label %endif
 327
 328 if:
 329   %u = add <3 x i32> %v, %v
 330   br label %endif
 331
 332 endif:
 333   %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
 334   %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 335   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
 336   ret void
 337 }
 338
 339 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
 340 ; GCN: s_add_i32
 341 ; GCN: s_add_i32
 342 ; GCN: s_add_i32
 343 ; GCN: s_add_i32
 344 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 345 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 346 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
     ; <4 x i32> variant of the scalar triangle. The checks above expect four
     ; s_add_i32 and two consecutive s_cselect_b64 directly after the compare.
 347 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
 348 entry:
 349   %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
 350   %cc = icmp eq i32 %cond, 1
 351   br i1 %cc, label %if, label %endif
 352
 353 if:
 354   %u = add <4 x i32> %v, %v
 355   br label %endif
 356
 357 endif:
 358   %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
 359   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
 360   ret void
 361 }
 362
 363 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 364 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 365 ; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
     ; Diamond whose %if and %else blocks are empty; the phi in %done merges the
     ; constants 0 (from %if) and 1 (from %else). The branch sends the "equal to
     ; zero" case to %else. The checks above expect a compare against 0 followed by
     ; an s_cselect_b32 with the operands 0, 1.
 366 define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 367 entry:
 368   %cmp0 = icmp eq i32 %cond, 0
 369   br i1 %cmp0, label %else, label %if
 370
 371 if:
 372   br label %done
 373
 374 else:
 375   br label %done
 376
 377 done:
 378   %value = phi i32 [0, %if], [1, %else]
 379   store i32 %value, i32 addrspace(1)* %out
 380   ret void
 381 }
 382
 383 ; GCN-LABEL: {{^}}ifcvt_undef_scc:
 384 ; GCN: {{^}}; %bb.0:
 385 ; GCN-NEXT: s_load_dwordx2
 386 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
     ; Same empty diamond as uniform_if_swap_br_targets_scc_constant_select, but
     ; the branch condition is undef. The checks above expect the s_cselect_b32 to
     ; follow the s_load_dwordx2 immediately, with no compare in between.
 387 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 388 entry:
 389   br i1 undef, label %else, label %if
 390
 391 if:
 392   br label %done
 393
 394 else:
 395   br label %done
 396
 397 done:
 398   %value = phi i32 [0, %if], [1, %else]
 399   store i32 %value, i32 addrspace(1)* %out
 400   ret void
 401 }
 402
 403 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
 404 ; GCN: v_cmp_neq_f32
 405 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 406
 407 ; GCN: v_add_i32
 408 ; GCN: v_add_i32
 409
 410 ; GCN: [[ENDIF]]:
 411 ; GCN: buffer_store_dword
     ; Triangle over an <8 x i32> value with a float compare on the kernel argument
     ; %cnd as the condition. The checks above expect the s_cbranch_vccnz to remain,
     ; with at least two v_add_i32 before the branch target.
 412 define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
 413 entry:
 414   %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
 415   %cc = fcmp oeq float %cnd, 1.000000e+00
 416   br i1 %cc, label %if, label %endif
 417
 418 if:
 419   %u = add <8 x i32> %v, %v
 420   br label %endif
 421
 422 endif:
 423   %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
 424   store <8 x i32> %r, <8 x i32> addrspace(1)* %out
 425   ret void
 426 }
 427
 428 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
 429 ; GCN: v_cmp_neq_f32
 430 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 431
 432 ; GCN: v_add_i32
 433 ; GCN: v_add_i32
 434
 435 ; GCN: [[ENDIF]]:
 436 ; GCN: buffer_store_dword
     ; Triangle over a <16 x i32> value with a float compare on the kernel argument
     ; %cnd as the condition. The checks above expect the s_cbranch_vccnz to remain,
     ; with at least two v_add_i32 before the branch target.
 437 define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
 438 entry:
 439   %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
 440   %cc = fcmp oeq float %cnd, 1.000000e+00
 441   br i1 %cc, label %if, label %endif
 442
 443 if:
 444   %u = add <16 x i32> %v, %v
 445   br label %endif
 446
 447 endif:
 448   %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
 449   store <16 x i32> %r, <16 x i32> addrspace(1)* %out
 450   ret void
 451 }
 452
 453 attributes #0 = { nounwind }