llvm/test/CodeGen/AMDGPU/early-if-convert.ll

   1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   2 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   3
   4 ; Note: breaking up large PHIs is disabled to prevent some testcases from becoming
   5 ;  branchless.
   6
   7 ; FIXME: This leaves behind a now-unnecessary `and` with exec.
   8
   9 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
  10 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
  11 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
  12 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  13 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
  14 ; GCN: buffer_store_dword [[RESULT]]
  15 define amdgpu_kernel void @test_vccnz_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
     ; Triangle CFG keyed on a float compare of the value loaded from %in: %entry
     ; either runs %if or jumps straight to %endif. The FileCheck lines above expect
     ; the merge to be emitted as a v_cndmask_b32 select on vcc.
  16 entry:
  17   %v = load float, ptr addrspace(1) %in
  18   %cc = fcmp oeq float %v, 1.000000e+00
  19   br i1 %cc, label %if, label %endif
  20
  21 if:
       ; The only conditionally executed instruction: a single fadd.
  22   %u = fadd float %v, %v
  23   br label %endif
  24
  25 endif:
       ; %r is %v when %if was skipped and %u when it ran.
  26   %r = phi float [ %v, %entry ], [ %u, %if ]
  27   store float %r, ptr addrspace(1) %out
  28   ret void
  29 }
  30
  31 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
  32 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
  33 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
  34 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  35 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
     ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
  36 ; GCN: buffer_store_dword [[RESULT]]
  37 define amdgpu_kernel void @test_vccnz_ifcvt_diamond(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
     ; Diamond CFG: %entry branches to %if or %else and both fall into %endif.
     ; The checks above expect both arms to be computed unconditionally and then
     ; merged by a select on vcc. The compare is checked in its inverted (neq) form,
     ; so a set vcc bit picks the %else value (the fmul) and a clear bit picks the
     ; %if value (the fadd); the stored register must be the select result, not
     ; either arm directly.
  38 entry:
  39   %v = load float, ptr addrspace(1) %in
  40   %cc = fcmp oeq float %v, 1.000000e+00
  41   br i1 %cc, label %if, label %else
  42
  43 if:
       ; Arm taken when %v == 1.0.
  44   %u0 = fadd float %v, %v
  45   br label %endif
  46
  47 else:
       ; Arm taken when the oeq compare is false.
  48   %u1 = fmul float %v, %v
  49   br label %endif
  50
  51 endif:
       ; Merge of the two arms; this is the value that must reach the store.
  52   %r = phi float [ %u0, %if ], [ %u1, %else ]
  53   store float %r, ptr addrspace(1) %out
  54   ret void
  55 }
  56
  57 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
  58 ; GCN: ; clobber vcc
  59 ; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
  60 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
  61 ; GCN: s_mov_b64 vcc, [[CMP]]
  62 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
  63 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
     ; Triangle whose %if block contains inline asm that lists vcc as clobbered.
     ; The branch condition comes from the float argument %k, not from the loaded
     ; i32. The checks above expect the compare result to be kept in an SGPR pair
     ; and copied back into vcc (s_mov_b64) right before the v_cndmask_b32.
  64 entry:
  65   %v = load i32, ptr addrspace(1) %in
  66   %cc = fcmp oeq float %k, 1.000000e+00
  67   br i1 %cc, label %if, label %endif
  68
  69 if:
       ; Inline asm with no operands and a "~{vcc}" clobber constraint.
  70   call void asm "; clobber $0", "~{vcc}"() #0
  71   %u = add i32 %v, %v
  72   br label %endif
  73
  74 endif:
       ; %r is the loaded %v when %if was skipped, otherwise %v + %v.
  75   %r = phi i32 [ %v, %entry ], [ %u, %if ]
  76   store i32 %r, ptr addrspace(1) %out
  77   ret void
  78 }
  79
  80 ; Longest chain of cheap instructions to convert
  81 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
  82 ; GCN: v_mul_f32
  83 ; GCN: v_mul_f32
  84 ; GCN: v_mul_f32
  85 ; GCN: v_mul_f32
  86 ; GCN: v_mul_f32
  87 ; GCN: v_mul_f32
  88 ; GCN: v_mul_f32
  89 ; GCN: v_mul_f32
  90 ; GCN: v_mul_f32
  91 ; GCN: v_cndmask_b32_e32
  92 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
     ; Triangle whose %if block is a chain of nine dependent fmuls. The checks
     ; above expect nine v_mul_f32 followed by a v_cndmask_b32_e32, i.e. the select
     ; form rather than a branch.
  93 entry:
  94   %v = load float, ptr addrspace(1) %in
  95   %cc = fcmp oeq float %v, 1.000000e+00
  96   br i1 %cc, label %if, label %endif
  97
  98 if:
       ; Each fmul consumes the previous result, so the nine form one serial chain.
  99   %u.0 = fmul float %v, %v
 100   %u.1 = fmul float %v, %u.0
 101   %u.2 = fmul float %v, %u.1
 102   %u.3 = fmul float %v, %u.2
 103   %u.4 = fmul float %v, %u.3
 104   %u.5 = fmul float %v, %u.4
 105   %u.6 = fmul float %v, %u.5
 106   %u.7 = fmul float %v, %u.6
 107   %u.8 = fmul float %v, %u.7
 108   br label %endif
 109
 110 endif:
       ; Merge of the untouched %v with the end of the fmul chain.
 111   %r = phi float [ %v, %entry ], [ %u.8, %if ]
 112   store float %r, ptr addrspace(1) %out
 113   ret void
 114 }
 115
 116 ; Shortest chain of cheap instructions that is not converted (one fmul more than the max_cheap case)
 117 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
 118 ; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
 119
 120 ; GCN: v_mul_f32
 121 ; GCN: v_mul_f32
 122 ; GCN: v_mul_f32
 123 ; GCN: v_mul_f32
 124 ; GCN: v_mul_f32
 125 ; GCN: v_mul_f32
 126 ; GCN: v_mul_f32
 127 ; GCN: v_mul_f32
 128 ; GCN: v_mul_f32
 129 ; GCN: v_mul_f32
 130
 131 ; GCN: [[ENDIF]]:
 132 ; GCN: buffer_store_dword
 133 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
     ; Same shape as test_vccnz_ifcvt_triangle_max_cheap but with ten dependent
     ; fmuls instead of nine. The checks above expect the s_cbranch_vccnz to be
     ; kept, with the ten v_mul_f32 between the branch and its target label.
 134 entry:
 135   %v = load float, ptr addrspace(1) %in
 136   %cc = fcmp oeq float %v, 1.000000e+00
 137   br i1 %cc, label %if, label %endif
 138
 139 if:
       ; Serial chain: every fmul depends on the one before it.
 140   %u.0 = fmul float %v, %v
 141   %u.1 = fmul float %v, %u.0
 142   %u.2 = fmul float %v, %u.1
 143   %u.3 = fmul float %v, %u.2
 144   %u.4 = fmul float %v, %u.3
 145   %u.5 = fmul float %v, %u.4
 146   %u.6 = fmul float %v, %u.5
 147   %u.7 = fmul float %v, %u.6
 148   %u.8 = fmul float %v, %u.7
 149   %u.9 = fmul float %v, %u.8
 150   br label %endif
 151
 152 endif:
       ; Merge of the untouched %v with the end of the fmul chain.
 153   %r = phi float [ %v, %entry ], [ %u.9, %if ]
 154   store float %r, ptr addrspace(1) %out
 155   ret void
 156 }
 157
 158 ; Should still branch over fdiv expansion
 159 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
 160 ; GCN: v_cmp_neq_f32_e32
 161 ; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
 162
 163 ; GCN: v_div_scale_f32
 164
 165 ; GCN: [[ENDIF]]:
 166 ; GCN: buffer_store_dword
 167 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
     ; Triangle whose %if block is a single fdiv. The checks above expect the
     ; s_cbranch_vccnz to remain, with v_div_scale_f32 appearing between the
     ; branch and its target label.
 168 entry:
 169   %v = load float, ptr addrspace(1) %in
 170   %cc = fcmp oeq float %v, 1.000000e+00
 171   br i1 %cc, label %if, label %endif
 172
 173 if:
       ; One IR instruction, but the checks look for part of its multi-instruction expansion.
 174   %u = fdiv float %v, %v
 175   br label %endif
 176
 177 endif:
       ; %r is %v when %if was skipped, otherwise the quotient.
 178   %r = phi float [ %v, %entry ], [ %u, %if ]
 179   store float %r, ptr addrspace(1) %out
 180   ret void
 181 }
 182
 183 ; vcc branch with SGPR inputs
 184 ; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
 185 ; GCN: v_cmp_neq_f32_e64
 186 ; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
 187
 188 ; GCN: s_add_i32
 189
 190 ; GCN: [[ENDIF]]:
 191 ; GCN: buffer_store_dword
 192 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(4) %in, float %cnd) #0 {
     ; Triangle with a float compare on the argument %cnd, while the value being
     ; merged is an i32 loaded through the addrspace(4) pointer %in. The checks
     ; above expect the s_cbranch_vccnz to be kept around an s_add_i32.
 193 entry:
 194   %v = load i32, ptr addrspace(4) %in
 195   %cc = fcmp oeq float %cnd, 1.000000e+00
 196   br i1 %cc, label %if, label %endif
 197
 198 if:
       ; Integer add on the loaded value; the checks expect it in scalar form.
 199   %u = add i32 %v, %v
 200   br label %endif
 201
 202 endif:
       ; %r is %v when %if was skipped, otherwise %v + %v.
 203   %r = phi i32 [ %v, %entry ], [ %u, %if ]
 204   store i32 %r, ptr addrspace(1) %out
 205   ret void
 206
 207 }
 208
 209 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 210 ; GCN: v_cndmask_b32
 211 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
     ; Same triangle as test_vccnz_ifcvt_triangle, except the float is loaded
     ; through an addrspace(4) pointer. The check above only requires a
     ; v_cndmask_b32 to appear.
 212 entry:
 213   %v = load float, ptr addrspace(4) %in
 214   %cc = fcmp oeq float %v, 1.000000e+00
 215   br i1 %cc, label %if, label %endif
 216
 217 if:
       ; Single fadd on the conditional path.
 218   %u = fadd float %v, %v
 219   br label %endif
 220
 221 endif:
       ; %r is %v when %if was skipped and %u when it ran.
 222   %r = phi float [ %v, %entry ], [ %u, %if ]
 223   store float %r, ptr addrspace(1) %out
 224   ret void
 225 }
 226
 227 ; NOTE(review): this comment used to say that a broken cost heuristic keeps this from being if converted like
 228 ; test_vccnz_ifcvt_triangle_constant_load, but the check below expects v_cndmask_b32 (i.e. converted) -- confirm.
 229
 230 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 231 ; GCN: v_cndmask_b32
 232 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(ptr addrspace(1) %out, float %v) #0 {
     ; Triangle where the compared and merged float is the kernel argument %v
     ; itself; there is no explicit load in the IR. The check above only requires
     ; a v_cndmask_b32 to appear.
 233 entry:
 234   %cc = fcmp oeq float %v, 1.000000e+00
 235   br i1 %cc, label %if, label %endif
 236
 237 if:
       ; Single fadd on the conditional path.
 238   %u = fadd float %v, %v
 239   br label %endif
 240
 241 endif:
       ; %r is the argument %v when %if was skipped and %u when it ran.
 242   %r = phi float [ %v, %entry ], [ %u, %if ]
 243   store float %r, ptr addrspace(1) %out
 244   ret void
 245 }
 246
 247 ; Scalar branch and scalar inputs
 248 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
 249 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 250 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 251 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 252 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
 253 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(ptr addrspace(4) %in, i32 %cond) #0 {
     ; Triangle with an integer compare on the argument %cond and an i32 value
     ; loaded through the addrspace(4) pointer %in. The checks above expect
     ; s_cmp_lg_u32 immediately followed by an s_cselect_b32 of the loaded value
     ; and the s_add_i32 result.
 254 entry:
 255   %v = load i32, ptr addrspace(4) %in
 256   %cc = icmp eq i32 %cond, 1
 257   br i1 %cc, label %if, label %endif
 258
 259 if:
       ; Single integer add on the conditional path.
 260   %u = add i32 %v, %v
 261   br label %endif
 262
 263 endif:
 264   %r = phi i32 [ %v, %entry ], [ %u, %if ]
       ; The merged value is consumed by side-effecting inline asm through an "s"
       ; register constraint instead of being stored to memory.
 265   call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
 266   ret void
 267 }
 268
 269 ; FIXME: Should be able to use VALU compare and select
 270 ; Scalar branch but VGPR select operands
 271 ; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
 272 ; GCN: s_cmp_lg_u32
 273 ; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
 274
 275 ; GCN: v_add_f32_e32
 276
 277 ; GCN: [[ENDIF]]:
 278 ; GCN: buffer_store_dword
 279 define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
     ; Triangle with an integer compare on the argument %cond, while the merged
     ; value is a float loaded through the addrspace(1) pointer %in. The checks
     ; above expect s_cmp_lg_u32 plus a retained s_cbranch_scc1 around the
     ; v_add_f32_e32.
 280 entry:
 281   %v = load float, ptr addrspace(1) %in
 282   %cc = icmp eq i32 %cond, 1
 283   br i1 %cc, label %if, label %endif
 284
 285 if:
       ; Single fadd on the conditional path.
 286   %u = fadd float %v, %v
 287   br label %endif
 288
 289 endif:
       ; %r is %v when %if was skipped and %u when it ran.
 290   %r = phi float [ %v, %entry ], [ %u, %if ]
 291   store float %r, ptr addrspace(1) %out
 292   ret void
 293 }
 294
 295 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
 296 ; GCN: s_add_u32
 297 ; GCN: s_addc_u32
 298 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 299 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 300 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(ptr addrspace(4) %in, i32 %cond) #0 {
     ; 64-bit variant of test_scc1_sgpr_ifcvt_triangle: the merged value is an
     ; i64. The checks above expect an s_add_u32/s_addc_u32 pair and a single
     ; s_cselect_b64 directly after the s_cmp_lg_u32.
 301 entry:
 302   %v = load i64, ptr addrspace(4) %in
 303   %cc = icmp eq i32 %cond, 1
 304   br i1 %cc, label %if, label %endif
 305
 306 if:
       ; Single 64-bit integer add on the conditional path.
 307   %u = add i64 %v, %v
 308   br label %endif
 309
 310 endif:
 311   %r = phi i64 [ %v, %entry ], [ %u, %if ]
       ; Consumed by side-effecting inline asm through an "s" register constraint.
 312   call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
 313   ret void
 314 }
 315
 316 ; TODO: Can do s_cselect_b64; s_cselect_b32
 317 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
 318 ; GCN: s_add_i32
 319 ; GCN: s_add_i32
 320 ; GCN: s_add_i32
 321 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 322 ; GCN-NEXT: s_cselect_b32 s
 323 ; GCN-NEXT: s_cselect_b32 s
 324 ; GCN-NEXT: s_cselect_b32 s
 325 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(ptr addrspace(4) %in, i32 %cond) #0 {
     ; <3 x i32> variant of test_scc1_sgpr_ifcvt_triangle. The checks above expect
     ; three s_add_i32 and three consecutive s_cselect_b32 after the s_cmp_lg_u32.
 326 entry:
 327   %v = load <3 x i32>, ptr addrspace(4) %in
 328   %cc = icmp eq i32 %cond, 1
 329   br i1 %cc, label %if, label %endif
 330
 331 if:
       ; Element-wise add of the three lanes.
 332   %u = add <3 x i32> %v, %v
 333   br label %endif
 334
 335 endif:
 336   %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
       ; Widen to four lanes for the asm operand: mask indices 0-2 take the lanes
       ; of %r, index 3 takes lane 0 of the second (undef) operand.
 337   %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
       ; Consumed by side-effecting inline asm through an "s" register constraint.
 338   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
 339   ret void
 340 }
 341
 342 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
 343 ; GCN: s_add_i32
 344 ; GCN: s_add_i32
 345 ; GCN: s_add_i32
 346 ; GCN: s_add_i32
 347 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 348 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 349 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 350 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(ptr addrspace(4) %in, i32 %cond) #0 {
     ; <4 x i32> variant of test_scc1_sgpr_ifcvt_triangle. The checks above expect
     ; four s_add_i32 and two consecutive s_cselect_b64 after the s_cmp_lg_u32.
 351 entry:
 352   %v = load <4 x i32>, ptr addrspace(4) %in
 353   %cc = icmp eq i32 %cond, 1
 354   br i1 %cc, label %if, label %endif
 355
 356 if:
       ; Element-wise add of the four lanes.
 357   %u = add <4 x i32> %v, %v
 358   br label %endif
 359
 360 endif:
 361   %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
       ; Consumed by side-effecting inline asm through an "s" register constraint.
 362   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
 363   ret void
 364 }
 365
 366 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 367 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 368 ; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
 369 define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, ptr addrspace(1) %out) {
     ; Diamond with two empty arms whose only effect is which constant the phi
     ; picks. Note the branch lists %else as the true target and %if as the false
     ; target. The checks above expect s_cmp_lg_u32 against 0 and an
     ; s_cselect_b32 between the immediates 0 and 1.
 370 entry:
 371   %cmp0 = icmp eq i32 %cond, 0
 372   br i1 %cmp0, label %else, label %if
 373
 374 if:
 375   br label %done
 376
 377 else:
 378   br label %done
 379
 380 done:
       ; 0 when control came through %if (%cond != 0), 1 when through %else (%cond == 0).
 381   %value = phi i32 [0, %if], [1, %else]
 382   store i32 %value, ptr addrspace(1) %out
 383   ret void
 384 }
 385
 386 ; GCN-LABEL: {{^}}ifcvt_undef_scc:
 387 ; GCN: {{^}}; %bb.0:
 388 ; GCN-NEXT: s_load_dwordx2
 389 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
 390 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
     ; Same empty-armed diamond as uniform_if_swap_br_targets_scc_constant_select,
     ; but the branch condition is undef and %cond is unused. The checks above
     ; expect the first block to contain s_load_dwordx2 directly followed by
     ; s_cselect_b32, with no compare in between.
 391 entry:
 392   br i1 undef, label %else, label %if
 393
 394 if:
 395   br label %done
 396
 397 else:
 398   br label %done
 399
 400 done:
       ; Constant 0 via %if, constant 1 via %else.
 401   %value = phi i32 [0, %if], [1, %else]
 402   store i32 %value, ptr addrspace(1) %out
 403   ret void
 404 }
 405
 406 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
 407 ; GCN: v_cmp_neq_f32
 408 ; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
 409
 410 ; GCN: v_add_i32
 411 ; GCN: v_add_i32
 412
 413 ; GCN: [[ENDIF]]:
 414 ; GCN: buffer_store_dword
 415 define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
     ; Triangle merging an <8 x i32> value, with the float compare on the
     ; argument %cnd. The checks above expect the s_cbranch_vccnz to be kept, with
     ; v_add_i32 instructions between the branch and its target label.
 416 entry:
 417   %v = load <8 x i32>, ptr addrspace(1) %in
 418   %cc = fcmp oeq float %cnd, 1.000000e+00
 419   br i1 %cc, label %if, label %endif
 420
 421 if:
       ; Element-wise add of all eight lanes.
 422   %u = add <8 x i32> %v, %v
 423   br label %endif
 424
 425 endif:
       ; %r is %v when %if was skipped, otherwise %v + %v.
 426   %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
 427   store <8 x i32> %r, ptr addrspace(1) %out
 428   ret void
 429 }
 430
 431 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
 432 ; GCN: v_cmp_neq_f32
 433 ; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
 434
 435 ; GCN: v_add_i32
 436 ; GCN: v_add_i32
 437
 438 ; GCN: [[ENDIF]]:
 439 ; GCN: buffer_store_dword
 440 define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
     ; Triangle merging a <16 x i32> value, with the float compare on the
     ; argument %cnd. The checks above expect the s_cbranch_vccnz to be kept, with
     ; v_add_i32 instructions between the branch and its target label.
 441 entry:
 442   %v = load <16 x i32>, ptr addrspace(1) %in
 443   %cc = fcmp oeq float %cnd, 1.000000e+00
 444   br i1 %cc, label %if, label %endif
 445
 446 if:
       ; Element-wise add of all sixteen lanes.
 447   %u = add <16 x i32> %v, %v
 448   br label %endif
 449
 450 endif:
       ; %r is %v when %if was skipped, otherwise %v + %v.
 451   %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
 452   store <16 x i32> %r, ptr addrspace(1) %out
 453   ret void
 454 }
 455
 456 attributes #0 = { nounwind }