test/CodeGen/AMDGPU/early-if-convert.ll

   1 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   2 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   3
   4 ; FIXME: This leaves behind a now-unnecessary 'and' with exec.
   5
   6 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
   7 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
   8 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
   9 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  10 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
  11 ; GCN: buffer_store_dword [[RESULT]]
     ; Basic triangle: a float loaded from %in is compared against 1.0 and the
     ; conditional block holds a single fadd. The checks above expect the add
     ; and a v_cndmask_b32 choosing between the loaded and the added value.
  12 define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  13 entry:
  14   %val = load float, float addrspace(1)* %in
  15   %is.one = fcmp oeq float %val, 1.000000e+00
  16   br i1 %is.one, label %then, label %merge
  17
  18 then:
  19   %doubled = fadd float %val, %val
  20   br label %merge
  21
  22 merge:
  23   %res = phi float [ %val, %entry ], [ %doubled, %then ]
  24   store float %res, float addrspace(1)* %out
  25   ret void
  26 }
  27
  28 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
  29 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
  30 ; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
  31 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
  32 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
  33 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
  34 ; GCN: buffer_store_dword [[RESULT]]
     ; Diamond: each arm computes one value (fadd on one side, fmul on the
     ; other). The checks above expect both to be computed, in either order,
     ; and then combined with a single v_cndmask_b32.
  35 define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  36 entry:
  37   %val = load float, float addrspace(1)* %in
  38   %is.one = fcmp oeq float %val, 1.000000e+00
  39   br i1 %is.one, label %then, label %otherwise
  40
  41 then:
  42   %sum = fadd float %val, %val
  43   br label %merge
  44
  45 otherwise:
  46   %prod = fmul float %val, %val
  47   br label %merge
  48
  49 merge:
  50   %res = phi float [ %sum, %then ], [ %prod, %otherwise ]
  51   store float %res, float addrspace(1)* %out
  52   ret void
  53 }
  54
  55 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
  56 ; GCN: ; clobber vcc
  57 ; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
  58 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
  59 ; GCN: s_mov_b64 vcc, [[CMP]]
  60 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
     ; Triangle whose conditional block contains inline asm that lists vcc as
     ; clobbered. The checks above expect the compare result to be written to
     ; an SGPR pair and copied into vcc right before the v_cndmask_b32.
  61 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
  62 entry:
  63   %val = load i32, i32 addrspace(1)* %in
  64   %is.one = fcmp oeq float %k, 1.000000e+00
  65   br i1 %is.one, label %then, label %merge
  66
  67 then:
  68   call void asm "; clobber $0", "~{vcc}"() #0
  69   %doubled = add i32 %val, %val
  70   br label %merge
  71
  72 merge:
  73   %res = phi i32 [ %val, %entry ], [ %doubled, %then ]
  74   store i32 %res, i32 addrspace(1)* %out
  75   ret void
  76 }
  77
  78 ; Nine dependent fmuls in the conditional block: the longest chain of cheap
     ; instructions that is still expected to be converted (the checks end in a
     ; v_cndmask_b32 after nine v_mul_f32).
  79 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
  80 ; GCN: v_mul_f32
  81 ; GCN: v_mul_f32
  82 ; GCN: v_mul_f32
  83 ; GCN: v_mul_f32
  84 ; GCN: v_mul_f32
  85 ; GCN: v_mul_f32
  86 ; GCN: v_mul_f32
  87 ; GCN: v_mul_f32
  88 ; GCN: v_mul_f32
  89 ; GCN: v_cndmask_b32_e32
  90 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  91 entry:
  92   %val = load float, float addrspace(1)* %in
  93   %is.one = fcmp oeq float %val, 1.000000e+00
  94   br i1 %is.one, label %then, label %merge
  95
  96 then:
  97   %p.0 = fmul float %val, %val
  98   %p.1 = fmul float %val, %p.0
  99   %p.2 = fmul float %val, %p.1
 100   %p.3 = fmul float %val, %p.2
 101   %p.4 = fmul float %val, %p.3
 102   %p.5 = fmul float %val, %p.4
 103   %p.6 = fmul float %val, %p.5
 104   %p.7 = fmul float %val, %p.6
 105   %p.8 = fmul float %val, %p.7
 106   br label %merge
 107
 108 merge:
 109   %res = phi float [ %val, %entry ], [ %p.8, %then ]
 110   store float %res, float addrspace(1)* %out
 111   ret void
 112 }
 113
 114 ; Ten dependent fmuls in the conditional block, one more than in
     ; test_vccnz_ifcvt_triangle_max_cheap: the shortest chain of cheap
     ; instructions that is expected to be left as a branch.
 115 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
 116 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 117
 118 ; GCN: v_mul_f32
 119 ; GCN: v_mul_f32
 120 ; GCN: v_mul_f32
 121 ; GCN: v_mul_f32
 122 ; GCN: v_mul_f32
 123 ; GCN: v_mul_f32
 124 ; GCN: v_mul_f32
 125 ; GCN: v_mul_f32
 126 ; GCN: v_mul_f32
 127 ; GCN: v_mul_f32
 128
 129 ; GCN: [[ENDIF]]:
 130 ; GCN: buffer_store_dword
 131 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 132 entry:
 133   %val = load float, float addrspace(1)* %in
 134   %is.one = fcmp oeq float %val, 1.000000e+00
 135   br i1 %is.one, label %then, label %merge
 136
 137 then:
 138   %p.0 = fmul float %val, %val
 139   %p.1 = fmul float %val, %p.0
 140   %p.2 = fmul float %val, %p.1
 141   %p.3 = fmul float %val, %p.2
 142   %p.4 = fmul float %val, %p.3
 143   %p.5 = fmul float %val, %p.4
 144   %p.6 = fmul float %val, %p.5
 145   %p.7 = fmul float %val, %p.6
 146   %p.8 = fmul float %val, %p.7
 147   %p.9 = fmul float %val, %p.8
 148   br label %merge
 149
 150 merge:
 151   %res = phi float [ %val, %entry ], [ %p.9, %then ]
 152   store float %res, float addrspace(1)* %out
 153   ret void
 154 }
 155
 156 ; An fdiv in the conditional block: the branch over its expansion is
     ; expected to remain (the checks look for v_div_scale_f32 between the
     ; branch and its target label).
 157 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
 158 ; GCN: v_cmp_neq_f32_e32
 159 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 160
 161 ; GCN: v_div_scale_f32
 162
 163 ; GCN: [[ENDIF]]:
 164 ; GCN: buffer_store_dword
 165 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 166 entry:
 167   %val = load float, float addrspace(1)* %in
 168   %is.one = fcmp oeq float %val, 1.000000e+00
 169   br i1 %is.one, label %then, label %merge
 170
 171 then:
 172   %quot = fdiv float %val, %val
 173   br label %merge
 174
 175 merge:
 176   %res = phi float [ %val, %entry ], [ %quot, %then ]
 177   store float %res, float addrspace(1)* %out
 178   ret void
 179 }
 180
 181 ; The branch is on vcc while the values being selected are SGPR inputs;
     ; the checks expect the branch to stay, with s_add_i32 in the skipped block.
 182 ; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
 183 ; GCN: v_cmp_neq_f32_e64
 184 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 185
 186 ; GCN: s_add_i32
 187
 188 ; GCN: [[ENDIF]]:
 189 ; GCN: buffer_store_dword
 190 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
 191 entry:
 192   %val = load i32, i32 addrspace(4)* %in
 193   %is.one = fcmp oeq float %cnd, 1.000000e+00
 194   br i1 %is.one, label %then, label %merge
 195
 196 then:
 197   %doubled = add i32 %val, %val
 198   br label %merge
 199
 200 merge:
 201   %res = phi i32 [ %val, %entry ], [ %doubled, %then ]
 202   store i32 %res, i32 addrspace(1)* %out
 203   ret void
 204
 205 }
 206
 207 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 208 ; GCN: v_cndmask_b32
     ; Same shape as the basic float triangle, but the value is loaded through
     ; an addrspace(4) pointer. The check above expects a v_cndmask_b32.
 209 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
 210 entry:
 211   %val = load float, float addrspace(4)* %in
 212   %is.one = fcmp oeq float %val, 1.000000e+00
 213   br i1 %is.one, label %then, label %merge
 214
 215 then:
 216   %doubled = fadd float %val, %val
 217   br label %merge
 218
 219 merge:
 220   %res = phi float [ %val, %entry ], [ %doubled, %then ]
 221   store float %res, float addrspace(1)* %out
 222   ret void
 223 }
 224
 225 ; Float triangle where the tested value is the kernel argument %v itself.
 226 ; Original note: due to a broken cost heuristic, this is not if-converted
     ; like test_vccnz_ifcvt_triangle_constant_load even though it should be.
     ; NOTE(review): the check below expects v_cndmask_b32, which is what the
     ; converted tests in this file check for, so the note and the check
     ; disagree - confirm which one is stale.
 227
 228 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 229 ; GCN: v_cndmask_b32
 230 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
 231 entry:
 232   %is.one = fcmp oeq float %v, 1.000000e+00
 233   br i1 %is.one, label %then, label %merge
 234
 235 then:
 236   %doubled = fadd float %v, %v
 237   br label %merge
 238
 239 merge:
 240   %res = phi float [ %v, %entry ], [ %doubled, %then ]
 241   store float %res, float addrspace(1)* %out
 242   ret void
 243 }
 244
 245 ; Both the branch condition and the selected values are scalar; the checks
     ; expect an s_cselect_b32 on the line right after the s_cmp_lg_u32.
 246 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
 247 ; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 248 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 249 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 250 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
 251 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
 252 entry:
 253   %val = load i32, i32 addrspace(4)* %in
 254   %is.one = icmp eq i32 %cond, 1
 255   br i1 %is.one, label %then, label %merge
 256
 257 then:
 258   %doubled = add i32 %val, %val
 259   br label %merge
 260
 261 merge:
 262   %res = phi i32 [ %val, %entry ], [ %doubled, %then ]
 263   call void asm sideeffect "; reg use $0", "s"(i32 %res) #0
 264   ret void
 265 }
 266
 267 ; FIXME: A VALU compare and select should be usable here.
 268 ; The branch is scalar but the select operands are VGPRs; the checks
     ; currently expect the s_cbranch_scc1 to remain around the v_add_f32.
 269 ; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
 270 ; GCN: s_cmp_lg_u32
 271 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 272
 273 ; GCN: v_add_f32_e32
 274
 275 ; GCN: [[ENDIF]]:
 276 ; GCN: buffer_store_dword
 277 define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
 278 entry:
 279   %val = load float, float addrspace(1)* %in
 280   %is.one = icmp eq i32 %cond, 1
 281   br i1 %is.one, label %then, label %merge
 282
 283 then:
 284   %doubled = fadd float %val, %val
 285   br label %merge
 286
 287 merge:
 288   %res = phi float [ %val, %entry ], [ %doubled, %then ]
 289   store float %res, float addrspace(1)* %out
 290   ret void
 291 }
 292
 293 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
 294 ; GCN: s_add_u32
 295 ; GCN: s_addc_u32
 296 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 297 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
     ; i64 version of the scalar triangle: the checks above expect the add as
     ; an s_add_u32/s_addc_u32 pair and a single s_cselect_b64 for the result.
 298 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
 299 entry:
 300   %val = load i64, i64 addrspace(4)* %in
 301   %is.one = icmp eq i32 %cond, 1
 302   br i1 %is.one, label %then, label %merge
 303
 304 then:
 305   %doubled = add i64 %val, %val
 306   br label %merge
 307
 308 merge:
 309   %res = phi i64 [ %val, %entry ], [ %doubled, %then ]
 310   call void asm sideeffect "; reg use $0", "s"(i64 %res) #0
 311   ret void
 312 }
 313
 314 ; TODO: This could use an s_cselect_b64 plus an s_cselect_b32; the checks
     ; currently expect three s_cselect_b32 for the <3 x i32> result.
 315 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
 316 ; GCN: s_add_i32
 317 ; GCN: s_add_i32
 318 ; GCN: s_add_i32
 319 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 320 ; GCN-NEXT: s_cselect_b32 s
 321 ; GCN-NEXT: s_cselect_b32 s
 322 ; GCN-NEXT: s_cselect_b32 s
 323 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
 324 entry:
 325   %val = load <3 x i32>, <3 x i32> addrspace(4)* %in
 326   %is.one = icmp eq i32 %cond, 1
 327   br i1 %is.one, label %then, label %merge
 328
 329 then:
 330   %doubled = add <3 x i32> %val, %val
 331   br label %merge
 332
 333 merge:
 334   %res = phi <3 x i32> [ %val, %entry ], [ %doubled, %then ]
 335   %res.ext = shufflevector <3 x i32> %res, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 336   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %res.ext) #0
 337   ret void
 338 }
 339
 340 ; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
 341 ; GCN: s_add_i32
 342 ; GCN: s_add_i32
 343 ; GCN: s_add_i32
 344 ; GCN: s_add_i32
 345 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 346 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 347 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
     ; <4 x i32> version of the scalar triangle: the checks above expect four
     ; s_add_i32 and the result selected with two back-to-back s_cselect_b64.
 348 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
 349 entry:
 350   %val = load <4 x i32>, <4 x i32> addrspace(4)* %in
 351   %is.one = icmp eq i32 %cond, 1
 352   br i1 %is.one, label %then, label %merge
 353
 354 then:
 355   %doubled = add <4 x i32> %val, %val
 356   br label %merge
 357
 358 merge:
 359   %res = phi <4 x i32> [ %val, %entry ], [ %doubled, %then ]
 360   call void asm sideeffect "; reg use $0", "s"(<4 x i32> %res) #0
 361   ret void
 362 }
 363
 364 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 365 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 366 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
     ; Diamond with two empty arms whose phi picks between the constants 0 and
     ; 1; the true edge of the branch goes to the second arm. The checks above
     ; expect a compare followed by an s_cselect_b32 of the two constants.
 367 define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 368 entry:
 369   %is.zero = icmp eq i32 %cond, 0
 370   br i1 %is.zero, label %otherwise, label %then
 371
 372 then:
 373   br label %join
 374
 375 otherwise:
 376   br label %join
 377
 378 join:
 379   %sel = phi i32 [ 0, %then ], [ 1, %otherwise ]
 380   store i32 %sel, i32 addrspace(1)* %out
 381   ret void
 382 }
 383
 384 ; GCN-LABEL: {{^}}ifcvt_undef_scc:
 385 ; GCN: {{^}}; %bb.0:
 386 ; GCN-NEXT: s_load_dwordx2
 387 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
     ; Same empty-armed diamond as the constant-select test, but the branch
     ; condition is undef. The checks above expect the s_cselect_b32 on the
     ; line right after the s_load_dwordx2, i.e. with no compare in between.
 388 define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 389 entry:
 390   br i1 undef, label %otherwise, label %then
 391
 392 then:
 393   br label %join
 394
 395 otherwise:
 396   br label %join
 397
 398 join:
 399   %sel = phi i32 [ 0, %then ], [ 1, %otherwise ]
 400   store i32 %sel, i32 addrspace(1)* %out
 401   ret void
 402 }
 403
 404 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
 405 ; GCN: v_cmp_neq_f32
 406 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 407
 408 ; GCN: v_add_i32
 409 ; GCN: v_add_i32
 410
 411 ; GCN: [[ENDIF]]:
 412 ; GCN: buffer_store_dword
     ; Triangle over an <8 x i32> value: the checks above expect the branch to
     ; remain, with the v_add_i32 instructions inside the skipped block.
 413 define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
 414 entry:
 415   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in
 416   %is.one = fcmp oeq float %cnd, 1.000000e+00
 417   br i1 %is.one, label %then, label %merge
 418
 419 then:
 420   %doubled = add <8 x i32> %val, %val
 421   br label %merge
 422
 423 merge:
 424   %res = phi <8 x i32> [ %val, %entry ], [ %doubled, %then ]
 425   store <8 x i32> %res, <8 x i32> addrspace(1)* %out
 426   ret void
 427 }
 428
 429 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
 430 ; GCN: v_cmp_neq_f32
 431 ; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
 432
 433 ; GCN: v_add_i32
 434 ; GCN: v_add_i32
 435
 436 ; GCN: [[ENDIF]]:
 437 ; GCN: buffer_store_dword
     ; Triangle over a <16 x i32> value: the checks above expect the branch to
     ; remain, with the v_add_i32 instructions inside the skipped block.
 438 define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
 439 entry:
 440   %val = load <16 x i32>, <16 x i32> addrspace(1)* %in
 441   %is.one = fcmp oeq float %cnd, 1.000000e+00
 442   br i1 %is.one, label %then, label %merge
 443
 444 then:
 445   %doubled = add <16 x i32> %val, %val
 446   br label %merge
 447
 448 merge:
 449   %res = phi <16 x i32> [ %val, %entry ], [ %doubled, %then ]
 450   store <16 x i32> %res, <16 x i32> addrspace(1)* %out
 451   ret void
 452 }
 453
 454 attributes #0 = { nounwind }