llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir

   1 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s
   2 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -passes=machine-cse %s | FileCheck %s
   3
   4 # LLVM's current definition of `isConvergent` does not necessarily prove that
   5 # non-local CSE is illegal. The following test extends the definition of
   6 # `isConvergent` to assume that a convergent instruction must not be made
   7 # control-dependent on additional conditions or on fewer conditions. LLVM has
   8 # no MachineInstr attribute which expresses this extended definition, so it's
   9 # necessary to use `isConvergent` to prevent illegally CSE-ing the subset of
  10 # `isConvergent` instructions which do fall into this extended definition.
  11
  12 # This is a coverage test for the MachineCSE change. It does not reproduce an
  13 # actual bug in the AMDGPU backend. The current open source GPU backends as is
  14 # do not appear to allow a reasonably simple test case that provably and
  15 # undeniably functionally breaks without the associated MachineCSE changes.
  16
  17 # The test checks that we don't CSE non-local convergent instrs. Otherwise,
  18 # reusing defs of convergent instrs from different control flow scopes can
  19 # cause illegal codegen. Previously, the swizzle in bb2 would be CSE-ed in
  20 # favor of using the swizzle in bb1 despite bb2 being a different BB.
  21
  22 # CHECK-LABEL: name: no_cse
  23 # CHECK: bb.1.if.then
  24 # CHECK: [[SWIZZLE1:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC:%[0-9]+]], 100, 0, implicit $exec
  25 # CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE1]], {{%[0-9]+}}, 0, implicit $exec
  26 # CHECK-NEXT: S_CMP_LT_I32 {{.*}} implicit-def $scc
  27 # CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
  28 # CHECK-NEXT: S_BRANCH %bb.2
  29 # CHECK: bb.2.if.then.if.then
  30 # CHECK: [[SWIZZLE2:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC]], 100, 0, implicit $exec
  31 # CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE2]], {{%[0-9]+}}, 0, implicit $exec
  32
  33 --- |
       ; Placeholder IR for the MIR function `no_cse` further down. It only
       ; supplies the kernel signature and the named IR blocks that the MIR
       ; basic blocks (bb.0.entry, bb.1.if.then, ...) refer to by name; every
       ; block body is a bare `unreachable`.
  34   define amdgpu_kernel void @no_cse(ptr addrspace(1), i32, i1) {
  35   entry:
  36     unreachable
  37   if.then:
  38     unreachable
  39   if.then.if.then:
  40     unreachable
  41   if.then.phi:
  42     unreachable
  43   exit:
  44     unreachable
  45   }
  46 ...
  47 ---
     # Machine function under test. bb.1 and bb.2 each contain a DS_SWIZZLE_B32
     # with identical operands (%8, 100, 0); the CHECK lines at the top of this
     # file require both to still be present after machine-cse.
  48 name: no_cse
  49 tracksRegLiveness: true
  50 body:             |
  51   bb.0.entry:
  52     liveins: $sgpr4_sgpr5
         ; Load two 64-bit values through the pointer in $sgpr4_sgpr5, then
         ; compare %2.sub1 against the constant 42 ($scc) to choose between
         ; bb.4 and bb.1. %6 is the value bb.4's PHI takes on the bb.0 edge.
  53     %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
  54     %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
  55     %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 2, 0
  56     %3:sreg_64 = COPY %1
  57     %4:sreg_32 = COPY %2.sub1
  58     %5:sreg_32 = S_MOV_B32 42
  59     S_CMP_EQ_U32 %4, %5, implicit-def $scc
  60     %6:vgpr_32 = COPY %5, implicit $exec
  61     S_CBRANCH_SCC1 %bb.4, implicit $scc
  62     S_BRANCH %bb.1
  63
  64   bb.1.if.then:
         ; First swizzle: %9 is computed from %8 and consumed by the add that
         ; defines %10. %7 is then compared against %5 ($scc) to choose between
         ; bb.3 and bb.2.
  65     %7:sreg_32 = COPY %2.sub0
  66     %8:vgpr_32 = COPY %7
  67     %9:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
  68     %10:vgpr_32, %21:sreg_32 = V_ADD_CO_U32_e64 %9, %5, 0, implicit $exec
  69     S_CMP_LT_I32 %7, %5, implicit-def $scc
  70     S_CBRANCH_SCC1 %bb.3, implicit $scc
  71     S_BRANCH %bb.2
  72
  73   bb.2.if.then.if.then:
         ; Second swizzle: same source register (%8) and same immediates as the
         ; one defining %9, but in a different block. This is the instruction
         ; the test expects machine-cse to leave in place. The block has no
         ; branch instruction; bb.3's PHI names bb.2 as a predecessor.
  74     %11:sreg_32 = S_MOV_B32 64
  75     %12:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
  76     %13:vgpr_32, %24:sreg_32 = V_ADD_CO_U32_e64 %12, %11, 0, implicit $exec
  77
  78   bb.3.if.then.phi:
         ; Merge the add result coming from bb.1 (%10) or from bb.2 (%13).
  79     %14:vgpr_32 = PHI %10, %bb.1, %13, %bb.2
  80
  81   bb.4.exit:
         ; Merge the bb.0 value (%6) or the bb.3 value (%14), store it with
         ; FLAT_STORE_DWORD to the address copied from %3, and end the program.
  82     %15:vgpr_32 = PHI %6, %bb.0, %14, %bb.3
  83     %16:vreg_64 = COPY %3
  84     FLAT_STORE_DWORD %16, %15, 0, 0, implicit $exec, implicit $flat_scr
  85     S_ENDPGM 0
  86
  87 ...