llvm/test/CodeGen/AMDGPU/lds-output-queue.ll

   1 ; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
   2 ;
   3 ; This test checks that the LDS output queue is empty at the end of
   4 ; the ALU clause.
   5
   6 ; CHECK-LABEL: {{^}}lds_input_queue:
   7 ; CHECK: LDS_READ_RET * OQAP
   8 ; CHECK-NOT: ALU clause
   9 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
  10
  11 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 ; two-element i32 array in addrspace(3), read by both kernels in this file
  12
  13 define amdgpu_kernel void @lds_input_queue(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %index) { ; kernel: stores @local_mem[%index] + *%in to *%out
  14 entry:
  15   %0 = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index ; address of element %index of @local_mem
  16   %1 = load i32, ptr addrspace(3) %0 ; addrspace(3) read; the CHECK lines above expect LDS_READ_RET followed by a MOV from OQAP
  17   call void @llvm.r600.group.barrier()
  18
  19   ; The addrspace(1) load below will start a new clause for the vertex fetch
  20   %2 = load i32, ptr addrspace(1) %in
  21   %3 = add i32 %1, %2 ; uses the addrspace(3) result after the addrspace(1) load
  22   store i32 %3, ptr addrspace(1) %out
  23   ret void
  24 }
  25
  26 declare void @llvm.r600.group.barrier() nounwind convergent ; intrinsic called in @lds_input_queue between the addrspace(3) load and the addrspace(1) load
  27
  28 ; The machine scheduler does not do proper alias analysis and assumes that
  29 ; loads from global values (Note that a global value is different from a
  30 ; value from global memory.  A global value is a value that is declared
  31 ; outside of a function, it can reside in any address space) alias with
  32 ; all other loads.
  33 ;
  34 ; This is a problem for scheduling the reads from the local data share (lds).
  35 ; These reads are implemented using two instructions.  The first copies the
  36 ; data from lds into the lds output queue, and the second moves the data from
  37 ; the output queue into a register.  These two instructions don't have to be
  38 ; scheduled one after the other, but they do need to be scheduled in the same
  39 ; clause.  The aliasing problem mentioned above causes problems when there is a
  40 ; load from global memory which immediately follows a load from a global value that
  41 ; has been declared in the local memory space:
  42 ;
  43 ;  %0 = getelementptr inbounds [2 x i32], ptr addrspace(3) @local_mem, i32 0, i32 %index
  44 ;  %1 = load i32, ptr addrspace(3) %0
  45 ;  %2 = load i32, ptr addrspace(1) %in
  46 ;
  47 ; The instruction selection phase will generate ISA that looks like this:
  48 ; %oqap = LDS_READ_RET
  49 ; %0 = MOV %oqap
  50 ; %1 = VTX_READ_32
  51 ; %2 = ADD_INT %1, %0
  52 ;
  53 ; The bottom scheduler will schedule the two ALU instructions first:
  54 ;
  55 ; UNSCHEDULED:
  56 ; %oqap = LDS_READ_RET
  57 ; %1 = VTX_READ_32
  58 ;
  59 ; SCHEDULED:
  60 ;
  61 ; %0 = MOV %oqap
  62 ; %2 = ADD_INT %1, %0
  63 ;
  64 ; The lack of proper alias analysis causes the scheduler to treat the global
  65 ; memory read (VTX_READ_32) as having a chain dependency on the local memory
  66 ; read (LDS_READ_RET), so the global memory read will always be scheduled
  67 ; first.  This will give us a final program which looks like this:
  68 ;
  69 ; Alu clause:
  70 ; %oqap = LDS_READ_RET
  71 ; VTX clause:
  72 ; %1 = VTX_READ_32
  73 ; Alu clause:
  74 ; %0 = MOV %oqap
  75 ; %2 = ADD_INT %1, %0
  76 ;
  77 ; This is an illegal program because the oqap def and use now occur in
  78 ; different ALU clauses.
  79 ;
  80 ; This test checks this scenario and makes sure it doesn't result in an
  81 ; illegal program.  For now, we have fixed this issue by merging the
  82 ; LDS_READ_RET and MOV together during instruction selection and then
  83 ; expanding them after scheduling.  Once the scheduler has better alias
  84 ; analysis, we should be able to keep these instructions separate before
  85 ; scheduling.
  86 ;
  87 ; CHECK-LABEL: {{^}}local_global_alias:
  88 ; CHECK: LDS_READ_RET
  89 ; CHECK-NOT: ALU clause
  90 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
  91 define amdgpu_kernel void @local_global_alias(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; kernel: stores *%in + @local_mem[0] to *%out
  92 entry:
  93   %0 = load i32, ptr addrspace(3) @local_mem ; load directly from the addrspace(3) global value (no getelementptr)
  94   %1 = load i32, ptr addrspace(1) %in ; addrspace(1) load immediately after it: the pairing the comment above describes
  95   %2 = add i32 %1, %0
  96   store i32 %2, ptr addrspace(1) %out
  97   ret void
  98 }