llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll

   1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
   2
   3 ; Tests for indirect addressing on gfx9, exercised via dynamic indexing of vectors.
   4
   5 ; The subtest below was moved from test/CodeGen/AMDGPU/indirect-addressing-si.ll
   6 ; to avoid gfx9 scheduling-induced issues.
   7
   8
   9 ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
  10 ; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
  11 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
  12 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
  13
  14 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
  15 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
  16
  17 ; GCN: v_cmp_eq_u32_e32
  18 ; GCN-COUNT-32: v_cndmask_b32
  19
  20 ; GCN-COUNT-4: buffer_store_dwordx4
  21 define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 { ; two dynamically indexed inserts into one <16 x i32> within the entry block; %out1 is not used in the body
  22 entry:
  23   %id = call i32 @llvm.amdgcn.workitem.id.x() #1 ; work-item id in the x dimension
  24   %id.ext = zext i32 %id to i64
  25   %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext ; address of in[id]
  26   %idx0 = load volatile i32, ptr addrspace(1) %gep ; first insert index comes from memory, so it is not a compile-time constant
  27   %idx1 = add i32 %idx0, 1 ; second insert index = first index + 1
  28   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() ; inline asm produces the value 62 as a v-register output
  29   %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0 ; insert the asm result at the loaded index
  30   %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1 ; insert constant 63 at the following index
  31   store volatile <16 x i32> %vec2, ptr addrspace(1) %out0 ; only the vector after both inserts is stored
  32   %cmp = icmp eq i32 %id, 0
  33   br i1 %cmp, label %bb1, label %bb2 ; only work-item 0 takes the bb1 path
  34
  35 bb1:
  36   store volatile i32 %live.out.val, ptr addrspace(1) undef ; second use of %live.out.val, outside its defining block; the store address is undef
  37   br label %bb2
  38
  39 bb2:
  40   ret void
  41 }
  42
  43 ; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
  44 ; gpr_idx mode switching sequence is expanded late for this reason.
  45
  46 ; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block:
  47
  48 ; GCN: s_set_gpr_idx_on
  49 ; GCN-NEXT: v_mov_b32_e32
  50 ; GCN-NEXT: s_set_gpr_idx_off
  51
  52 ; GCN: s_set_gpr_idx_on
  53 ; GCN-NEXT: v_mov_b32_e32
  54 ; GCN-NOT: v_mov_b32_e32
  55 ; GCN-NEXT: s_set_gpr_idx_off
  56 define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 { ; two dynamically indexed inserts at in+1 and in+2 into a constant <16 x float>
  57 entry:
  58   %add1 = add i32 %in, 1 ; first insert index = in + 1
  59   %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1 ; write 17.0 into the constant vector at the first index
  60   %add2 = add i32 %in, 2 ; second insert index = in + 2
  61   %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2 ; second insert builds on the result of the first
  62   store <16 x float> %ins1, ptr addrspace(1) %out1 ; the intermediate vector is stored as well, so both inserts have a use
  63   %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1 ; next <16 x float> slot after %out1
  64   store <16 x float> %ins2, ptr addrspace(1) %out2
  65
  66   ret void
  67 }
  68
  69 declare hidden void @foo() ; declaration only; called from @insertelement_with_call
  70
  71 ; For functions with calls, we were not accounting for m0_lo16/m0_hi16
  72 ; uses on the BUNDLE created when expanding the insert register pseudo.
  73 ; GCN-LABEL: {{^}}insertelement_with_call:
  74 ; GCN: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST)
  75 ; GCN-NEXT: v_mov_b32_e32 {{v[0-9]+}}, 8
  76 ; GCN-NEXT: s_set_gpr_idx_off
  77 ; GCN: s_swappc_b64
  78 define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %idx) #0 { ; dynamically indexed insert whose result is used after a function call
  79   %vec = load <16 x i32>, ptr addrspace(1) %ptr
  80   %i6 = insertelement <16 x i32> %vec, i32 8, i32 %idx ; insert constant 8 at the kernel-argument index
  81   call void @foo() ; the call sits between the insert and the use of its result
  82   store <16 x i32> %i6, ptr addrspace(1) null ; stores the inserted vector to the null address in addrspace(1)
  83   ret void
  84 }
  85
  86 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; called by @insert_vgpr_offset_multiple_in_block
  87 declare void @llvm.amdgcn.s.barrier() #2 ; NOTE(review): no caller in the tests visible here - confirm whether it is still needed
  88
  89 attributes #0 = { nounwind } ; attached to all three kernels above; NOTE(review): groups #1 and #2 are referenced but not defined in the visible portion - confirm