test/CodeGen/AMDGPU/memory_clause.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s
   2
   3 ; GCN-LABEL: {{^}}vector_clause:
   4 ; GCN:      global_load_dwordx4
   5 ; GCN-NEXT: global_load_dwordx4
   6 ; GCN-NEXT: global_load_dwordx4
   7 ; GCN-NEXT: global_load_dwordx4
   8 ; GCN-NEXT: s_nop
     ; Kernel that copies four <4 x i32> elements from %arg to %arg1 at
     ; indices tid, tid+1, tid+2 and tid+3, where tid is the zero-extended
     ; workitem id. All four loads appear in the IR before any of the stores.
     ; The checks above require the four global_load_dwordx4 instructions to be
     ; emitted on consecutive lines, immediately followed by an s_nop.
     ; NOTE(review): the s_nop presumably marks the end of the memory clause;
     ; confirm against the clause-forming pass.
   9 define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
  10 bb:
  11   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  12   %tmp2 = zext i32 %tmp to i64
  13   %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
  14   %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
  15   %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
  16   %tmp6 = add nuw nsw i64 %tmp2, 1
  17   %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
  18   %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
  19   %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
  20   %tmp10 = add nuw nsw i64 %tmp2, 2
  21   %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
  22   %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
  23   %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
  24   %tmp14 = add nuw nsw i64 %tmp2, 3
  25   %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
  26   %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
  27   %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
     ; Stores are grouped after all loads; each writes the value loaded from
     ; the same index of %arg.
  28   store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
  29   store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
  30   store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
  31   store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
  32   ret void
  33 }
  34
  35 ; GCN-LABEL: {{^}}scalar_clause:
  36 ; GCN:      s_load_dwordx2
  37 ; GCN-NEXT: s_load_dwordx2
  38 ; GCN-NEXT: s_nop
  39 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
  40 ; GCN-NEXT: s_load_dwordx4
  41 ; GCN-NEXT: s_load_dwordx4
  42 ; GCN-NEXT: s_load_dwordx4
  43 ; GCN-NEXT: s_load_dwordx4
     ; Kernel that copies four <4 x i32> elements from %arg to %arg1 at the
     ; constant indices 0, 1, 2 and 3; no workitem id is involved, so every
     ; address depends only on the kernel arguments.
     ; The checks above require two consecutive s_load_dwordx2, then an s_nop
     ; and a wait on lgkmcnt(0), then four consecutive s_load_dwordx4.
     ; NOTE(review): the two s_load_dwordx2 presumably fetch the %arg and
     ; %arg1 pointers from the kernel argument segment; confirm.
  44 define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
  45 bb:
  46   %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
  47   %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1
  48   %tmp3 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp2, align 16
  49   %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 1
  50   %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 2
  51   %tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 16
  52   %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 2
  53   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 3
  54   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
  55   %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
     ; Stores are grouped after all loads; element i of %arg goes to element i
     ; of %arg1.
  56   store <4 x i32> %tmp, <4 x i32> addrspace(1)* %arg1, align 16
  57   store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %tmp4, align 16
  58   store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp7, align 16
  59   store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
  60   ret void
  61 }
  62
  63 ; GCN-LABEL: {{^}}mubuf_clause:
  64 ; GCN:      buffer_load_dword
  65 ; GCN-NEXT: buffer_load_dword
  66 ; GCN-NEXT: buffer_load_dword
  67 ; GCN-NEXT: buffer_load_dword
  68 ; GCN-NEXT: buffer_load_dword
  69 ; GCN-NEXT: buffer_load_dword
  70 ; GCN-NEXT: buffer_load_dword
  71 ; GCN-NEXT: buffer_load_dword
  72 ; GCN-NEXT: buffer_load_dword
  73 ; GCN-NEXT: buffer_load_dword
  74 ; GCN-NEXT: buffer_load_dword
  75 ; GCN-NEXT: buffer_load_dword
  76 ; GCN-NEXT: buffer_load_dword
  77 ; GCN-NEXT: buffer_load_dword
  78 ; GCN-NEXT: buffer_load_dword
  79 ; GCN-NEXT: s_nop
  80 ; GCN-NEXT: s_nop
  81 ; GCN-NEXT: buffer_load_dword
     ; Non-kernel function (no amdgpu_kernel calling convention) that copies
     ; four <4 x i32> elements between two addrspace(5) pointers at indices
     ; tid, tid+1, tid+2 and tid+3, using 32-bit indexing.
     ; The checks above require fifteen consecutive buffer_load_dword
     ; instructions, then two s_nop, then one more buffer_load_dword.
     ; NOTE(review): the 16 dword loads presumably come from the four
     ; <4 x i32> loads being split per element; confirm.
  82 define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %arg, <4 x i32> addrspace(5)* noalias nocapture %arg1) {
  83 bb:
  84   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  85   %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp
  86   %tmp3 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp2, align 16
  87   %tmp4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp
  88   %tmp5 = add nuw nsw i32 %tmp, 1
  89   %tmp6 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp5
  90   %tmp7 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp6, align 16
  91   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp5
  92   %tmp9 = add nuw nsw i32 %tmp, 2
  93   %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp9
  94   %tmp11 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp10, align 16
  95   %tmp12 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp9
  96   %tmp13 = add nuw nsw i32 %tmp, 3
  97   %tmp14 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg, i32 %tmp13
  98   %tmp15 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp14, align 16
  99   %tmp16 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %arg1, i32 %tmp13
      ; Stores are grouped after all loads; each writes the value loaded from
      ; the same index of %arg.
  100   store <4 x i32> %tmp3, <4 x i32> addrspace(5)* %tmp4, align 16
  101   store <4 x i32> %tmp7, <4 x i32> addrspace(5)* %tmp8, align 16
  102   store <4 x i32> %tmp11, <4 x i32> addrspace(5)* %tmp12, align 16
  103   store <4 x i32> %tmp15, <4 x i32> addrspace(5)* %tmp16, align 16
  104   ret void
  105 }
 106
  107 ; GCN-LABEL: {{^}}vector_clause_indirect:
  108 ; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}]
  109 ; GCN-NEXT: s_nop 0
  110 ; GCN-NEXT: s_waitcnt vmcnt(0)
  111 ; GCN-NEXT: s_nop 0
  112 ; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off
  113 ; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off offset:16
      ; Kernel with a dependent load chain: a pointer is loaded from
      ; %arg[tid], two consecutive <4 x i32> values are loaded through that
      ; pointer, and they are stored to %arg2[0] and %arg2[1]. %arg1 is not
      ; referenced in the body.
      ; The checks above capture the destination register pair of the
      ; global_load_dwordx2 as ADDR and require, after "s_nop 0",
      ; "s_waitcnt vmcnt(0)" and "s_nop 0", two consecutive global_load_dwordx4
      ; that use ADDR as their address (the second with offset:16).
  114 define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) {
  115 bb:
  116   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  117   %tmp3 = zext i32 %tmp to i64
  118   %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp3
      ; Reinterpret the i64 slot as holding a pointer to <4 x i32>, then load
      ; that pointer; both vector loads below are addressed through it.
  119   %tmp5 = bitcast i64 addrspace(1)* %tmp4 to <4 x i32> addrspace(1)* addrspace(1)*
  120   %tmp6 = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(1)* %tmp5, align 8
  121   %tmp7 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp6, align 16
  122   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %tmp6, i64 1
  123   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
  124   store <4 x i32> %tmp7, <4 x i32> addrspace(1)* %arg2, align 16
  125   %tmp10 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg2, i64 1
  126   store <4 x i32> %tmp9, <4 x i32> addrspace(1)* %tmp10, align 16
  127   ret void
  128 }
 129
  130 ; GCN-LABEL: {{^}}load_global_d16_hi:
  131 ; GCN:      global_load_short_d16_hi v
  132 ; GCN-NEXT: s_nop
  133 ; GCN-NEXT: s_nop
  134 ; GCN-NEXT: global_load_short_d16_hi v
      ; Non-kernel function that loads two i16 values, from %in and from
      ; %in + 32 elements, and inserts each into element 1 of a <2 x i16>
      ; whose element 0 is %reg. The two vectors are stored to %out and to
      ; %out + 32 elements.
      ; The checks above require two global_load_short_d16_hi instructions
      ; separated by exactly two s_nop lines.
      ; NOTE(review): element 1 is presumably the upper 16 bits of the packed
      ; register, which is why the _d16_hi load form is expected; confirm.
  135 define void @load_global_d16_hi(i16 addrspace(1)* %in, i16 %reg, <2 x i16> addrspace(1)* %out) {
  136 entry:
  137   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
  138   %load1 = load i16, i16 addrspace(1)* %in
  139   %load2 = load i16, i16 addrspace(1)* %gep
  140   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  141   %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  142   store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
  143   %build2 = insertelement <2 x i16> undef, i16 %reg, i32 0
  144   %build3 = insertelement <2 x i16> %build2, i16 %load2, i32 1
  145   %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
  146   store <2 x i16> %build3, <2 x i16> addrspace(1)* %gep2
  147   ret void
  148 }
 149
  150 ; GCN-LABEL: {{^}}load_global_d16_lo:
  151 ; GCN:      global_load_short_d16 v
  152 ; GCN-NEXT: s_nop
  153 ; GCN-NEXT: s_nop
  154 ; GCN-NEXT: global_load_short_d16 v
      ; Non-kernel function that loads two i16 values, from %in and from
      ; %in + 32 elements, and inserts each into element 0 of a <2 x i16>
      ; obtained by bitcasting the i32 %reg. The two vectors are stored to
      ; %out and to %out + 32 elements.
      ; The checks above require two global_load_short_d16 instructions
      ; separated by exactly two s_nop lines.
      ; NOTE(review): element 0 is presumably the lower 16 bits of the packed
      ; register, which is why the plain _d16 load form is expected; confirm.
  155 define void @load_global_d16_lo(i16 addrspace(1)* %in, i32 %reg, <2 x i16> addrspace(1)* %out) {
  156 entry:
  157   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32
      ; %reg is bitcast twice so that each insertelement has its own base
      ; vector value.
  158   %reg.bc1 = bitcast i32 %reg to <2 x i16>
  159   %reg.bc2 = bitcast i32 %reg to <2 x i16>
  160   %load1 = load i16, i16 addrspace(1)* %in
  161   %load2 = load i16, i16 addrspace(1)* %gep
  162   %build1 = insertelement <2 x i16> %reg.bc1, i16 %load1, i32 0
  163   %build2 = insertelement <2 x i16> %reg.bc2, i16 %load2, i32 0
  164   %gep2 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 32
  165   store <2 x i16> %build1, <2 x i16> addrspace(1)* %out
  166   store <2 x i16> %build2, <2 x i16> addrspace(1)* %gep2
  167   ret void
  168 }
 169
  170 declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called by vector_clause, mubuf_clause and vector_clause_indirect to obtain the i32 index they use for addressing