test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll

   1 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
   2 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
   3
   4 ;CHECK-LABEL: {{^}}buffer_load:
   5 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
   6 ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
   7 ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
   8 ;CHECK: s_waitcnt
   9 define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
  10 main_body:
  11   %plain = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 false, i1 false) ; both trailing flags clear: no modifier expected
  12   %with_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 true, i1 false) ; fourth operand set: glc expected
  13   %with_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 false, i1 true) ; fifth operand set: slc expected
  14   %agg0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %plain, 0
  15   %agg1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg0, <4 x float> %with_glc, 1
  16   %agg2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg1, <4 x float> %with_slc, 2
  17   ret {<4 x float>, <4 x float>, <4 x float>} %agg2 ; returning all three keeps every load live
  18 }
  19
  20 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
  21 ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
  22 ;CHECK: s_waitcnt
  23 define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
  24 main_body:
  25   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 false, i1 false) ; constant 40 is expected as the immediate offset field
  26   ret <4 x float> %loaded
  27 }
  28
  29 ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
  30 ;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
  31 ;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
  32 ;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
  33 ;CHECK: s_waitcnt
  34 define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
  35 main_body:
  36   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 false, i1 false) ; 8192 = 0x1ffc + 4, the split the tonga run expects; the verde run expects a VGPR offset instead
  37   ret <4 x float> %loaded
  38 }
  39
  40 ;CHECK-LABEL: {{^}}buffer_load_idx:
  41 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
  42 ;CHECK: s_waitcnt
  43 define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
  44 main_body:
  45   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 false, i1 false) ; variable second operand: idxen addressing expected
  46   ret <4 x float> %loaded
  47 }
  48
  49 ;CHECK-LABEL: {{^}}buffer_load_ofs:
  50 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
  51 ;CHECK: s_waitcnt
  52 define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
  53 main_body:
  54   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 false, i1 false) ; variable third operand: offen addressing expected
  55   ret <4 x float> %loaded
  56 }
  57
  58 ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
  59 ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
  60 ;CHECK: s_waitcnt
  61 define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
  62 main_body:
  63   %ofs.plus60 = add i32 %1, 60 ; the constant addend is expected to fold into the immediate offset field
  64   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.plus60, i1 false, i1 false)
  65   ret <4 x float> %loaded
  66 }
  67
  68 ;CHECK-LABEL: {{^}}buffer_load_both:
  69 ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
  70 ;CHECK: s_waitcnt
  71 define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
  72 main_body:
  73   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 false, i1 false) ; index and offset both variable, passed in argument order
  74   ret <4 x float> %loaded
  75 }
  76
  77 ;CHECK-LABEL: {{^}}buffer_load_both_reversed:
  78 ;CHECK: v_mov_b32_e32 v2, v0
  79 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
  80 ;CHECK: s_waitcnt
  81 define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
  82 main_body:
  83   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 false, i1 false) ; operands swapped relative to argument order, so a register copy is expected first
  84   ret <4 x float> %loaded
  85 }
  86
  87 ;CHECK-LABEL: {{^}}buffer_load_x1:
  88 ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
  89 ;CHECK: s_waitcnt
  90 define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
  91 main_body:
  92   %word = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ; scalar float variant: single-dword load expected
  93   ret float %word
  94 }
  95
  96 ;CHECK-LABEL: {{^}}buffer_load_x2:
  97 ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
  98 ;CHECK: s_waitcnt
  99 define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
 100 main_body:
 101   %pair = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ; two-element variant: dwordx2 load expected
 102   ret <2 x float> %pair
 103 }
 104
 105 ;CHECK-LABEL: {{^}}buffer_load_negative_offset:
 106 ;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
 107 ;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
 108 define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
 109 main_body:
 110   %ofs.minus16 = add i32 %ofs, -16 ; negative addend is expected to stay as an explicit add rather than fold into the instruction
 111   %loaded = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.minus16, i1 false, i1 false)
 112   ret <4 x float> %loaded
 113 }
 114
 115 ; SI won't merge ds memory operations, because of the signed offset bug, so
 116 ; we only have check lines for VI.
 117 ; CHECK-LABEL: {{^}}buffer_load_mmo:
 118 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 119 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
 120 define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
 121 entry:
 122   store float 0.0, float addrspace(3)* %lds ; first LDS store, issued before the buffer load
 123   %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) ; buffer load sits between the two LDS stores
 124   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 ; element 4 of the same LDS pointer, matching offset1:4 above
 125   store float 0.0, float addrspace(3)* %tmp2 ; second LDS store; the tonga run expects both stores combined into one ds_write2_b32
 126   ret float %val
 127 }
 128
 129 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
 130 ;CHECK-NEXT: %bb.
 131 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 132 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 133 ;CHECK: s_waitcnt
 134 define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
 135 main_body:
 136   %ofs4 = add i32 %a, 4 ; offsets 4..16 are contiguous dwords: one dwordx4 expected
 137   %ofs8 = add i32 %a, 8
 138   %ofs12 = add i32 %a, 12
 139   %ofs16 = add i32 %a, 16
 140   %ofs28 = add i32 %a, 28 ; offsets 28 and 32 are contiguous dwords: one dwordx2 expected
 141   %ofs32 = add i32 %a, 32
 142   %ld4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs4, i1 false, i1 false)
 143   %ld8 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs8, i1 false, i1 false)
 144   %ld12 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs12, i1 false, i1 false)
 145   %ld16 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs16, i1 false, i1 false)
 146   %ld28 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs28, i1 false, i1 false)
 147   %ld32 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs32, i1 false, i1 false)
 148   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld4, float %ld8, float %ld12, float %ld16, i1 true, i1 true) ; exports consume the loaded values so the loads stay live
 149   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld28, float %ld32, float undef, float undef, i1 true, i1 true)
 150   ret void
 151 }
 152
 153 ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
 154 ;CHECK-NEXT: %bb.
 155 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
 156 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
 157 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
 158 ;CHECK: s_waitcnt
 159 define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
 160 main_body:
 161   %ofs4 = add i32 %a, 4
 162   %ofs8 = add i32 %a, 8
 163   %ofs12 = add i32 %a, 12
 164   %ofs16 = add i32 %a, 16
 165   %ofs28 = add i32 %a, 28
 166   %ofs32 = add i32 %a, 32
 167   %ld4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs4, i1 false, i1 false) ; pair 1: no cache flags
 168   %ld8 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs8, i1 false, i1 false)
 169   %ld12 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs12, i1 true, i1 false) ; pair 2: glc only; adjacent to pair 1 but expected as a separate dwordx2
 170   %ld16 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs16, i1 true, i1 false)
 171   %ld28 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs28, i1 true, i1 true) ; pair 3: glc and slc
 172   %ld32 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs32, i1 true, i1 true)
 173   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld4, float %ld8, float %ld12, float %ld16, i1 true, i1 true)
 174   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld28, float %ld32, float undef, float undef, i1 true, i1 true)
 175   ret void
 176 }
 177
 178 ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
 179 ;CHECK-NEXT: %bb.
 180 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 181 ;CHECK: s_waitcnt
 182 define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
 183 main_body:
 184   %ofs.lo = add i32 %a, 4
 185   %ofs.hi = add i32 %a, 12 ; 8 bytes past the first pair, so the two pairs are contiguous
 186   %pair.lo = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %ofs.lo, i1 false, i1 false)
 187   %pair.hi = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %ofs.hi, i1 false, i1 false)
 188   %e0 = extractelement <2 x float> %pair.lo, i32 0
 189   %e1 = extractelement <2 x float> %pair.lo, i32 1
 190   %e2 = extractelement <2 x float> %pair.hi, i32 0
 191   %e3 = extractelement <2 x float> %pair.hi, i32 1
 192   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float %e3, i1 true, i1 true) ; all four elements are exported, keeping both loads live
 193   ret void
 194 }
 195
 196 ;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
 197 ;CHECK-NEXT: %bb.
 198 ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 199 ;CHECK: s_waitcnt
 200 define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
 201 main_body:
 202   %ofs.pair = add i32 %a, 4
 203   %ofs.tail = add i32 %a, 12 ; immediately after the two-dword load
 204   %pair = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %ofs.pair, i1 false, i1 false)
 205   %e2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %ofs.tail, i1 false, i1 false) ; pair + single: the three-dword form is only checked for the tonga run
 206   %e0 = extractelement <2 x float> %pair, i32 0
 207   %e1 = extractelement <2 x float> %pair, i32 1
 208   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float undef, i1 true, i1 true)
 209   ret void
 210 }
 211
 212 ;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
 213 ;CHECK-NEXT: %bb.
 214 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 215 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 216 ;CHECK: s_waitcnt
 217 define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
 218 main_body:
 219   %ld4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 false, i1 false) ; constant offsets 4..16: one dwordx4 expected
 220   %ld8 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 false, i1 false)
 221   %ld12 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 false, i1 false)
 222   %ld16 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 false, i1 false)
 223   %ld28 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 false, i1 false) ; constant offsets 28 and 32: one dwordx2 expected
 224   %ld32 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 false, i1 false)
 225   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld4, float %ld8, float %ld12, float %ld16, i1 true, i1 true)
 226   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %ld28, float %ld32, float undef, float undef, i1 true, i1 true)
 227   ret void
 228 }
 229
 230 ;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged:
 231 ;CHECK-NEXT: %bb.
 232 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 233 ;CHECK: s_waitcnt
 234 define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
 235 main_body:
 236   %pair.lo = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 false, i1 false)
 237   %pair.hi = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 false, i1 false) ; starts where the first pair ends, so one dwordx4 is expected
 238   %e0 = extractelement <2 x float> %pair.lo, i32 0
 239   %e1 = extractelement <2 x float> %pair.lo, i32 1
 240   %e2 = extractelement <2 x float> %pair.hi, i32 0
 241   %e3 = extractelement <2 x float> %pair.hi, i32 1
 242   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float %e3, i1 true, i1 true)
 243   ret void
 244 }
 245
 246 ;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
 247 ;CHECK-NEXT: %bb.
 248 ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 249 ;CHECK: s_waitcnt
 250 define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
 251 main_body:
 252   %pair = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 false, i1 false)
 253   %e2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 false, i1 false) ; single dword right after the pair; the three-dword form is only checked for the tonga run
 254   %e0 = extractelement <2 x float> %pair, i32 0
 255   %e1 = extractelement <2 x float> %pair, i32 1
 256   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float undef, i1 true, i1 true)
 257   ret void
 258 }
 259
 260 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
 261 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
 262 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
 263 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) nounwind ; not readonly: a void readonly call has no effect the IR can observe, so the export must not share #0
 264
 265 attributes #0 = { nounwind readonly }