test/CodeGen/AMDGPU/private-memory-r600.ll

   1 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
   2 ; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
   3
   4 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
   5
   6 ; FUNC-LABEL: {{^}}mova_same_clause:
   7
   8 ; R600: LDS_WRITE
   9 ; R600: LDS_WRITE
  10 ; R600: LDS_READ
  11 ; R600: LDS_READ
  12
  13 ; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
  14 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
  15 ; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
  16 ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
  17 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
  18
  19 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
  20 entry:
  21   %stack = alloca [5 x i32], align 4
  22   %0 = load i32, i32 addrspace(1)* %in, align 4
  23   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  24   store i32 4, i32* %arrayidx1, align 4
  25   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  26   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  27   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  28   store i32 5, i32* %arrayidx3, align 4
  29   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  30   %2 = load i32, i32* %arrayidx10, align 4
  31   store i32 %2, i32 addrspace(1)* %out, align 4
  32   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  33   %3 = load i32, i32* %arrayidx12
  34   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  35   store i32 %3, i32 addrspace(1)* %arrayidx13
  36   ret void
  37 }
  38
  39 ; This test checks that the stack offset is calculated correctly for structs.
  40 ; All register loads/stores should be optimized away, so there shouldn't be
  41 ; any MOVA instructions.
  42 ;
  43 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
  44 ; this.
  45
  46 ; FUNC-LABEL: {{^}}multiple_structs:
  47 ; R600-NOT: MOVA_INT
  48 %struct.point = type { i32, i32 }
  49
  50 define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
  51 entry:
  52   %a = alloca %struct.point
  53   %b = alloca %struct.point
  54   %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
  55   %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1
  56   %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
  57   %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1
  58   store i32 0, i32* %a.x.ptr
  59   store i32 1, i32* %a.y.ptr
  60   store i32 2, i32* %b.x.ptr
  61   store i32 3, i32* %b.y.ptr
  62   %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0
  63   %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0
  64   %a.indirect = load i32, i32* %a.indirect.ptr
  65   %b.indirect = load i32, i32* %b.indirect.ptr
  66   %0 = add i32 %a.indirect, %b.indirect
  67   store i32 %0, i32 addrspace(1)* %out
  68   ret void
  69 }
  70
  71 ; Test direct access of a private array inside a loop.  The private array
  72 ; loads and stores should be lowered to copies, so there shouldn't be any
  73 ; MOVA instructions.
  74
  75 ; FUNC-LABEL: {{^}}direct_loop:
  76 ; R600-NOT: MOVA_INT
  77
  78 define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  79 entry:
  80   %prv_array_const = alloca [2 x i32]
  81   %prv_array = alloca [2 x i32]
  82   %a = load i32, i32 addrspace(1)* %in
  83   %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  84   %b = load i32, i32 addrspace(1)* %b_src_ptr
  85   %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
  86   store i32 %a, i32* %a_dst_ptr
  87   %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
  88   store i32 %b, i32* %b_dst_ptr
  89   br label %for.body
  90
  91 for.body:
  92   %inc = phi i32 [0, %entry], [%count, %for.body]
  93   %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
  94   %x = load i32, i32* %x_ptr
  95   %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
  96   %y = load i32, i32* %y_ptr
  97   %xy = add i32 %x, %y
  98   store i32 %xy, i32* %y_ptr
  99   %count = add i32 %inc, 1
 100   %done = icmp eq i32 %count, 4095
 101   br i1 %done, label %for.end, label %for.body
 102
 103 for.end:
 104   %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
 105   %value = load i32, i32* %value_ptr
 106   store i32 %value, i32 addrspace(1)* %out
 107   ret void
 108 }
 109
 110 ; FUNC-LABEL: {{^}}short_array:
 111
 112 ; R600: MOVA_INT
 113 define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 114 entry:
 115   %0 = alloca [2 x i16]
 116   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
 117   %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
 118   store i16 0, i16* %1
 119   store i16 1, i16* %2
 120   %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
 121   %4 = load i16, i16* %3
 122   %5 = sext i16 %4 to i32
 123   store i32 %5, i32 addrspace(1)* %out
 124   ret void
 125 }
 126
 127 ; FUNC-LABEL: {{^}}char_array:
 128
 129 ; R600: MOVA_INT
 130 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 131 entry:
 132   %0 = alloca [2 x i8]
 133   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
 134   %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
 135   store i8 0, i8* %1
 136   store i8 1, i8* %2
 137   %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
 138   %4 = load i8, i8* %3
 139   %5 = sext i8 %4 to i32
 140   store i32 %5, i32 addrspace(1)* %out
 141   ret void
 142
 143 }
 144
 145 ; Make sure we don't overwrite workitem information with private memory
 146
 147 ; FUNC-LABEL: {{^}}work_item_info:
 148 ; R600-NOT: MOV T0.X
 149 ; Additional check in case the move ends up in the last slot
 150 ; R600-NOT: MOV * TO.X
 151 define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 152 entry:
 153   %0 = alloca [2 x i32]
 154   %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
 155   %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1
 156   store i32 0, i32* %1
 157   store i32 1, i32* %2
 158   %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in
 159   %4 = load i32, i32* %3
 160   %5 = call i32 @llvm.r600.read.tidig.x()
 161   %6 = add i32 %4, %5
 162   store i32 %6, i32 addrspace(1)* %out
 163   ret void
 164 }
 165
 166 ; Test that two stack objects are not stored in the same register
 167 ; The second stack object should be in T3.X
 168 ; FUNC-LABEL: {{^}}no_overlap:
 169 ; R600_CHECK: MOV
 170 ; R600_CHECK: [[CHAN:[XYZW]]]+
 171 ; R600-NOT: [[CHAN]]+
 172 define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 173 entry:
 174   %0 = alloca [3 x i8], align 1
 175   %1 = alloca [2 x i8], align 1
 176   %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0
 177   %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1
 178   %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2
 179   %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0
 180   %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1
 181   store i8 0, i8* %2
 182   store i8 1, i8* %3
 183   store i8 2, i8* %4
 184   store i8 1, i8* %5
 185   store i8 0, i8* %6
 186   %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in
 187   %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in
 188   %9 = load i8, i8* %7
 189   %10 = load i8, i8* %8
 190   %11 = add i8 %9, %10
 191   %12 = sext i8 %11 to i32
 192   store i32 %12, i32 addrspace(1)* %out
 193   ret void
 194 }
 195
 196 define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 197 entry:
 198   %alloca = alloca [2 x [2 x i8]]
 199   %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
 200   %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
 201   store i8 0, i8* %gep0
 202   store i8 1, i8* %gep1
 203   %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
 204   %load = load i8, i8* %gep2
 205   %sext = sext i8 %load to i32
 206   store i32 %sext, i32 addrspace(1)* %out
 207   ret void
 208 }
 209
 210 define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 211 entry:
 212   %alloca = alloca [2 x [2 x i32]]
 213   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
 214   %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
 215   store i32 0, i32* %gep0
 216   store i32 1, i32* %gep1
 217   %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
 218   %load = load i32, i32* %gep2
 219   store i32 %load, i32 addrspace(1)* %out
 220   ret void
 221 }
 222
 223 define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 224 entry:
 225   %alloca = alloca [2 x [2 x i64]]
 226   %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
 227   %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
 228   store i64 0, i64* %gep0
 229   store i64 1, i64* %gep1
 230   %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
 231   %load = load i64, i64* %gep2
 232   store i64 %load, i64 addrspace(1)* %out
 233   ret void
 234 }
 235
 236 %struct.pair32 = type { i32, i32 }
 237
 238 define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 239 entry:
 240   %alloca = alloca [2 x [2 x %struct.pair32]]
 241   %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
 242   %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
 243   store i32 0, i32* %gep0
 244   store i32 1, i32* %gep1
 245   %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
 246   %load = load i32, i32* %gep2
 247   store i32 %load, i32 addrspace(1)* %out
 248   ret void
 249 }
 250
 251 define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 252 entry:
 253   %alloca = alloca [2 x %struct.pair32]
 254   %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
 255   %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
 256   store i32 0, i32* %gep0
 257   store i32 1, i32* %gep1
 258   %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
 259   %load = load i32, i32* %gep2
 260   store i32 %load, i32 addrspace(1)* %out
 261   ret void
 262 }
 263
 264 define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 265 entry:
 266   %tmp = alloca [2 x i32]
 267   %tmp1 = getelementptr inbounds  [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
 268   %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
 269   store i32 0, i32* %tmp1
 270   store i32 1, i32* %tmp2
 271   %cmp = icmp eq i32 %in, 0
 272   %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
 273   %load = load i32, i32* %sel
 274   store i32 %load, i32 addrspace(1)* %out
 275   ret void
 276 }
 277
 278 ; AMDGPUPromoteAlloca does not know how to handle ptrtoint.  When it
 279 ; finds one, it should stop trying to promote.
 280
 281 ; FUNC-LABEL: ptrtoint:
 282 ; SI-NOT: ds_write
 283 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 284 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
 285 define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 286   %alloca = alloca [16 x i32]
 287   %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
 288   store i32 5, i32* %tmp0
 289   %tmp1 = ptrtoint [16 x i32]* %alloca to i32
 290   %tmp2 = add i32 %tmp1, 5
 291   %tmp3 = inttoptr i32 %tmp2 to i32*
 292   %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b
 293   %tmp5 = load i32, i32* %tmp4
 294   store i32 %tmp5, i32 addrspace(1)* %out
 295   ret void
 296 }
 297
 298 ; OPT: !0 = !{i32 0, i32 257}
 299 ; OPT: !1 = !{i32 0, i32 256}
 300
 301 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }