test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

   1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
   2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
   3 ; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10,ALL %s
   4
   5 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
   6 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
   7
   ; Kernel fixture bound to attribute group #0. It keeps a 5-element i32 array
   ; in an alloca, stores the constants 4 and 5 at two indices loaded from %in,
   ; then copies elements 0 and 1 of the array to %out[0] and %out[1].
   8 define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
   9 entry:
       ; The check lines for this function match a global named
       ; @promote_alloca_size_63.stack, so keep this value name as %stack
       ; (presumably the pass derives the global name from it -- confirm).
  10   %stack = alloca [5 x i32], align 4
       ; The index of the first store is loaded from %in[0].
  11   %0 = load i32, i32 addrspace(1)* %in, align 4
  12   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  13   store i32 4, i32* %arrayidx1, align 4
       ; The index of the second store is loaded from %in[1].
  14   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  15   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  16   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  17   store i32 5, i32* %arrayidx3, align 4
       ; Read element 0 back through a constant index and write it to %out[0].
  18   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  19   %2 = load i32, i32* %arrayidx10, align 4
  20   store i32 %2, i32 addrspace(1)* %out, align 4
       ; Read element 1 back and write it to %out[1]; this load/store pair
       ; carries no explicit alignment.
  21   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  22   %3 = load i32, i32* %arrayidx12
  23   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  24   store i32 %3, i32 addrspace(1)* %arrayidx13
  25   ret void
  26 }
  27
  28 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
  29
  ; Kernel fixture bound to attribute group #1. The instruction sequence is the
  ; same as in @promote_alloca_size_63; only the kernel name and the attribute
  ; group differ. The check line for this kernel uses the prefix that is passed
  ; to every run configuration in this file.
  30 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
  31 entry:
       ; The check line matches a global named @promote_alloca_size_256.stack,
       ; so keep this value name as %stack.
  32   %stack = alloca [5 x i32], align 4
       ; Store 4 at the index loaded from %in[0].
  33   %0 = load i32, i32 addrspace(1)* %in, align 4
  34   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  35   store i32 4, i32* %arrayidx1, align 4
       ; Store 5 at the index loaded from %in[1].
  36   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  37   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  38   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  39   store i32 5, i32* %arrayidx3, align 4
       ; Copy array elements 0 and 1 to %out[0] and %out[1].
  40   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  41   %2 = load i32, i32* %arrayidx10, align 4
  42   store i32 %2, i32 addrspace(1)* %out, align 4
  43   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  44   %3 = load i32, i32* %arrayidx12
  45   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  46   store i32 %3, i32 addrspace(1)* %arrayidx13
  47   ret void
  48 }
  49
  50 ; SICI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
  51 ; GFX10: alloca [5 x i32]
  52
  ; Kernel fixture bound to attribute group #2. The instruction sequence is the
  ; same as in the other promote_alloca_size_* kernels; only the kernel name and
  ; the attribute group differ. The check lines for this kernel expect a
  ; different result depending on the run configuration (a global named
  ; @promote_alloca_size_1600.stack in one case, a remaining alloca in another).
  53 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
  54 entry:
       ; Keep the value name %stack; one check line matches a global whose
       ; name ends in ".stack".
  55   %stack = alloca [5 x i32], align 4
       ; Store 4 at the index loaded from %in[0].
  56   %0 = load i32, i32 addrspace(1)* %in, align 4
  57   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  58   store i32 4, i32* %arrayidx1, align 4
       ; Store 5 at the index loaded from %in[1].
  59   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  60   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  61   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  62   store i32 5, i32* %arrayidx3, align 4
       ; Copy array elements 0 and 1 to %out[0] and %out[1].
  63   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  64   %2 = load i32, i32* %arrayidx10, align 4
  65   store i32 %2, i32 addrspace(1)* %out, align 4
  66   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  67   %3 = load i32, i32* %arrayidx12
  68   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  69   store i32 %3, i32 addrspace(1)* %arrayidx13
  70   ret void
  71 }
  72
  73 ; ALL-LABEL: @occupancy_0(
  74 ; CI-NOT: alloca [5 x i32]
  75 ; SI: alloca [5 x i32]
  ; Kernel fixture bound to attribute group #3, which sets only the
  ; "amdgpu-waves-per-eu" attribute (no work-group-size attribute). The body is
  ; the same 5-element i32 array pattern used by the promote_alloca_size_*
  ; kernels.
  ; NOTE(review): attribute group #3 is textually identical to group #4 used by
  ; @occupancy_max, so the two kernels differ only by name -- confirm that this
  ; is intended.
  76 define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
  77 entry:
  78   %stack = alloca [5 x i32], align 4
       ; Store 4 at the index loaded from %in[0].
  79   %0 = load i32, i32 addrspace(1)* %in, align 4
  80   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  81   store i32 4, i32* %arrayidx1, align 4
       ; Store 5 at the index loaded from %in[1].
  82   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  83   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  84   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  85   store i32 5, i32* %arrayidx3, align 4
       ; Copy array elements 0 and 1 to %out[0] and %out[1].
  86   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  87   %2 = load i32, i32* %arrayidx10, align 4
  88   store i32 %2, i32 addrspace(1)* %out, align 4
  89   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  90   %3 = load i32, i32* %arrayidx12
  91   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  92   store i32 %3, i32 addrspace(1)* %arrayidx13
  93   ret void
  94 }
  95
  96 ; ALL-LABEL: @occupancy_max(
  97 ; CI-NOT: alloca [5 x i32]
  98 ; SI: alloca [5 x i32]
  ; Kernel fixture bound to attribute group #4, which sets only the
  ; "amdgpu-waves-per-eu" attribute. The body is the same 5-element i32 array
  ; pattern used by @occupancy_0; only the kernel name and the attribute group
  ; number differ.
  99 define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 100 entry:
 101   %stack = alloca [5 x i32], align 4
       ; Store 4 at the index loaded from %in[0].
 102   %0 = load i32, i32 addrspace(1)* %in, align 4
 103   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
 104   store i32 4, i32* %arrayidx1, align 4
       ; Store 5 at the index loaded from %in[1].
 105   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
 106   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
 107   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
 108   store i32 5, i32* %arrayidx3, align 4
       ; Copy array elements 0 and 1 to %out[0] and %out[1].
 109   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
 110   %2 = load i32, i32* %arrayidx10, align 4
 111   store i32 %2, i32 addrspace(1)* %out, align 4
 112   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
 113   %3 = load i32, i32* %arrayidx12
 114   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
 115   store i32 %3, i32 addrspace(1)* %arrayidx13
 116   ret void
 117 }
 118
 119 ; SI-LABEL: @occupancy_6(
 120 ; CI-LABEL: @occupancy_6(
 121 ; SI: alloca
 122 ; CI-NOT: alloca
 ; Kernel fixture bound to attribute group #5. It keeps a 42-byte i8 array in
 ; an alloca, stores the constants 4 and 5 at two indices loaded from %in, then
 ; copies bytes 0 and 1 of the array to %out[0] and %out[1].
 ; @occupancy_6_over is the companion kernel with a 43-byte array and the same
 ; attribute group.
 123 define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 124 entry:
 125   %stack = alloca [42 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 126   %tmp = load i8, i8 addrspace(1)* %in, align 1
 127   %tmp4 = sext i8 %tmp to i64
 128   %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
 129   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 130   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 131   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 132   %tmp5 = sext i8 %tmp1 to i64
 133   %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
 134   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 135   %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
 136   %tmp2 = load i8, i8* %arrayidx10, align 1
 137   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 138   %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
 139   %tmp3 = load i8, i8* %arrayidx12, align 1
 140   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 141   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 142   ret void
 143 }
 144
 145 ; ALL-LABEL: @occupancy_6_over(
 146 ; SICI: alloca [43 x i8]
 147 ; GFX10-NOT: alloca
 148
 ; Kernel fixture bound to attribute group #5, the same group as @occupancy_6.
 ; The body matches @occupancy_6 except that the array is 43 bytes, one byte
 ; larger than the 42-byte array there.
 149 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 150 entry:
       ; One check line for this kernel matches the text "alloca [43 x i8]",
       ; so the array type here must stay in sync with it.
 151   %stack = alloca [43 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 152   %tmp = load i8, i8 addrspace(1)* %in, align 1
 153   %tmp4 = sext i8 %tmp to i64
 154   %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
 155   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 156   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 157   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 158   %tmp5 = sext i8 %tmp1 to i64
 159   %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
 160   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 161   %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
 162   %tmp2 = load i8, i8* %arrayidx10, align 1
 163   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 164   %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
 165   %tmp3 = load i8, i8* %arrayidx12, align 1
 166   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 167   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 168   ret void
 169 }
 170
 171 ; SI-LABEL: @occupancy_8(
 172 ; CI-LABEL: @occupancy_8(
 173 ; SI: alloca
 174 ; CI-NOT: alloca
 ; Kernel fixture bound to attribute group #6. It keeps a 32-byte i8 array in
 ; an alloca, stores the constants 4 and 5 at two indices loaded from %in, then
 ; copies bytes 0 and 1 of the array to %out[0] and %out[1].
 ; @occupancy_8_over is the companion kernel with a 33-byte array and the same
 ; attribute group.
 175 define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 176 entry:
 177   %stack = alloca [32 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 178   %tmp = load i8, i8 addrspace(1)* %in, align 1
 179   %tmp4 = sext i8 %tmp to i64
 180   %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
 181   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 182   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 183   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 184   %tmp5 = sext i8 %tmp1 to i64
 185   %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
 186   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 187   %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
 188   %tmp2 = load i8, i8* %arrayidx10, align 1
 189   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 190   %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
 191   %tmp3 = load i8, i8* %arrayidx12, align 1
 192   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 193   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 194   ret void
 195 }
 196
 197 ; ALL-LABEL: @occupancy_8_over(
 198 ; SICI: alloca [33 x i8]
 199 ; GFX10-NOT: alloca
 200
 ; Kernel fixture bound to attribute group #6, the same group as @occupancy_8.
 ; The body matches @occupancy_8 except that the array is 33 bytes, one byte
 ; larger than the 32-byte array there.
 201 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 202 entry:
       ; One check line for this kernel matches the text "alloca [33 x i8]",
       ; so the array type here must stay in sync with it.
 203   %stack = alloca [33 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 204   %tmp = load i8, i8 addrspace(1)* %in, align 1
 205   %tmp4 = sext i8 %tmp to i64
 206   %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
 207   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 208   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 209   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 210   %tmp5 = sext i8 %tmp1 to i64
 211   %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
 212   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 213   %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
 214   %tmp2 = load i8, i8* %arrayidx10, align 1
 215   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 216   %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
 217   %tmp3 = load i8, i8* %arrayidx12, align 1
 218   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 219   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 220   ret void
 221 }
 222
 223 ; SI-LABEL: @occupancy_9(
 224 ; CI-LABEL: @occupancy_9(
 225 ; SI: alloca
 226 ; CI-NOT: alloca
 ; Kernel fixture bound to attribute group #7. It keeps a 28-byte i8 array in
 ; an alloca, stores the constants 4 and 5 at two indices loaded from %in, then
 ; copies bytes 0 and 1 of the array to %out[0] and %out[1].
 ; @occupancy_9_over is the companion kernel with a 29-byte array and the same
 ; attribute group.
 227 define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 228 entry:
 229   %stack = alloca [28 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 230   %tmp = load i8, i8 addrspace(1)* %in, align 1
 231   %tmp4 = sext i8 %tmp to i64
 232   %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
 233   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 234   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 235   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 236   %tmp5 = sext i8 %tmp1 to i64
 237   %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
 238   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 239   %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
 240   %tmp2 = load i8, i8* %arrayidx10, align 1
 241   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 242   %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
 243   %tmp3 = load i8, i8* %arrayidx12, align 1
 244   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 245   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 246   ret void
 247 }
 248
 249 ; ALL-LABEL: @occupancy_9_over(
 250 ; SICI: alloca [29 x i8]
 251 ; GFX10-NOT: alloca
 252
 ; Kernel fixture bound to attribute group #7, the same group as @occupancy_9.
 ; The body matches @occupancy_9 except that the array is 29 bytes, one byte
 ; larger than the 28-byte array there.
 253 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 254 entry:
       ; One check line for this kernel matches the text "alloca [29 x i8]",
       ; so the array type here must stay in sync with it.
 255   %stack = alloca [29 x i8], align 4
       ; The i8 loaded from %in[0] is sign-extended to i64 and used as the
       ; index of the first store.
 256   %tmp = load i8, i8 addrspace(1)* %in, align 1
 257   %tmp4 = sext i8 %tmp to i64
 258   %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
 259   store i8 4, i8* %arrayidx1, align 1
       ; Same pattern for the second store, with the index taken from %in[1].
 260   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 261   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 262   %tmp5 = sext i8 %tmp1 to i64
 263   %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
 264   store i8 5, i8* %arrayidx3, align 1
       ; Copy array bytes 0 and 1 to %out[0] and %out[1].
 265   %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
 266   %tmp2 = load i8, i8* %arrayidx10, align 1
 267   store i8 %tmp2, i8 addrspace(1)* %out, align 1
 268   %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
 269   %tmp3 = load i8, i8* %arrayidx12, align 1
 270   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 271   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 272   ret void
 273 }
 274
 ; Attribute groups referenced by the kernels in this file. Every group is
 ; nounwind; the string attributes are the per-kernel settings under test.
 ; The two-number values are presumably "min,max" pairs as described in the
 ; AMDGPU usage guide -- confirm there before editing them.
 ;
 ; Group #0 is used by @promote_alloca_size_63 (work-group-size attribute only).
 275 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
 ; Group #1 is used by @promote_alloca_size_256.
 276 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
 ; Group #2 is used by @promote_alloca_size_1600.
 277 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
 ; Groups #3 and #4 are used by @occupancy_0 and @occupancy_max respectively.
 ; NOTE(review): the two groups are textually identical and set no
 ; work-group-size attribute -- confirm that the duplication is intended.
 278 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
 279 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
 ; Group #5 is shared by @occupancy_6 and @occupancy_6_over.
 280 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
 ; Group #6 is shared by @occupancy_8 and @occupancy_8_over.
 281 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
 ; Group #7 is shared by @occupancy_9 and @occupancy_9_over.
 282 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }