test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

   1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
   2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
   3
   ; Test case promote_alloca_size_63: kernel in attribute group #0
   ; ("amdgpu-max-work-group-size"="63") that writes a private [5 x i32] array
   ; at two indices loaded from %in and then reads elements 0 and 1 back.
   ; Expectations encoded in the two check lines that follow:
   ;   * run without -mcpu (prefix SI)  - the addrspace(3) global
   ;     @promote_alloca_size_63.stack is expected to be absent;
   ;   * run with -mcpu=tonga (prefix CI) - that global is expected, sized
   ;     [63 x [5 x i32]].
   4 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
   5 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
   6
   7 define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
   8 entry:
   9   %stack = alloca [5 x i32], align 4
       ; stack[in[0]] = 4   (index only known at run time)
  10   %0 = load i32, i32 addrspace(1)* %in, align 4
  11   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  12   store i32 4, i32* %arrayidx1, align 4
       ; stack[in[1]] = 5
  13   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  14   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  15   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  16   store i32 5, i32* %arrayidx3, align 4
       ; out[0] = stack[0]
  17   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  18   %2 = load i32, i32* %arrayidx10, align 4
  19   store i32 %2, i32 addrspace(1)* %out, align 4
       ; out[1] = stack[1]
  20   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  21   %3 = load i32, i32* %arrayidx12
  22   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  23   store i32 %3, i32 addrspace(1)* %arrayidx13
  24   ret void
  25 }
  26
  ; Test case promote_alloca_size_256: kernel in attribute group #1
  ; ("amdgpu-waves-per-eu"="1,3", "amdgpu-flat-work-group-size"="256,256")
  ; using the same private [5 x i32] access pattern as the other i32 kernels
  ; in this file. Both runs (shared prefix ALL) expect the addrspace(3) global
  ; @promote_alloca_size_256.stack, sized [256 x [5 x i32]].
  27 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
  28
  29 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
  30 entry:
  31   %stack = alloca [5 x i32], align 4
       ; stack[in[0]] = 4   (index only known at run time)
  32   %0 = load i32, i32 addrspace(1)* %in, align 4
  33   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  34   store i32 4, i32* %arrayidx1, align 4
       ; stack[in[1]] = 5
  35   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  36   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  37   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  38   store i32 5, i32* %arrayidx3, align 4
       ; out[0] = stack[0]
  39   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  40   %2 = load i32, i32* %arrayidx10, align 4
  41   store i32 %2, i32 addrspace(1)* %out, align 4
       ; out[1] = stack[1]
  42   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  43   %3 = load i32, i32* %arrayidx12
  44   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  45   store i32 %3, i32 addrspace(1)* %arrayidx13
  46   ret void
  47 }
  48
  ; Test case promote_alloca_size_1600: kernel in attribute group #2
  ; ("amdgpu-waves-per-eu"="1,1", "amdgpu-flat-work-group-size"="1600,1600")
  ; using a private [5 x i32] array with run-time indices. Both runs (shared
  ; prefix ALL) expect the addrspace(3) global @promote_alloca_size_1600.stack,
  ; sized [1600 x [5 x i32]].
  49 ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
  50
  51 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
  52 entry:
  53   %stack = alloca [5 x i32], align 4
       ; stack[in[0]] = 4   (index only known at run time)
  54   %0 = load i32, i32 addrspace(1)* %in, align 4
  55   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  56   store i32 4, i32* %arrayidx1, align 4
       ; stack[in[1]] = 5
  57   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  58   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  59   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  60   store i32 5, i32* %arrayidx3, align 4
       ; out[0] = stack[0]
  61   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  62   %2 = load i32, i32* %arrayidx10, align 4
  63   store i32 %2, i32 addrspace(1)* %out, align 4
       ; out[1] = stack[1]
  64   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  65   %3 = load i32, i32* %arrayidx12
  66   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  67   store i32 %3, i32 addrspace(1)* %arrayidx13
  68   ret void
  69 }
  70
  ; Test case occupancy_0: kernel in attribute group #3
  ; ("amdgpu-waves-per-eu"="1,10", no work-group-size attribute) using a
  ; private [5 x i32] array with run-time indices.
  ; Expectations encoded in the check lines that follow:
  ;   * run with -mcpu=tonga (prefix CI) - no "alloca [5 x i32]" after the label;
  ;   * run without -mcpu (prefix SI)    - "alloca [5 x i32]" is still present.
  71 ; ALL-LABEL: @occupancy_0(
  72 ; CI-NOT: alloca [5 x i32]
  73 ; SI: alloca [5 x i32]
  74 define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
  75 entry:
  76   %stack = alloca [5 x i32], align 4
       ; stack[in[0]] = 4   (index only known at run time)
  77   %0 = load i32, i32 addrspace(1)* %in, align 4
  78   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  79   store i32 4, i32* %arrayidx1, align 4
       ; stack[in[1]] = 5
  80   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  81   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  82   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  83   store i32 5, i32* %arrayidx3, align 4
       ; out[0] = stack[0]
  84   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  85   %2 = load i32, i32* %arrayidx10, align 4
  86   store i32 %2, i32 addrspace(1)* %out, align 4
       ; out[1] = stack[1]
  87   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  88   %3 = load i32, i32* %arrayidx12
  89   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  90   store i32 %3, i32 addrspace(1)* %arrayidx13
  91   ret void
  92 }
  93
  ; Test case occupancy_max: kernel in attribute group #4
  ; ("amdgpu-waves-per-eu"="1,10", no work-group-size attribute) using a
  ; private [5 x i32] array with run-time indices.
  ; Expectations encoded in the check lines that follow:
  ;   * run with -mcpu=tonga (prefix CI) - no "alloca [5 x i32]" after the label;
  ;   * run without -mcpu (prefix SI)    - "alloca [5 x i32]" is still present.
  94 ; ALL-LABEL: @occupancy_max(
  95 ; CI-NOT: alloca [5 x i32]
  96 ; SI: alloca [5 x i32]
  97 define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
  98 entry:
  99   %stack = alloca [5 x i32], align 4
       ; stack[in[0]] = 4   (index only known at run time)
 100   %0 = load i32, i32 addrspace(1)* %in, align 4
 101   %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
 102   store i32 4, i32* %arrayidx1, align 4
       ; stack[in[1]] = 5
 103   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
 104   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
 105   %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
 106   store i32 5, i32* %arrayidx3, align 4
       ; out[0] = stack[0]
 107   %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
 108   %2 = load i32, i32* %arrayidx10, align 4
 109   store i32 %2, i32 addrspace(1)* %out, align 4
       ; out[1] = stack[1]
 110   %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
 111   %3 = load i32, i32* %arrayidx12
 112   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
 113   store i32 %3, i32 addrspace(1)* %arrayidx13
 114   ret void
 115 }
 116
 ; Test case occupancy_6: kernel in attribute group #5
 ; ("amdgpu-waves-per-eu"="1,6", "amdgpu-flat-work-group-size"="64,64") using
 ; a private [42 x i8] array indexed by sign-extended bytes loaded from %in.
 ; Expectations encoded in the check lines that follow:
 ;   * run without -mcpu (prefix SI)    - an alloca is still present;
 ;   * run with -mcpu=tonga (prefix CI) - no alloca after the label.
 117 ; SI-LABEL: @occupancy_6(
 118 ; CI-LABEL: @occupancy_6(
 119 ; SI: alloca
 120 ; CI-NOT: alloca
 121 define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 122 entry:
 123   %stack = alloca [42 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 124   %tmp = load i8, i8 addrspace(1)* %in, align 1
 125   %tmp4 = sext i8 %tmp to i64
 126   %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
 127   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 128   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 129   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 130   %tmp5 = sext i8 %tmp1 to i64
 131   %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
 132   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 133   %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
 134   %tmp2 = load i8, i8* %arrayidx10, align 1
 135   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 136   %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
 137   %tmp3 = load i8, i8* %arrayidx12, align 1
 138   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 139   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 140   ret void
 141 }
 142
 ; Test case occupancy_6_over: same attribute group (#5) and access pattern as
 ; @occupancy_6, but the private array is [43 x i8], one byte larger than the
 ; [42 x i8] used there. Both runs (shared prefix ALL) expect
 ; "alloca [43 x i8]" to remain in the output.
 143 ; ALL-LABEL: @occupancy_6_over(
 144 ; ALL: alloca [43 x i8]
 145 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 146 entry:
 147   %stack = alloca [43 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 148   %tmp = load i8, i8 addrspace(1)* %in, align 1
 149   %tmp4 = sext i8 %tmp to i64
 150   %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
 151   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 152   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 153   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 154   %tmp5 = sext i8 %tmp1 to i64
 155   %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
 156   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 157   %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
 158   %tmp2 = load i8, i8* %arrayidx10, align 1
 159   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 160   %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
 161   %tmp3 = load i8, i8* %arrayidx12, align 1
 162   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 163   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 164   ret void
 165 }
 166
 ; Test case occupancy_8: kernel in attribute group #6
 ; ("amdgpu-waves-per-eu"="1,8", "amdgpu-flat-work-group-size"="64,64") using
 ; a private [32 x i8] array indexed by sign-extended bytes loaded from %in.
 ; Expectations encoded in the check lines that follow:
 ;   * run without -mcpu (prefix SI)    - an alloca is still present;
 ;   * run with -mcpu=tonga (prefix CI) - no alloca after the label.
 167 ; SI-LABEL: @occupancy_8(
 168 ; CI-LABEL: @occupancy_8(
 169 ; SI: alloca
 170 ; CI-NOT: alloca
 171 define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 172 entry:
 173   %stack = alloca [32 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 174   %tmp = load i8, i8 addrspace(1)* %in, align 1
 175   %tmp4 = sext i8 %tmp to i64
 176   %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
 177   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 178   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 179   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 180   %tmp5 = sext i8 %tmp1 to i64
 181   %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
 182   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 183   %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
 184   %tmp2 = load i8, i8* %arrayidx10, align 1
 185   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 186   %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
 187   %tmp3 = load i8, i8* %arrayidx12, align 1
 188   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 189   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 190   ret void
 191 }
 192
 ; Test case occupancy_8_over: same attribute group (#6) and access pattern as
 ; @occupancy_8, but the private array is [33 x i8], one byte larger than the
 ; [32 x i8] used there. Both runs (shared prefix ALL) expect
 ; "alloca [33 x i8]" to remain in the output.
 193 ; ALL-LABEL: @occupancy_8_over(
 194 ; ALL: alloca [33 x i8]
 195 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 196 entry:
 197   %stack = alloca [33 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 198   %tmp = load i8, i8 addrspace(1)* %in, align 1
 199   %tmp4 = sext i8 %tmp to i64
 200   %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
 201   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 202   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 203   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 204   %tmp5 = sext i8 %tmp1 to i64
 205   %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
 206   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 207   %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
 208   %tmp2 = load i8, i8* %arrayidx10, align 1
 209   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 210   %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
 211   %tmp3 = load i8, i8* %arrayidx12, align 1
 212   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 213   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 214   ret void
 215 }
 216
 ; Test case occupancy_9: kernel in attribute group #7
 ; ("amdgpu-waves-per-eu"="1,9", "amdgpu-flat-work-group-size"="64,64") using
 ; a private [28 x i8] array indexed by sign-extended bytes loaded from %in.
 ; Expectations encoded in the check lines that follow:
 ;   * run without -mcpu (prefix SI)    - an alloca is still present;
 ;   * run with -mcpu=tonga (prefix CI) - no alloca after the label.
 217 ; SI-LABEL: @occupancy_9(
 218 ; CI-LABEL: @occupancy_9(
 219 ; SI: alloca
 220 ; CI-NOT: alloca
 221 define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 222 entry:
 223   %stack = alloca [28 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 224   %tmp = load i8, i8 addrspace(1)* %in, align 1
 225   %tmp4 = sext i8 %tmp to i64
 226   %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
 227   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 228   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 229   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 230   %tmp5 = sext i8 %tmp1 to i64
 231   %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
 232   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 233   %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
 234   %tmp2 = load i8, i8* %arrayidx10, align 1
 235   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 236   %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
 237   %tmp3 = load i8, i8* %arrayidx12, align 1
 238   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 239   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 240   ret void
 241 }
 242
 ; Test case occupancy_9_over: same attribute group (#7) and access pattern as
 ; @occupancy_9, but the private array is [29 x i8], one byte larger than the
 ; [28 x i8] used there. Both runs (shared prefix ALL) expect
 ; "alloca [29 x i8]" to remain in the output.
 243 ; ALL-LABEL: @occupancy_9_over(
 244 ; ALL: alloca [29 x i8]
 245 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 246 entry:
 247   %stack = alloca [29 x i8], align 4
       ; stack[sext(in[0])] = 4   (index only known at run time)
 248   %tmp = load i8, i8 addrspace(1)* %in, align 1
 249   %tmp4 = sext i8 %tmp to i64
 250   %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
 251   store i8 4, i8* %arrayidx1, align 1
       ; stack[sext(in[1])] = 5
 252   %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
 253   %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
 254   %tmp5 = sext i8 %tmp1 to i64
 255   %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
 256   store i8 5, i8* %arrayidx3, align 1
       ; out[0] = stack[0]
 257   %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
 258   %tmp2 = load i8, i8* %arrayidx10, align 1
 259   store i8 %tmp2, i8 addrspace(1)* %out, align 1
       ; out[1] = stack[1]
 260   %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
 261   %tmp3 = load i8, i8* %arrayidx12, align 1
 262   %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
 263   store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
 264   ret void
 265 }
 266
 ; Attribute groups referenced by the kernels in this file:
 ;   #0 -> @promote_alloca_size_63
 ;   #1 -> @promote_alloca_size_256
 ;   #2 -> @promote_alloca_size_1600
 ;   #3 -> @occupancy_0
 ;   #4 -> @occupancy_max
 ;   #5 -> @occupancy_6, @occupancy_6_over
 ;   #6 -> @occupancy_8, @occupancy_8_over
 ;   #7 -> @occupancy_9, @occupancy_9_over
 ; NOTE(review): #3 and #4 are textually identical, although the kernel names
 ; (occupancy_0 vs. occupancy_max) suggest different values may have been
 ; intended - confirm.
 ; NOTE(review): #0 uses "amdgpu-max-work-group-size" with a single value,
 ; while #1, #2 and #5-#7 use "amdgpu-flat-work-group-size" with a "min,max"
 ; pair - confirm the older spelling is intentional here.
 267 attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
 268 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
 269 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
 270 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
 271 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
 272 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
 273 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
 274 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }