1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
; promote_alloca_size_63: kernel in attribute group #0 with a [5 x i32] stack
; array. The first check line below forbids the 63-element addrspace(3) global
; in the SI run; the second requires it in the CI run (the one that passes
; -mcpu=tonga).
4 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
5 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
7 define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
9 %stack = alloca [5 x i32], align 4
; Store 4 and 5 into the array at two indices loaded from %in[0] and %in[1].
10 %0 = load i32, i32 addrspace(1)* %in, align 4
11 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
12 store i32 4, i32* %arrayidx1, align 4
13 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
14 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
15 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
16 store i32 5, i32* %arrayidx3, align 4
; Read array elements 0 and 1 back and write them to %out[0] and %out[1].
17 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
18 %2 = load i32, i32* %arrayidx10, align 4
19 store i32 %2, i32 addrspace(1)* %out, align 4
20 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
21 %3 = load i32, i32* %arrayidx12
22 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
23 store i32 %3, i32 addrspace(1)* %arrayidx13
; promote_alloca_size_256: kernel in attribute group #1 (waves-per-eu 1,3 and
; flat work-group size 256,256). Both runs must emit the [5 x i32] stack array
; as a 256-element addrspace(3) global named after the function and %stack.
27 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
29 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
31 %stack = alloca [5 x i32], align 4
; Store 4 and 5 into the array at two indices loaded from %in[0] and %in[1].
32 %0 = load i32, i32 addrspace(1)* %in, align 4
33 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
34 store i32 4, i32* %arrayidx1, align 4
35 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
36 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
37 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
38 store i32 5, i32* %arrayidx3, align 4
; Read array elements 0 and 1 back and write them to %out[0] and %out[1].
39 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
40 %2 = load i32, i32* %arrayidx10, align 4
41 store i32 %2, i32 addrspace(1)* %out, align 4
42 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
43 %3 = load i32, i32* %arrayidx12
44 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
45 store i32 %3, i32 addrspace(1)* %arrayidx13
; promote_alloca_size_1600: kernel in attribute group #2 (waves-per-eu 1,1 and
; flat work-group size 1600,1600). Both runs must emit the [5 x i32] stack
; array as a 1600-element addrspace(3) global named after the function and
; %stack.
49 ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
51 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
53 %stack = alloca [5 x i32], align 4
; Store 4 and 5 into the array at two indices loaded from %in[0] and %in[1].
54 %0 = load i32, i32 addrspace(1)* %in, align 4
55 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
56 store i32 4, i32* %arrayidx1, align 4
57 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
58 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
59 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
60 store i32 5, i32* %arrayidx3, align 4
; Read array elements 0 and 1 back and write them to %out[0] and %out[1].
61 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
62 %2 = load i32, i32* %arrayidx10, align 4
63 store i32 %2, i32 addrspace(1)* %out, align 4
64 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
65 %3 = load i32, i32* %arrayidx12
66 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
67 store i32 %3, i32 addrspace(1)* %arrayidx13
; occupancy_0: kernel in attribute group #3, which sets only waves-per-eu
; (1,10) and no work-group size. After the function label, the CI run must not
; contain the [5 x i32] alloca while the SI run must still contain it.
; NOTE(review): group #3 is textually identical to group #4 used by
; occupancy_max; confirm that "0" in the name is not meant to select a
; different waves-per-eu value.
71 ; ALL-LABEL: @occupancy_0(
72 ; CI-NOT: alloca [5 x i32]
73 ; SI: alloca [5 x i32]
74 define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
76 %stack = alloca [5 x i32], align 4
; Store 4 and 5 into the array at two indices loaded from %in[0] and %in[1].
77 %0 = load i32, i32 addrspace(1)* %in, align 4
78 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
79 store i32 4, i32* %arrayidx1, align 4
80 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
81 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
82 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
83 store i32 5, i32* %arrayidx3, align 4
; Read array elements 0 and 1 back and write them to %out[0] and %out[1].
84 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
85 %2 = load i32, i32* %arrayidx10, align 4
86 store i32 %2, i32 addrspace(1)* %out, align 4
87 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
88 %3 = load i32, i32* %arrayidx12
89 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
90 store i32 %3, i32 addrspace(1)* %arrayidx13
; occupancy_max: kernel in attribute group #4, which sets only waves-per-eu
; (1,10) and no work-group size. After the function label, the CI run must not
; contain the [5 x i32] alloca while the SI run must still contain it.
94 ; ALL-LABEL: @occupancy_max(
95 ; CI-NOT: alloca [5 x i32]
96 ; SI: alloca [5 x i32]
97 define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
99 %stack = alloca [5 x i32], align 4
; Store 4 and 5 into the array at two indices loaded from %in[0] and %in[1].
100 %0 = load i32, i32 addrspace(1)* %in, align 4
101 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
102 store i32 4, i32* %arrayidx1, align 4
103 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
104 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
105 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
106 store i32 5, i32* %arrayidx3, align 4
; Read array elements 0 and 1 back and write them to %out[0] and %out[1].
107 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
108 %2 = load i32, i32* %arrayidx10, align 4
109 store i32 %2, i32 addrspace(1)* %out, align 4
110 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
111 %3 = load i32, i32* %arrayidx12
112 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
113 store i32 %3, i32 addrspace(1)* %arrayidx13
; occupancy_6: kernel with a 42-byte i8 stack array in attribute group #5
; (waves-per-eu 1,6 and flat work-group size 64,64). In the lines visible here
; only the function label is checked, once per run.
; NOTE(review): presumably 42 bytes is the largest array still promoted at
; this occupancy, given the 43-byte occupancy_6_over variant - confirm.
117 ; SI-LABEL: @occupancy_6(
118 ; CI-LABEL: @occupancy_6(
121 define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
123 %stack = alloca [42 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
124 %tmp = load i8, i8 addrspace(1)* %in, align 1
125 %tmp4 = sext i8 %tmp to i64
126 %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
127 store i8 4, i8* %arrayidx1, align 1
128 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
129 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
130 %tmp5 = sext i8 %tmp1 to i64
131 %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
132 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
133 %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
134 %tmp2 = load i8, i8* %arrayidx10, align 1
135 store i8 %tmp2, i8 addrspace(1)* %out, align 1
136 %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
137 %tmp3 = load i8, i8* %arrayidx12, align 1
138 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
139 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; occupancy_6_over: same shape as occupancy_6 and same attribute group #5
; (waves-per-eu 1,6 and flat work-group size 64,64), but with a 43-byte array.
; Both runs must still contain the [43 x i8] alloca after the function label,
; i.e. the alloca is expected to survive -amdgpu-promote-alloca.
143 ; ALL-LABEL: @occupancy_6_over(
144 ; ALL: alloca [43 x i8]
145 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
147 %stack = alloca [43 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
148 %tmp = load i8, i8 addrspace(1)* %in, align 1
149 %tmp4 = sext i8 %tmp to i64
150 %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
151 store i8 4, i8* %arrayidx1, align 1
152 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
153 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
154 %tmp5 = sext i8 %tmp1 to i64
155 %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
156 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
157 %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
158 %tmp2 = load i8, i8* %arrayidx10, align 1
159 store i8 %tmp2, i8 addrspace(1)* %out, align 1
160 %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
161 %tmp3 = load i8, i8* %arrayidx12, align 1
162 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
163 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; occupancy_8: kernel with a 32-byte i8 stack array in attribute group #6
; (waves-per-eu 1,8 and flat work-group size 64,64). In the lines visible here
; only the function label is checked, once per run.
; NOTE(review): presumably 32 bytes is the largest array still promoted at
; this occupancy, given the 33-byte occupancy_8_over variant - confirm.
167 ; SI-LABEL: @occupancy_8(
168 ; CI-LABEL: @occupancy_8(
171 define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
173 %stack = alloca [32 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
174 %tmp = load i8, i8 addrspace(1)* %in, align 1
175 %tmp4 = sext i8 %tmp to i64
176 %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
177 store i8 4, i8* %arrayidx1, align 1
178 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
179 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
180 %tmp5 = sext i8 %tmp1 to i64
181 %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
182 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
183 %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
184 %tmp2 = load i8, i8* %arrayidx10, align 1
185 store i8 %tmp2, i8 addrspace(1)* %out, align 1
186 %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
187 %tmp3 = load i8, i8* %arrayidx12, align 1
188 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
189 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; occupancy_8_over: same shape as occupancy_8 and same attribute group #6
; (waves-per-eu 1,8 and flat work-group size 64,64), but with a 33-byte array.
; Both runs must still contain the [33 x i8] alloca after the function label,
; i.e. the alloca is expected to survive -amdgpu-promote-alloca.
193 ; ALL-LABEL: @occupancy_8_over(
194 ; ALL: alloca [33 x i8]
195 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
197 %stack = alloca [33 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
198 %tmp = load i8, i8 addrspace(1)* %in, align 1
199 %tmp4 = sext i8 %tmp to i64
200 %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
201 store i8 4, i8* %arrayidx1, align 1
202 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
203 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
204 %tmp5 = sext i8 %tmp1 to i64
205 %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
206 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
207 %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
208 %tmp2 = load i8, i8* %arrayidx10, align 1
209 store i8 %tmp2, i8 addrspace(1)* %out, align 1
210 %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
211 %tmp3 = load i8, i8* %arrayidx12, align 1
212 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
213 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; occupancy_9: kernel with a 28-byte i8 stack array in attribute group #7
; (waves-per-eu 1,9 and flat work-group size 64,64). In the lines visible here
; only the function label is checked, once per run.
; NOTE(review): presumably 28 bytes is the largest array still promoted at
; this occupancy, given the 29-byte occupancy_9_over variant - confirm.
217 ; SI-LABEL: @occupancy_9(
218 ; CI-LABEL: @occupancy_9(
221 define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
223 %stack = alloca [28 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
224 %tmp = load i8, i8 addrspace(1)* %in, align 1
225 %tmp4 = sext i8 %tmp to i64
226 %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
227 store i8 4, i8* %arrayidx1, align 1
228 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
229 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
230 %tmp5 = sext i8 %tmp1 to i64
231 %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
232 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
233 %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
234 %tmp2 = load i8, i8* %arrayidx10, align 1
235 store i8 %tmp2, i8 addrspace(1)* %out, align 1
236 %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
237 %tmp3 = load i8, i8* %arrayidx12, align 1
238 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
239 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; occupancy_9_over: same shape as occupancy_9 and same attribute group #7
; (waves-per-eu 1,9 and flat work-group size 64,64), but with a 29-byte array.
; Both runs must still contain the [29 x i8] alloca after the function label,
; i.e. the alloca is expected to survive -amdgpu-promote-alloca.
243 ; ALL-LABEL: @occupancy_9_over(
244 ; ALL: alloca [29 x i8]
245 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
247 %stack = alloca [29 x i8], align 4
; Store 4 and 5 at two indices that are sign-extended bytes loaded from %in.
248 %tmp = load i8, i8 addrspace(1)* %in, align 1
249 %tmp4 = sext i8 %tmp to i64
250 %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
251 store i8 4, i8* %arrayidx1, align 1
252 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
253 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
254 %tmp5 = sext i8 %tmp1 to i64
255 %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
256 store i8 5, i8* %arrayidx3, align 1
; Read array bytes 0 and 1 back and write them to %out[0] and %out[1].
257 %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
258 %tmp2 = load i8, i8* %arrayidx10, align 1
259 store i8 %tmp2, i8 addrspace(1)* %out, align 1
260 %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
261 %tmp3 = load i8, i8* %arrayidx12, align 1
262 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
263 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; Attribute groups referenced by the kernels in this file.
;   #0      work-group size pinned to 63, matching the [63 x [5 x i32]] global
;           that the checks for promote_alloca_size_63 mention. Spelled with
;           "amdgpu-flat-work-group-size"="min,max" like every other group
;           here, rather than the older "amdgpu-max-work-group-size" key.
;   #1, #2  work-group sizes 256 and 1600 with explicit waves-per-eu ranges.
;   #3, #4  waves-per-eu range only; no work-group size is given.
;   #5..#7  64-wide work groups with the waves-per-eu upper bound at 6, 8, 9.
267 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
268 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
269 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
270 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
271 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
272 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
273 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
274 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }