1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
3 ; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10,ALL %s
5 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
6 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
; Kernel with a [5 x i32] alloca (%stack) that receives two dynamically
; indexed stores and is then read back at constant indices 0 and 1.
;   %out - addrspace(1) i32 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i32 pointer; in[0] and in[1] supply the store indices.
; Uses attribute group #0 ("amdgpu-flat-work-group-size"="63,63"). The
; FileCheck lines preceding this definition expect the CI prefix to see a
; @promote_alloca_size_63.stack addrspace(3) global of type [63 x [5 x i32]],
; while the SI prefix forbids it (SI-NOT).
8 define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
10 %stack = alloca [5 x i32], align 4
; stack[in[0]] = 4
11 %0 = load i32, i32 addrspace(1)* %in, align 4
12 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
13 store i32 4, i32* %arrayidx1, align 4
; stack[in[1]] = 5
14 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
15 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
16 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
17 store i32 5, i32* %arrayidx3, align 4
; out[0] = stack[0]
18 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
19 %2 = load i32, i32* %arrayidx10, align 4
20 store i32 %2, i32 addrspace(1)* %out, align 4
; out[1] = stack[1]
21 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
22 %3 = load i32, i32* %arrayidx12
23 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
24 store i32 %3, i32 addrspace(1)* %arrayidx13
28 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
; Same [5 x i32] alloca access pattern as the other promote_alloca_size_*
; kernels: two stores at indices loaded from %in, two loads at indices 0 and 1
; written to %out.
;   %out - addrspace(1) i32 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i32 pointer; in[0] and in[1] supply the store indices.
; Uses attribute group #1 ("amdgpu-waves-per-eu"="1,3",
; "amdgpu-flat-work-group-size"="256,256"). The preceding ALL check expects
; every RUN configuration to see a @promote_alloca_size_256.stack addrspace(3)
; global of type [256 x [5 x i32]].
30 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
32 %stack = alloca [5 x i32], align 4
; stack[in[0]] = 4
33 %0 = load i32, i32 addrspace(1)* %in, align 4
34 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
35 store i32 4, i32* %arrayidx1, align 4
; stack[in[1]] = 5
36 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
37 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
38 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
39 store i32 5, i32* %arrayidx3, align 4
; out[0] = stack[0]
40 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
41 %2 = load i32, i32* %arrayidx10, align 4
42 store i32 %2, i32 addrspace(1)* %out, align 4
; out[1] = stack[1]
43 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
44 %3 = load i32, i32* %arrayidx12
45 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
46 store i32 %3, i32 addrspace(1)* %arrayidx13
50 ; SICI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
51 ; GFX10: alloca [5 x i32]
; Same [5 x i32] alloca access pattern, here with a 1600-wide flat work group.
;   %out - addrspace(1) i32 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i32 pointer; in[0] and in[1] supply the store indices.
; Uses attribute group #2 ("amdgpu-waves-per-eu"="1,1",
; "amdgpu-flat-work-group-size"="1600,1600"). The preceding checks expect the
; SICI prefix to see a @promote_alloca_size_1600.stack addrspace(3) global of
; type [1600 x [5 x i32]], and the GFX10 prefix to still see
; "alloca [5 x i32]" in the output.
53 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
55 %stack = alloca [5 x i32], align 4
; stack[in[0]] = 4
56 %0 = load i32, i32 addrspace(1)* %in, align 4
57 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
58 store i32 4, i32* %arrayidx1, align 4
; stack[in[1]] = 5
59 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
60 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
61 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
62 store i32 5, i32* %arrayidx3, align 4
; out[0] = stack[0]
63 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
64 %2 = load i32, i32* %arrayidx10, align 4
65 store i32 %2, i32 addrspace(1)* %out, align 4
; out[1] = stack[1]
66 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
67 %3 = load i32, i32* %arrayidx12
68 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
69 store i32 %3, i32 addrspace(1)* %arrayidx13
73 ; ALL-LABEL: @occupancy_0(
74 ; CI-NOT: alloca [5 x i32]
75 ; SI: alloca [5 x i32]
; Kernel with the [5 x i32] alloca access pattern and only a waves-per-EU
; constraint (no flat-work-group-size attribute).
;   %out - addrspace(1) i32 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i32 pointer; in[0] and in[1] supply the store indices.
; Uses attribute group #3 ("amdgpu-waves-per-eu"="1,10"). The preceding checks
; expect no "alloca [5 x i32]" under the CI prefix (CI-NOT) and expect the
; alloca to remain under the SI prefix.
; NOTE(review): despite the "_0" in the name, attribute group #3 is identical
; to #4 used by @occupancy_max - confirm this is intended.
76 define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
78 %stack = alloca [5 x i32], align 4
; stack[in[0]] = 4
79 %0 = load i32, i32 addrspace(1)* %in, align 4
80 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
81 store i32 4, i32* %arrayidx1, align 4
; stack[in[1]] = 5
82 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
83 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
84 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
85 store i32 5, i32* %arrayidx3, align 4
; out[0] = stack[0]
86 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
87 %2 = load i32, i32* %arrayidx10, align 4
88 store i32 %2, i32 addrspace(1)* %out, align 4
; out[1] = stack[1]
89 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
90 %3 = load i32, i32* %arrayidx12
91 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
92 store i32 %3, i32 addrspace(1)* %arrayidx13
96 ; ALL-LABEL: @occupancy_max(
97 ; CI-NOT: alloca [5 x i32]
98 ; SI: alloca [5 x i32]
; Kernel with the [5 x i32] alloca access pattern and a waves-per-EU range of
; 1 to 10 (no flat-work-group-size attribute).
;   %out - addrspace(1) i32 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i32 pointer; in[0] and in[1] supply the store indices.
; Uses attribute group #4 ("amdgpu-waves-per-eu"="1,10"). The preceding checks
; expect no "alloca [5 x i32]" under the CI prefix (CI-NOT) and expect the
; alloca to remain under the SI prefix.
; NOTE(review): attribute group #4 is identical to #3 used by @occupancy_0, so
; the two kernels exercise the same configuration - confirm this is intended.
99 define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
101 %stack = alloca [5 x i32], align 4
; stack[in[0]] = 4
102 %0 = load i32, i32 addrspace(1)* %in, align 4
103 %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
104 store i32 4, i32* %arrayidx1, align 4
; stack[in[1]] = 5
105 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
106 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
107 %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
108 store i32 5, i32* %arrayidx3, align 4
; out[0] = stack[0]
109 %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
110 %2 = load i32, i32* %arrayidx10, align 4
111 store i32 %2, i32 addrspace(1)* %out, align 4
; out[1] = stack[1]
112 %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
113 %3 = load i32, i32* %arrayidx12
114 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
115 store i32 %3, i32 addrspace(1)* %arrayidx13
119 ; SI-LABEL: @occupancy_6(
120 ; CI-LABEL: @occupancy_6(
; Byte-array variant of the alloca access pattern: a [42 x i8] alloca receives
; two stores at indices loaded from %in and is read back at indices 0 and 1.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #5 ("amdgpu-waves-per-eu"="1,6",
; "amdgpu-flat-work-group-size"="64,64"). Only SI-LABEL / CI-LABEL checks are
; visible before this definition.
; NOTE(review): [42 x i8] is one byte smaller than the array in
; @occupancy_6_over; presumably the largest size still promoted at this
; occupancy - confirm against the pass.
123 define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
125 %stack = alloca [42 x i8], align 4
; stack[sext(in[0])] = 4
126 %tmp = load i8, i8 addrspace(1)* %in, align 1
127 %tmp4 = sext i8 %tmp to i64
128 %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
129 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
130 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
131 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
132 %tmp5 = sext i8 %tmp1 to i64
133 %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
134 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
135 %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
136 %tmp2 = load i8, i8* %arrayidx10, align 1
137 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
138 %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
139 %tmp3 = load i8, i8* %arrayidx12, align 1
140 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
141 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
145 ; ALL-LABEL: @occupancy_6_over(
146 ; SICI: alloca [43 x i8]
; Same as @occupancy_6 but with a [43 x i8] alloca, one byte larger.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #5 ("amdgpu-waves-per-eu"="1,6",
; "amdgpu-flat-work-group-size"="64,64"). The preceding SICI check expects
; "alloca [43 x i8]" to still be present in the output.
149 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
151 %stack = alloca [43 x i8], align 4
; stack[sext(in[0])] = 4
152 %tmp = load i8, i8 addrspace(1)* %in, align 1
153 %tmp4 = sext i8 %tmp to i64
154 %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
155 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
156 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
157 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
158 %tmp5 = sext i8 %tmp1 to i64
159 %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
160 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
161 %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
162 %tmp2 = load i8, i8* %arrayidx10, align 1
163 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
164 %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
165 %tmp3 = load i8, i8* %arrayidx12, align 1
166 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
167 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
171 ; SI-LABEL: @occupancy_8(
172 ; CI-LABEL: @occupancy_8(
; Byte-array variant of the alloca access pattern with a [32 x i8] alloca:
; two stores at indices loaded from %in, then loads at indices 0 and 1.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #6 ("amdgpu-waves-per-eu"="1,8",
; "amdgpu-flat-work-group-size"="64,64"). Only SI-LABEL / CI-LABEL checks are
; visible before this definition.
; NOTE(review): [32 x i8] is one byte smaller than the array in
; @occupancy_8_over; presumably the largest size still promoted at this
; occupancy - confirm against the pass.
175 define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
177 %stack = alloca [32 x i8], align 4
; stack[sext(in[0])] = 4
178 %tmp = load i8, i8 addrspace(1)* %in, align 1
179 %tmp4 = sext i8 %tmp to i64
180 %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
181 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
182 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
183 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
184 %tmp5 = sext i8 %tmp1 to i64
185 %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
186 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
187 %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
188 %tmp2 = load i8, i8* %arrayidx10, align 1
189 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
190 %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
191 %tmp3 = load i8, i8* %arrayidx12, align 1
192 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
193 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
197 ; ALL-LABEL: @occupancy_8_over(
198 ; SICI: alloca [33 x i8]
; Same as @occupancy_8 but with a [33 x i8] alloca, one byte larger.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #6 ("amdgpu-waves-per-eu"="1,8",
; "amdgpu-flat-work-group-size"="64,64"). The preceding SICI check expects
; "alloca [33 x i8]" to still be present in the output.
201 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
203 %stack = alloca [33 x i8], align 4
; stack[sext(in[0])] = 4
204 %tmp = load i8, i8 addrspace(1)* %in, align 1
205 %tmp4 = sext i8 %tmp to i64
206 %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
207 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
208 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
209 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
210 %tmp5 = sext i8 %tmp1 to i64
211 %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
212 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
213 %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
214 %tmp2 = load i8, i8* %arrayidx10, align 1
215 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
216 %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
217 %tmp3 = load i8, i8* %arrayidx12, align 1
218 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
219 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
223 ; SI-LABEL: @occupancy_9(
224 ; CI-LABEL: @occupancy_9(
; Byte-array variant of the alloca access pattern with a [28 x i8] alloca:
; two stores at indices loaded from %in, then loads at indices 0 and 1.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #7 ("amdgpu-waves-per-eu"="1,9",
; "amdgpu-flat-work-group-size"="64,64"). Only SI-LABEL / CI-LABEL checks are
; visible before this definition.
; NOTE(review): [28 x i8] is one byte smaller than the array in
; @occupancy_9_over; presumably the largest size still promoted at this
; occupancy - confirm against the pass.
227 define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
229 %stack = alloca [28 x i8], align 4
; stack[sext(in[0])] = 4
230 %tmp = load i8, i8 addrspace(1)* %in, align 1
231 %tmp4 = sext i8 %tmp to i64
232 %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
233 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
234 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
235 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
236 %tmp5 = sext i8 %tmp1 to i64
237 %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
238 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
239 %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
240 %tmp2 = load i8, i8* %arrayidx10, align 1
241 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
242 %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
243 %tmp3 = load i8, i8* %arrayidx12, align 1
244 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
245 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
249 ; ALL-LABEL: @occupancy_9_over(
250 ; SICI: alloca [29 x i8]
; Same as @occupancy_9 but with a [29 x i8] alloca, one byte larger.
;   %out - addrspace(1) i8 pointer; receives stack[0] and stack[1].
;   %in  - addrspace(1) i8 pointer; in[0] and in[1] supply the store indices
;          (each sign-extended from i8 to i64 before indexing).
; Uses attribute group #7 ("amdgpu-waves-per-eu"="1,9",
; "amdgpu-flat-work-group-size"="64,64"). The preceding SICI check expects
; "alloca [29 x i8]" to still be present in the output.
253 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
255 %stack = alloca [29 x i8], align 4
; stack[sext(in[0])] = 4
256 %tmp = load i8, i8 addrspace(1)* %in, align 1
257 %tmp4 = sext i8 %tmp to i64
258 %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
259 store i8 4, i8* %arrayidx1, align 1
; stack[sext(in[1])] = 5
260 %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
261 %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
262 %tmp5 = sext i8 %tmp1 to i64
263 %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
264 store i8 5, i8* %arrayidx3, align 1
; out[0] = stack[0]
265 %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
266 %tmp2 = load i8, i8* %arrayidx10, align 1
267 store i8 %tmp2, i8 addrspace(1)* %out, align 1
; out[1] = stack[1]
268 %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
269 %tmp3 = load i8, i8* %arrayidx12, align 1
270 %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
271 store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
; Attribute groups referenced by the kernels in this file. Each one pins a
; flat work-group size and/or a waves-per-EU range:
;   #0 - @promote_alloca_size_63   (work-group size 63, no waves-per-eu)
;   #1 - @promote_alloca_size_256  (work-group size 256, waves-per-eu 1,3)
;   #2 - @promote_alloca_size_1600 (work-group size 1600, waves-per-eu 1,1)
;   #3 - @occupancy_0              (waves-per-eu 1,10, no work-group size)
;   #4 - @occupancy_max            (waves-per-eu 1,10, no work-group size)
;   #5 - @occupancy_6 / @occupancy_6_over (work-group size 64, waves-per-eu 1,6)
;   #6 - @occupancy_8 / @occupancy_8_over (work-group size 64, waves-per-eu 1,8)
;   #7 - @occupancy_9 / @occupancy_9_over (work-group size 64, waves-per-eu 1,9)
; NOTE(review): #3 and #4 are textually identical - confirm that @occupancy_0
; and @occupancy_max are meant to test the same configuration.
275 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
276 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
277 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
278 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
279 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
280 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
281 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
282 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }