1 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
2 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
3 ; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s
4 ; RUN: opt -S -mcpu=gfx1100 -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10PLUS,ALL %s
; Kernel promote_alloca_size_63, attribute group #0. The two directives below
; concern a global named @promote_alloca_size_63.stack of type
; [63 x [5 x i32]] in addrspace(3). It must not appear for the SI prefix and
; must appear for the CI prefix.
6 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
7 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
9 define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
; Five-element i32 array in addrspace(5), named %stack. The expected global
; name above ends in ".stack".
11 %stack = alloca [5 x i32], align 4, addrspace(5)
; Store 4 at an index loaded from %in[0], so the index is not a constant.
12 %0 = load i32, ptr addrspace(1) %in, align 4
13 %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
14 store i32 4, ptr addrspace(5) %arrayidx1, align 4
; Store 5 at a second index, loaded from %in[1].
15 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
16 %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
17 %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
18 store i32 5, ptr addrspace(5) %arrayidx3, align 4
; Copy element 0 of %stack to %out[0].
19 %2 = load i32, ptr addrspace(5) %stack, align 4
20 store i32 %2, ptr addrspace(1) %out, align 4
; Copy element 1 of %stack to %out[1].
21 %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
22 %3 = load i32, ptr addrspace(5) %arrayidx12
23 %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
24 store i32 %3, ptr addrspace(1) %arrayidx13
; Kernel promote_alloca_size_256, attribute group #1. The directive below uses
; the prefix shared by every run line, so each configuration is expected to
; emit @promote_alloca_size_256.stack as a [256 x [5 x i32]] global in
; addrspace(3).
28 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4
30 define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
; Five-element i32 array in addrspace(5); its name supplies the ".stack"
; suffix of the expected global.
32 %stack = alloca [5 x i32], align 4, addrspace(5)
; First store: value 4 at a runtime index read from %in[0].
33 %0 = load i32, ptr addrspace(1) %in, align 4
34 %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
35 store i32 4, ptr addrspace(5) %arrayidx1, align 4
; Second store: value 5 at a runtime index read from %in[1].
36 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
37 %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
38 %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
39 store i32 5, ptr addrspace(5) %arrayidx3, align 4
; Read back elements 0 and 1 and write them to %out[0] and %out[1].
40 %2 = load i32, ptr addrspace(5) %stack, align 4
41 store i32 %2, ptr addrspace(1) %out, align 4
42 %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
43 %3 = load i32, ptr addrspace(5) %arrayidx12
44 %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
45 store i32 %3, ptr addrspace(1) %arrayidx13
; Kernel promote_alloca_size_1600, attribute group #2. For the SI prefix no
; @promote_alloca_size_1600.stack global may appear. The CI and GFX10PLUS
; prefixes both expect it as a [1024 x [5 x i32]] global in addrspace(3).
; NOTE(review): the kernel name says 1600, while the expected array length and
; the work-group size string in attribute group #2 both use 1024. Confirm the
; name is intentional.
49 ; SI-NOT: @promote_alloca_size_1600.stack
50 ; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
51 ; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
53 define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
; Five-element i32 array in addrspace(5), named %stack.
55 %stack = alloca [5 x i32], align 4, addrspace(5)
; Store 4 at the index held in %in[0].
56 %0 = load i32, ptr addrspace(1) %in, align 4
57 %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
58 store i32 4, ptr addrspace(5) %arrayidx1, align 4
; Store 5 at the index held in %in[1].
59 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
60 %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
61 %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
62 store i32 5, ptr addrspace(5) %arrayidx3, align 4
; Write elements 0 and 1 of %stack to %out[0] and %out[1].
63 %2 = load i32, ptr addrspace(5) %stack, align 4
64 store i32 %2, ptr addrspace(1) %out, align 4
65 %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
66 %3 = load i32, ptr addrspace(5) %arrayidx12
67 %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
68 store i32 %3, ptr addrspace(1) %arrayidx13
; Kernel occupancy_0, attribute group #3, which sets only a waves-per-eu
; string and no work-group size. After the label match, the CI prefix expects
; no "alloca [5 x i32]" in the output, while the SI prefix expects that alloca
; to still be present. No GFX10PLUS expectation other than the shared label is
; visible for this kernel.
72 ; ALL-LABEL: @occupancy_0(
73 ; CI-NOT: alloca [5 x i32]
74 ; SI: alloca [5 x i32]
75 define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
; The [5 x i32] alloca in addrspace(5) that the directives above look for.
77 %stack = alloca [5 x i32], align 4, addrspace(5)
; Store 4 at a runtime index read from %in[0].
78 %0 = load i32, ptr addrspace(1) %in, align 4
79 %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
80 store i32 4, ptr addrspace(5) %arrayidx1, align 4
; Store 5 at a runtime index read from %in[1].
81 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
82 %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
83 %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
84 store i32 5, ptr addrspace(5) %arrayidx3, align 4
; Copy elements 0 and 1 of %stack to %out[0] and %out[1].
85 %2 = load i32, ptr addrspace(5) %stack, align 4
86 store i32 %2, ptr addrspace(1) %out, align 4
87 %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
88 %3 = load i32, ptr addrspace(5) %arrayidx12
89 %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
90 store i32 %3, ptr addrspace(1) %arrayidx13
; Kernel occupancy_max, attribute group #4. The expectations mirror
; occupancy_0. After the label match, the CI prefix expects no
; "alloca [5 x i32]" and the SI prefix expects the alloca to remain.
; NOTE(review): attribute groups #3 and #4 are textually identical in this
; file, so this kernel and occupancy_0 run with the same settings. Confirm
; that is intended.
94 ; ALL-LABEL: @occupancy_max(
95 ; CI-NOT: alloca [5 x i32]
96 ; SI: alloca [5 x i32]
97 define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
; The [5 x i32] alloca in addrspace(5) that the directives above look for.
99 %stack = alloca [5 x i32], align 4, addrspace(5)
; Store 4 at the index loaded from %in[0].
100 %0 = load i32, ptr addrspace(1) %in, align 4
101 %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
102 store i32 4, ptr addrspace(5) %arrayidx1, align 4
; Store 5 at the index loaded from %in[1].
103 %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
104 %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
105 %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
106 store i32 5, ptr addrspace(5) %arrayidx3, align 4
; Write elements 0 and 1 of %stack out to %out[0] and %out[1].
107 %2 = load i32, ptr addrspace(5) %stack, align 4
108 store i32 %2, ptr addrspace(1) %out, align 4
109 %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
110 %3 = load i32, ptr addrspace(5) %arrayidx12
111 %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
112 store i32 %3, ptr addrspace(1) %arrayidx13
; Kernel occupancy_6, attribute group #5. It uses a 42-byte i8 array. Only the
; two label directives are visible for this kernel here; any further
; expectations are not shown in this view.
116 ; SI-LABEL: @occupancy_6(
117 ; CI-LABEL: @occupancy_6(
120 define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
; [42 x i8] array in addrspace(5). occupancy_6_over uses the same attribute
; group with a [43 x i8] array.
122 %stack = alloca [42 x i8], align 4, addrspace(5)
; Store 4 at an index loaded as i8 from %in[0] and sign-extended to i64.
123 %tmp = load i8, ptr addrspace(1) %in, align 1
124 %tmp4 = sext i8 %tmp to i64
125 %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
126 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at a second sign-extended index, loaded from %in[1].
127 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
128 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
129 %tmp5 = sext i8 %tmp1 to i64
130 %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
131 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Copy bytes 0 and 1 of %stack to %out[0] and %out[1].
132 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
133 store i8 %tmp2, ptr addrspace(1) %out, align 1
134 %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
135 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
136 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
137 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Kernel occupancy_6_over, attribute group #5 (shared with occupancy_6). The
; array is one byte larger than in occupancy_6. After the label match, the
; SICI prefix expects "alloca [43 x i8]" to remain in the output, and the
; GFX10PLUS prefix expects no alloca at all.
141 ; ALL-LABEL: @occupancy_6_over(
142 ; SICI: alloca [43 x i8]
143 ; GFX10PLUS-NOT: alloca
145 define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
; [43 x i8] array in addrspace(5); this is the alloca named in the directives.
147 %stack = alloca [43 x i8], align 4, addrspace(5)
; Store 4 at an i8 index from %in[0], sign-extended to i64.
148 %tmp = load i8, ptr addrspace(1) %in, align 1
149 %tmp4 = sext i8 %tmp to i64
150 %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
151 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at an i8 index from %in[1], sign-extended to i64.
152 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
153 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
154 %tmp5 = sext i8 %tmp1 to i64
155 %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
156 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Write bytes 0 and 1 of %stack to %out[0] and %out[1].
157 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
158 store i8 %tmp2, ptr addrspace(1) %out, align 1
159 %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
160 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
161 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
162 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Kernel occupancy_8, attribute group #6. It uses a 32-byte i8 array. Only the
; two label directives are visible for this kernel here; any further
; expectations are not shown in this view.
166 ; SI-LABEL: @occupancy_8(
167 ; CI-LABEL: @occupancy_8(
170 define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
; [32 x i8] array in addrspace(5). occupancy_8_over uses the same attribute
; group with a [33 x i8] array.
172 %stack = alloca [32 x i8], align 4, addrspace(5)
; Store 4 at an index loaded as i8 from %in[0] and sign-extended to i64.
173 %tmp = load i8, ptr addrspace(1) %in, align 1
174 %tmp4 = sext i8 %tmp to i64
175 %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
176 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at a second sign-extended index, loaded from %in[1].
177 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
178 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
179 %tmp5 = sext i8 %tmp1 to i64
180 %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
181 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Copy bytes 0 and 1 of %stack to %out[0] and %out[1].
182 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
183 store i8 %tmp2, ptr addrspace(1) %out, align 1
184 %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
185 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
186 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
187 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Kernel occupancy_8_over, attribute group #6 (shared with occupancy_8). The
; array is one byte larger than in occupancy_8. After the label match, the
; SICI prefix expects "alloca [33 x i8]" to remain in the output, and the
; GFX10PLUS prefix expects no alloca at all.
191 ; ALL-LABEL: @occupancy_8_over(
192 ; SICI: alloca [33 x i8]
193 ; GFX10PLUS-NOT: alloca
195 define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
; [33 x i8] array in addrspace(5); this is the alloca named in the directives.
197 %stack = alloca [33 x i8], align 4, addrspace(5)
; Store 4 at an i8 index from %in[0], sign-extended to i64.
198 %tmp = load i8, ptr addrspace(1) %in, align 1
199 %tmp4 = sext i8 %tmp to i64
200 %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
201 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at an i8 index from %in[1], sign-extended to i64.
202 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
203 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
204 %tmp5 = sext i8 %tmp1 to i64
205 %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
206 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Write bytes 0 and 1 of %stack to %out[0] and %out[1].
207 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
208 store i8 %tmp2, ptr addrspace(1) %out, align 1
209 %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
210 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
211 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
212 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Kernel occupancy_9, attribute group #7. It uses a 28-byte i8 array. Only the
; two label directives are visible for this kernel here; any further
; expectations are not shown in this view.
216 ; SI-LABEL: @occupancy_9(
217 ; CI-LABEL: @occupancy_9(
220 define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
; [28 x i8] array in addrspace(5). occupancy_9_over uses the same attribute
; group with a [29 x i8] array.
222 %stack = alloca [28 x i8], align 4, addrspace(5)
; Store 4 at an index loaded as i8 from %in[0] and sign-extended to i64.
223 %tmp = load i8, ptr addrspace(1) %in, align 1
224 %tmp4 = sext i8 %tmp to i64
225 %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
226 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at a second sign-extended index, loaded from %in[1].
227 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
228 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
229 %tmp5 = sext i8 %tmp1 to i64
230 %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
231 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Copy bytes 0 and 1 of %stack to %out[0] and %out[1].
232 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
233 store i8 %tmp2, ptr addrspace(1) %out, align 1
234 %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
235 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
236 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
237 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Kernel occupancy_9_over, attribute group #7 (shared with occupancy_9). The
; array is one byte larger than in occupancy_9. After the label match, the
; SICI prefix expects "alloca [29 x i8]" to remain in the output, and the
; GFX10PLUS prefix expects no alloca at all.
241 ; ALL-LABEL: @occupancy_9_over(
242 ; SICI: alloca [29 x i8]
243 ; GFX10PLUS-NOT: alloca
245 define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
; [29 x i8] array in addrspace(5); this is the alloca named in the directives.
247 %stack = alloca [29 x i8], align 4, addrspace(5)
; Store 4 at an i8 index from %in[0], sign-extended to i64.
248 %tmp = load i8, ptr addrspace(1) %in, align 1
249 %tmp4 = sext i8 %tmp to i64
250 %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
251 store i8 4, ptr addrspace(5) %arrayidx1, align 1
; Store 5 at an i8 index from %in[1], sign-extended to i64.
252 %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
253 %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
254 %tmp5 = sext i8 %tmp1 to i64
255 %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
256 store i8 5, ptr addrspace(5) %arrayidx3, align 1
; Write bytes 0 and 1 of %stack to %out[0] and %out[1].
257 %tmp2 = load i8, ptr addrspace(5) %stack, align 1
258 store i8 %tmp2, ptr addrspace(1) %out, align 1
259 %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
260 %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
261 %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
262 store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
; Attribute groups referenced by the kernels above. Every group is nounwind;
; they differ only in the "amdgpu-flat-work-group-size" and
; "amdgpu-waves-per-eu" strings they set.
; #0 is used by promote_alloca_size_63. It sets a work-group size string of
; 63,63 and has no waves-per-eu entry.
266 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="63,63" }
; #1 is used by promote_alloca_size_256.
267 attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
; #2 is used by promote_alloca_size_1600. NOTE(review): the work-group size
; string here is 1024,1024 rather than 1600 as the kernel name suggests.
; Confirm this is intended.
268 attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="1024,1024" }
; #3 (occupancy_0) and #4 (occupancy_max) set only waves-per-eu and are
; textually identical. NOTE(review): confirm the duplication is intended.
269 attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
270 attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
; #5, #6 and #7 are used by the occupancy_6, occupancy_8 and occupancy_9
; kernel pairs. All three set the work-group size string to 64,64 and differ
; only in the waves-per-eu upper value (6, 8, 9).
271 attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
272 attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
273 attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }