; Pipeline under test: amdgpu-lower-kernel-attributes followed by instcombine
; and infer-alignment; the printed IR is verified with FileCheck.
; (A second, byte-identical invocation of the same command was removed; it
; re-ran exactly the same pipeline and checks.)
1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
; Data layout string "n32": 32 bits is declared as the native integer width.
4 target datalayout = "n32"
; Kernel tagged with !reqd_work_group_size !1, which has only two elements
; (i32 8, i32 16) rather than the three elements of !0.
6 ; CHECK-LABEL: @invalid_reqd_work_group_size(
8 define amdgpu_kernel void @invalid_reqd_work_group_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !1 {
9 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; i16 load at byte offset 4 from the dispatch pointer, stored to %out.
10 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
11 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
12 store i16 %group.size.x, ptr addrspace(1) %out
; The i16 load at byte offset 4 is volatile. The expected output still
; contains a volatile i16 load, i.e. it is not replaced by a constant even
; though the kernel carries !reqd_work_group_size !0.
16 ; CHECK-LABEL: @volatile_load_group_size_x(
17 ; CHECK: load volatile i16,
18 define amdgpu_kernel void @volatile_load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
19 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
20 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
21 %group.size.x = load volatile i16, ptr addrspace(4) %gep.group.size.x, align 4
22 store i16 %group.size.x, ptr addrspace(1) %out
; With !reqd_work_group_size !0 = {8, 16, 2}, the i16 load at byte offset 4
; from the dispatch pointer is expected to fold, so the constant 8 (first
; element of !0) is stored directly.
; NOTE(review): offsets 4/6/8 presumably correspond to the workgroup size
; x/y/z fields of the HSA kernel dispatch packet - confirm against the spec.
26 ; CHECK-LABEL: @load_group_size_x(
27 ; CHECK-NEXT: store i16 8,
28 define amdgpu_kernel void @load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
29 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
30 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
31 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
32 store i16 %group.size.x, ptr addrspace(1) %out
; Same as @load_group_size_x but for the i16 at byte offset 6; the expected
; stored constant is 16 (second element of !0).
36 ; CHECK-LABEL: @load_group_size_y(
37 ; CHECK-NEXT: store i16 16,
38 define amdgpu_kernel void @load_group_size_y(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
39 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
40 %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6
41 %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
42 store i16 %group.size.y, ptr addrspace(1) %out
; Same as @load_group_size_x but for the i16 at byte offset 8; the expected
; stored constant is 2 (third element of !0).
46 ; CHECK-LABEL: @load_group_size_z(
47 ; CHECK-NEXT: store i16 2,
48 define amdgpu_kernel void @load_group_size_z(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
49 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
50 %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8
51 %group.size.z = load i16, ptr addrspace(4) %gep.group.size.z, align 4
52 store i16 %group.size.z, ptr addrspace(1) %out
56 ; Metadata uses i64 instead of i32
; (!2 = {i64 8, i64 16, i64 2}). The i16 load at byte offset 4 is still
; expected to fold to the constant 8.
57 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
58 ; CHECK-NEXT: store i16 8,
59 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(ptr addrspace(1) %out) #0 !reqd_work_group_size !2 {
60 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
61 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
62 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
63 store i16 %group.size.x, ptr addrspace(1) %out
67 ; Metadata uses i16 instead of i32
; (!3 = {i16 8, i16 16, i16 2}). The i16 load at byte offset 4 is still
; expected to fold to the constant 8.
68 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
69 ; CHECK-NEXT: store i16 8,
70 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(ptr addrspace(1) %out) #0 !reqd_work_group_size !3 {
71 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
72 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
73 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
74 store i16 %group.size.x, ptr addrspace(1) %out
; Computes umin(grid.size.x - workgroup.id.x * group.size.x, group.size.x)
; from dispatch-pointer loads and stores the result zero-extended to i64.
; With !reqd_work_group_size !0 and attribute group #0
; ("uniform-work-group-size"="true"), the whole expression is expected to
; fold to the constant 8.
78 ; CHECK-LABEL: @use_local_size_x_8_16_2(
79 ; CHECK-NEXT: store i64 8,
80 define amdgpu_kernel void @use_local_size_x_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
81 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; Group size: i16 at byte offset 4.
82 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
83 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
; Grid size: i32 at byte offset 12.
84 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
85 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
86 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
87 %group.size.x.zext = zext i16 %group.size.x to i32
88 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
89 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
90 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
91 %zext = zext i32 %umin to i64
92 store i64 %zext, ptr addrspace(1) %out
; Y-dimension variant of @use_local_size_x_8_16_2: group size is the i16 at
; byte offset 6, grid size the i32 at byte offset 16, combined with
; workgroup.id.y. The expression is expected to fold to the constant 16.
96 ; CHECK-LABEL: @use_local_size_y_8_16_2(
97 ; CHECK-NEXT: store i64 16,
98 define amdgpu_kernel void @use_local_size_y_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
99 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
100 %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6
101 %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
102 %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 16
103 %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
104 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
105 %group.size.y.zext = zext i16 %group.size.y to i32
106 %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
107 %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
108 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.y.zext)
109 %zext = zext i32 %umin to i64
110 store i64 %zext, ptr addrspace(1) %out
; Z-dimension variant of @use_local_size_x_8_16_2: group size is the i16 at
; byte offset 8, grid size the i32 at byte offset 20, combined with
; workgroup.id.z. The expression is expected to fold to the constant 2.
114 ; CHECK-LABEL: @use_local_size_z_8_16_2(
115 ; CHECK-NEXT: store i64 2,
116 define amdgpu_kernel void @use_local_size_z_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
117 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
118 %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8
119 %group.size.z = load i16, ptr addrspace(4) %gep.group.size.z, align 4
120 %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 20
121 %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
122 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
123 %group.size.z.zext = zext i16 %group.size.z to i32
124 %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
125 %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
126 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.z.zext)
127 %zext = zext i32 %umin to i64
128 store i64 %zext, ptr addrspace(1) %out
132 ; Simplification of the umin is invalid, but we can still eliminate the
133 ; load of the group size.
; Here the x group size (offset 4) and x grid size (offset 12) are combined
; with workgroup.id.y instead of workgroup.id.x. The expected output keeps
; the workgroup.id.y call, and the multiply by the group size appears as a
; shift left by 3 (i.e. multiplication by the constant 8).
135 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
136 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
137 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
138 define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
139 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
140 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
141 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
142 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
143 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
144 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
145 %group.size.x.zext = zext i16 %group.size.x to i32
146 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
147 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
148 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
149 %zext = zext i32 %umin to i64
150 store i64 %zext, ptr addrspace(1) %out
; The value named grid.size.x is loaded from byte offset 16 instead of the
; offset 12 used in @use_local_size_x_8_16_2. The expected output keeps the
; grid-size load and the workgroup.id.x call; only the group size is
; replaced by a constant (the multiply appears as a shift left by 3).
154 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
155 ; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
156 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
157 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
158 define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
159 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
160 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
161 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
162 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 16
163 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
164 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
165 %group.size.x.zext = zext i16 %group.size.x to i32
166 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
167 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
168 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
169 %zext = zext i32 %umin to i64
170 store i64 %zext, ptr addrspace(1) %out
; Uses a signed minimum (llvm.smin) where @use_local_size_x_8_16_2 uses
; llvm.umin. The expected output keeps the grid-size load, the sub and the
; smin call; the group size operand becomes the constant 8.
174 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
175 ; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
176 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
177 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
178 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
179 ; CHECK: %smin = call i32 @llvm.smin.i32(i32 %sub, i32 8)
180 define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
181 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
182 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
183 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
184 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
185 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
186 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
187 %group.size.x.zext = zext i16 %group.size.x to i32
188 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
189 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
190 %smin = call i32 @llvm.smin.i32(i32 %sub, i32 %group.size.x.zext)
191 %zext = zext i32 %smin to i64
192 store i64 %zext, ptr addrspace(1) %out
; Uses an unsigned maximum (llvm.umax) where @use_local_size_x_8_16_2 uses
; llvm.umin. The expected output keeps the sub and the umax call; the group
; size operand becomes the constant 8.
196 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
197 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
198 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
199 ; CHECK: %umax = call i32 @llvm.umax.i32(i32 %sub, i32 8)
200 ; CHECK: %zext = zext i32 %umax to i64
201 define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
202 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
203 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
204 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
205 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
206 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
207 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
208 %group.size.x.zext = zext i16 %group.size.x to i32
209 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
210 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
211 %umax = call i32 @llvm.umax.i32(i32 %sub, i32 %group.size.x.zext)
212 %zext = zext i32 %umax to i64
213 store i64 %zext, ptr addrspace(1) %out
; The grid size at byte offset 12 is loaded as i16 and zero-extended, where
; @use_local_size_x_8_16_2 loads it as i32. The expected output keeps the
; i16 grid-size load, its zext and the sub; the group size is replaced by a
; constant (the multiply appears as a shift left by 3).
217 ; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
218 ; CHECK: %grid.size.x = load i16, ptr addrspace(4) %gep.grid.size.x, align 4
219 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
220 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
221 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
222 ; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
223 define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
224 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
225 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
226 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
227 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
228 %grid.size.x = load i16, ptr addrspace(4) %gep.grid.size.x, align 4
229 %grid.size.x.zext = zext i16 %grid.size.x to i32
230 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
231 %group.size.x.zext = zext i16 %group.size.x to i32
232 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
233 %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
234 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
235 %zext = zext i32 %umin to i64
236 store i64 %zext, ptr addrspace(1) %out
; Non-kernel function (plain `define i32`, no amdgpu_kernel calling
; convention) carrying !reqd_work_group_size !0. The i16 load at byte
; offset 4 is still expected to fold, giving `ret i32 8`.
240 ; CHECK-LABEL: @func_group_size_x(
241 ; CHECK-NEXT: ret i32 8
242 define i32 @func_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
243 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
244 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
245 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
246 %zext = zext i16 %group.size.x to i32
; Non-kernel function using attribute group #1 and !reqd_work_group_size !0.
; It switches on %arg; the phis in bb25 merge the per-dimension grid size,
; group size and workgroup id coming from %bb1, %bb9 and %bb17 with default
; values (0, 1, 0) coming from %bb. The expected output has the group-size
; phi as an i32 phi over the constants 2, 16, 8 and 1.
250 ; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
251 ; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
252 define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
254 %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
255 switch i32 %arg, label %bb25 [
; workgroup.id.x, i32 at byte offset 12, i16 at byte offset 4.
262 %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
263 %tmp3 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 12
264 %tmp5 = load i32, ptr addrspace(4) %tmp3, align 4
265 %tmp6 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 4
266 %tmp8 = load i16, ptr addrspace(4) %tmp6, align 4
; workgroup.id.y, i32 at byte offset 16, i16 at byte offset 6.
270 %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
271 %tmp11 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 16
272 %tmp13 = load i32, ptr addrspace(4) %tmp11, align 8
273 %tmp14 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 6
274 %tmp16 = load i16, ptr addrspace(4) %tmp14, align 2
; workgroup.id.z, i32 at byte offset 20, i16 at byte offset 8.
278 %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
279 %tmp19 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 20
280 %tmp21 = load i32, ptr addrspace(4) %tmp19, align 4
281 %tmp22 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 8
282 %tmp24 = load i16, ptr addrspace(4) %tmp22, align 8
285 bb25: ; preds = %bb17, %bb9, %bb1, %bb
286 %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
287 %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
288 %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
; umin(grid size - workgroup id * group size, group size), widened to i64.
289 %tmp29 = zext i16 %group.size to i32
290 %tmp30 = mul i32 %tmp28, %tmp29
291 %tmp31 = sub i32 %tmp26, %tmp30
292 %umin = call i32 @llvm.umin.i32(i32 %tmp31, i32 %tmp29)
293 %tmp34 = zext i32 %umin to i64
; Evaluates the umin(grid size - workgroup id * group size, group size)
; pattern for x, y and z in a single kernel, sharing one dispatch pointer,
; and stores each result with a volatile store. All three are expected to
; fold to the constants of !0: 8, 16 and 2.
297 ; CHECK-LABEL: @all_local_size(
298 ; CHECK-NEXT: store volatile i64 8, ptr addrspace(1) %out, align 4
299 ; CHECK-NEXT: store volatile i64 16, ptr addrspace(1) %out, align 4
300 ; CHECK-NEXT: store volatile i64 2, ptr addrspace(1) %out, align 4
301 define amdgpu_kernel void @all_local_size(ptr addrspace(1) nocapture readnone %out) #0 !reqd_work_group_size !0 {
302 %tmp.i = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
; X: workgroup.id.x, i32 at byte offset 12, i16 at byte offset 4.
303 %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
304 %tmp3.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 12
305 %tmp5.i = load i32, ptr addrspace(4) %tmp3.i, align 4
306 %tmp6.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 4
307 %tmp8.i = load i16, ptr addrspace(4) %tmp6.i, align 4
308 %tmp29.i = zext i16 %tmp8.i to i32
309 %tmp30.i = mul i32 %tmp2.i, %tmp29.i
310 %tmp31.i = sub i32 %tmp5.i, %tmp30.i
311 %umin0 = call i32 @llvm.umin.i32(i32 %tmp31.i, i32 %tmp29.i)
312 %tmp34.i = zext i32 %umin0 to i64
; Y: workgroup.id.y, i32 at byte offset 16, i16 at byte offset 6.
313 %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
314 %tmp11.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 16
315 %tmp13.i = load i32, ptr addrspace(4) %tmp11.i, align 8
316 %tmp14.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 6
317 %tmp16.i = load i16, ptr addrspace(4) %tmp14.i, align 2
318 %tmp29.i9 = zext i16 %tmp16.i to i32
319 %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
320 %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
321 %umin1 = call i32 @llvm.umin.i32(i32 %tmp31.i11, i32 %tmp29.i9)
322 %tmp34.i14 = zext i32 %umin1 to i64
; Z: workgroup.id.z, i32 at byte offset 20, i16 at byte offset 8.
323 %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
324 %tmp19.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 20
325 %tmp21.i = load i32, ptr addrspace(4) %tmp19.i, align 4
326 %tmp22.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 8
327 %tmp24.i = load i16, ptr addrspace(4) %tmp22.i, align 8
328 %tmp29.i2 = zext i16 %tmp24.i to i32
329 %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
330 %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
331 %umin2 = call i32 @llvm.umin.i32(i32 %tmp31.i4, i32 %tmp29.i2)
332 %tmp34.i7 = zext i32 %umin2 to i64
; Results are stored in x, y, z order.
333 store volatile i64 %tmp34.i, ptr addrspace(1) %out, align 4
334 store volatile i64 %tmp34.i14, ptr addrspace(1) %out, align 4
335 store volatile i64 %tmp34.i7, ptr addrspace(1) %out, align 4
339 ; TODO: Should be able to handle this, but not much reason to.
; Only a single byte (i8) is loaded from byte offset 4, not the full i16.
; The expected output keeps the load; its alignment changes from 1 in the
; input to 4 in the expected output.
340 ; CHECK-LABEL: @partial_load_group_size_x(
341 ; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
342 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
343 ; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 4
344 ; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1
345 define amdgpu_kernel void @partial_load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
346 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
347 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
348 %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 1
349 store i8 %group.size.x.lo, ptr addrspace(1) %out
; Variant of @partial_load_group_size_x where the dispatch.ptr call site
; carries an explicit `align 2` return attribute. The expected output keeps
; that attribute and the i8 load, with the load's alignment going from 1 in
; the input to 2.
353 ; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
354 ; CHECK-NEXT: %dispatch.ptr = tail call align 2 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
355 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
356 ; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 2
357 ; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1
358 define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
359 %dispatch.ptr = tail call align 2 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
360 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
361 %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 1
362 store i8 %group.size.x.lo, ptr addrspace(1) %out
366 ; TODO: Should be able to handle this
; A single i32 load at byte offset 4, covering the bytes that other tests in
; this file read as two separate i16 loads (offsets 4 and 6). The expected
; output keeps the i32 load and stores its result.
367 ; CHECK-LABEL: @load_group_size_xy_i32(
368 ; CHECK: %group.size.xy = load i32,
369 ; CHECK: store i32 %group.size.xy
370 define amdgpu_kernel void @load_group_size_xy_i32(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
371 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
372 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
373 %group.size.xy = load i32, ptr addrspace(4) %gep.group.size.x, align 4
374 store i32 %group.size.xy, ptr addrspace(1) %out
; Two separate calls to llvm.amdgcn.dispatch.ptr in one kernel, each feeding
; one i16 load (offset 4 via the first call, offset 6 via the second). Both
; loads are expected to fold, leaving volatile stores of 8 and 16.
378 ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
379 ; CHECK-NEXT: store volatile i16 8, ptr addrspace(1) %out, align 2
380 ; CHECK-NEXT: store volatile i16 16, ptr addrspace(1) %out, align 2
381 define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
382 %dispatch.ptr0 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
383 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr0, i64 4
384 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
385 store volatile i16 %group.size.x, ptr addrspace(1) %out
387 %dispatch.ptr1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
388 %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr1, i64 6
389 %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
390 store volatile i16 %group.size.y, ptr addrspace(1) %out
; No !reqd_work_group_size metadata; attribute group #2 sets
; "uniform-work-group-size"="true". The umin pattern is expected to reduce
; to the zero-extended group-size load: the i16 load at byte offset 4
; remains, while the grid-size load, workgroup id, mul, sub and umin are
; absent from the expected output.
395 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
396 ; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
397 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
398 ; CHECK-NEXT: %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
399 ; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
400 ; CHECK-NEXT: store i64 %zext, ptr addrspace(1) %out, align 4
401 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(ptr addrspace(1) %out) #2 {
402 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
403 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
404 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
405 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
406 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
407 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
408 %group.size.x.zext = zext i16 %group.size.x to i32
409 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
410 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
411 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
412 %zext = zext i32 %umin to i64
413 store i64 %zext, ptr addrspace(1) %out
; Same IR as @use_local_size_x_uniform_work_group_size, but attribute group
; #3 sets "uniform-work-group-size"="false". The llvm.umin call is expected
; to remain in the output.
417 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
418 ; CHECK: call i32 @llvm.umin
419 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(ptr addrspace(1) %out) #3 {
420 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
421 %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
422 %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
423 %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
424 %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
425 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
426 %group.size.x.zext = zext i16 %group.size.x to i32
427 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
428 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
429 %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
430 %zext = zext i32 %umin to i64
431 store i64 %zext, ptr addrspace(1) %out
; Kernel with no attribute group and no metadata whose only visible
; instruction is a dispatch.ptr call with no users. The expected output has
; `ret void` directly after the function header, i.e. the call is gone.
435 ; CHECK-LABEL: @no_use_dispatch_ptr(
436 ; CHECK-NEXT: ret void
437 define amdgpu_kernel void @no_use_dispatch_ptr() {
438 %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; Declarations of the intrinsics called by the tests in this file; every one
; of them uses attribute group #1.
442 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
443 declare i32 @llvm.amdgcn.workgroup.id.x() #1
444 declare i32 @llvm.amdgcn.workgroup.id.y() #1
445 declare i32 @llvm.amdgcn.workgroup.id.z() #1
446 declare i32 @llvm.umin.i32(i32, i32) #1
447 declare i32 @llvm.smin.i32(i32, i32) #1
448 declare i32 @llvm.umax.i32(i32, i32) #1
; Attribute groups.
;   #0 - used by the tests that also carry !reqd_work_group_size.
;   #1 - used by the intrinsic declarations and @__ockl_get_local_size_reqd_size.
;   #2 - same contents as #0; used by @use_local_size_x_uniform_work_group_size.
;   #3 - "uniform-work-group-size" set to "false"; used by
;        @use_local_size_x_uniform_work_group_size_false.
450 attributes #0 = { nounwind "uniform-work-group-size"="true" }
451 attributes #1 = { nounwind readnone speculatable }
452 attributes #2 = { nounwind "uniform-work-group-size"="true" }
453 attributes #3 = { nounwind "uniform-work-group-size"="false" }
; Metadata nodes referenced through !reqd_work_group_size.
;   !0 - three i32 elements {8, 16, 2}; used by most tests.
;   !1 - only two elements; used by @invalid_reqd_work_group_size.
;   !2 - same values as !0 but typed i64.
;   !3 - same values as !0 but typed i16.
455 !0 = !{i32 8, i32 16, i32 2}
456 !1 = !{i32 8, i32 16}
457 !2 = !{i64 8, i64 16, i64 2}
458 !3 = !{i16 8, i16 16, i16 2}