; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,GCN,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s

define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
; SI-LABEL: local_size_x:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x6
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_x:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x18
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_x:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MOV * T1.X, KC0[1].Z,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.x() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
; SI-LABEL: local_size_y:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x7
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_y:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x1c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_y:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MOV * T1.X, KC0[1].W,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.y() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
; SI-LABEL: local_size_z:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_z:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_z:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MOV * T1.X, KC0[2].X,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.z() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xy:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x6
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s4, s6, s7
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_xy:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x18
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_xy:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[1].W,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %val = mul i32 %x, %y
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xz:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[4:5], 0x6
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s4, s2, s6
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_xz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[4:5], 0x18
; VI-NEXT: s_load_dword s3, s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_xz:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[2].X,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %val = mul i32 %x, %z
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_yz:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x7
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s0, s0, s1
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_yz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x1c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_yz:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: MULLO_INT * T1.X, KC0[1].W, KC0[2].X,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %val = mul i32 %y, %z
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xyz:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x6
; SI-NEXT: s_load_dword s2, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s4, s6, s7
; SI-NEXT: s_add_i32 s4, s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_xyz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x18
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, s6
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_xyz:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: MULLO_INT * T0.X, KC0[1].Z, KC0[1].W,
; R600-NEXT: ADD_INT T0.X, PS, KC0[2].X,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %xy = mul i32 %x, %y
  %xyz = add i32 %xy, %z
  store i32 %xyz, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_x_known_bits:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x6
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_x_known_bits:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x18
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_x_known_bits:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y,
; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.x() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_y_known_bits:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x7
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_y_known_bits:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x1c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_y_known_bits:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: AND_INT * T1.X, KC0[1].W, literal.y,
; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.y() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_z_known_bits:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_size_z_known_bits:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; R600-LABEL: local_size_z_known_bits:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT: AND_INT * T1.X, KC0[2].X, literal.y,
; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.z() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0

attributes #0 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: