1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; Intrinsic called by every kernel in this file to obtain the x component of
; the workitem id, which is then negated to form a store address.
6 declare i32 @llvm.amdgcn.workitem.id.x() #0
; 256 x i32 array in address space 3 with an undef initializer. In the lines
; visible here only @write_ds_sub0_offset0_global and its _clamp_bit variant
; index into it; the other kernels build their pointers with inttoptr.
8 @lds.obj = addrspace(3) global [256 x i32] undef, align 4
; Stores 123 to @lds.obj at element index (0 - workitem.id.x) + 3: the first
; getelementptr applies the negated id, the second adds 3 i32 elements.
; Expected output for bonaire, gfx900 and gfx1010: the id is shifted left by 2,
; subtracted from 0, and the 3-element bump appears as offset 12 on the
; ds_write_b32 (0x7b is 123).
10 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
11 ; CI-LABEL: write_ds_sub0_offset0_global:
12 ; CI: ; %bb.0: ; %entry
13 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
15 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
16 ; CI-NEXT: s_mov_b32 m0, -1
17 ; CI-NEXT: ds_write_b32 v0, v1 offset:12
20 ; GFX9-LABEL: write_ds_sub0_offset0_global:
21 ; GFX9: ; %bb.0: ; %entry
22 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
23 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
25 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
28 ; GFX10-LABEL: write_ds_sub0_offset0_global:
29 ; GFX10: ; %bb.0: ; %entry
30 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
31 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
32 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
33 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
34 ; GFX10-NEXT: s_endpgm
36 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
37 %sub1 = sub i32 0, %x.i
38 %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
39 %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
40 store i32 123, i32 addrspace(3)* %arrayidx
; Performs the same store as @write_ds_sub0_offset0_global and additionally
; calls llvm.amdgcn.div.fmas.f32 with %dummy.val as all three float operands
; and a false i1 operand, storing the result volatile through the null
; address space 1 pointer.
; In the expected bonaire output the subtract writes vcc, and vcc is then set
; to 0 before v_div_fmas_f32 is issued.
; NOTE(review): "clamp_bit" presumably refers to the clamp operand of the
; selected subtract instruction -- confirm against the change that added
; this test.
44 define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
45 ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
46 ; CI: ; %bb.0: ; %entry
47 ; CI-NEXT: s_load_dword s0, s[0:1], 0x9
48 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
49 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
50 ; CI-NEXT: s_mov_b64 vcc, 0
51 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
52 ; CI-NEXT: s_waitcnt lgkmcnt(0)
53 ; CI-NEXT: v_mov_b32_e32 v1, s0
54 ; CI-NEXT: s_mov_b32 s0, 0
55 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
56 ; CI-NEXT: s_mov_b32 m0, -1
57 ; CI-NEXT: s_mov_b32 s3, 0xf000
58 ; CI-NEXT: s_mov_b32 s2, -1
59 ; CI-NEXT: s_mov_b32 s1, s0
60 ; CI-NEXT: ds_write_b32 v0, v2 offset:12
61 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
62 ; CI-NEXT: s_waitcnt vmcnt(0)
65 ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
66 ; GFX9: ; %bb.0: ; %entry
67 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
68 ; GFX9-NEXT: s_mov_b64 vcc, 0
69 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
70 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
71 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
74 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
75 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
76 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
77 ; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
78 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
79 ; GFX9-NEXT: s_waitcnt vmcnt(0)
82 ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
83 ; GFX10: ; %bb.0: ; %entry
84 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
85 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
86 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
87 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
88 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
89 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
90 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
91 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
92 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
93 ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
94 ; GFX10-NEXT: global_store_dword v[0:1], v4, off
95 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
96 ; GFX10-NEXT: s_endpgm
98 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
99 %sub1 = sub i32 0, %x.i
100 %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
101 %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
102 store i32 123, i32 addrspace(3)* %arrayidx
103 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
104 store volatile float %fmas, float addrspace(1)* null
; Stores the byte 13 to the address 65535 + ((0 - workitem.id.x) << 2), which
; is computed as an i32 and converted with inttoptr to an address space 3
; pointer.
; Expected output for bonaire, gfx900 and gfx1010: the shifted id is subtracted
; from 0 and 65535 is carried in the offset field of ds_write_b8.
108 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 {
109 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset:
111 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
112 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
113 ; CI-NEXT: v_mov_b32_e32 v1, 13
114 ; CI-NEXT: s_mov_b32 m0, -1
115 ; CI-NEXT: ds_write_b8 v0, v1 offset:65535
118 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset:
120 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
121 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
122 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
123 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
124 ; GFX9-NEXT: s_endpgm
126 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset:
128 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
129 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
130 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
131 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
132 ; GFX10-NEXT: s_endpgm
133 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
134 %neg = sub i32 0, %x.i
135 %shl = shl i32 %neg, 2
136 %add = add i32 65535, %shl
137 %ptr = inttoptr i32 %add to i8 addrspace(3)*
138 store i8 13, i8 addrspace(3)* %ptr
; Same address computation as @add_x_shl_neg_to_sub_max_offset but with the
; constant 65536 instead of 65535.
; Expected output for bonaire, gfx900 and gfx1010: 0x10000 is the constant
; operand of the subtract and ds_write_b8 is emitted without an offset field.
142 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
143 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
145 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
146 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x10000, v0
147 ; CI-NEXT: v_mov_b32_e32 v1, 13
148 ; CI-NEXT: s_mov_b32 m0, -1
149 ; CI-NEXT: ds_write_b8 v0, v1
152 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
154 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
155 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x10000, v0
156 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
157 ; GFX9-NEXT: ds_write_b8 v0, v1
158 ; GFX9-NEXT: s_endpgm
160 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
162 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
163 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
164 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
165 ; GFX10-NEXT: ds_write_b8 v0, v1
166 ; GFX10-NEXT: s_endpgm
167 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
168 %neg = sub i32 0, %x.i
169 %shl = shl i32 %neg, 2
170 %add = add i32 65536, %shl
171 %ptr = inttoptr i32 %add to i8 addrspace(3)*
172 store i8 13, i8 addrspace(3)* %ptr
; Adds two different constants (123 and 456) to the same value
; (0 - workitem.id.x) << 2 and does a volatile i32 store of 13 through each
; resulting address space 3 pointer.
; Expected output for bonaire, gfx900 and gfx1010: one subtract from 0 feeds
; both stores, with 123 and 456 placed in the offset fields of the two
; ds_write_b32 instructions.
176 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
177 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
179 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
180 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
181 ; CI-NEXT: v_mov_b32_e32 v1, 13
182 ; CI-NEXT: s_mov_b32 m0, -1
183 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
184 ; CI-NEXT: ds_write_b32 v0, v1 offset:456
187 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
189 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
190 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
191 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
192 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
193 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:456
194 ; GFX9-NEXT: s_endpgm
196 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
198 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
199 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
200 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
201 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
202 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
203 ; GFX10-NEXT: s_endpgm
204 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
205 %neg = sub i32 0, %x.i
206 %shl = shl i32 %neg, 2
207 %add0 = add i32 123, %shl
208 %add1 = add i32 456, %shl
209 %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)*
210 store volatile i32 13, i32 addrspace(3)* %ptr0
211 %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)*
212 store volatile i32 13, i32 addrspace(3)* %ptr1
; Forms a single pointer 123 + ((0 - workitem.id.x) << 2) and does two
; volatile i32 stores of 13 through it, so the add has two users with the
; same constant.
; Expected output for bonaire, gfx900 and gfx1010: one subtract from 0 and two
; ds_write_b32 instructions that both carry offset 123.
216 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
217 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
219 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
220 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
221 ; CI-NEXT: v_mov_b32_e32 v1, 13
222 ; CI-NEXT: s_mov_b32 m0, -1
223 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
224 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
227 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
229 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
230 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
231 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
232 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
233 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
234 ; GFX9-NEXT: s_endpgm
236 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
238 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
239 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
240 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
241 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
242 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
243 ; GFX10-NEXT: s_endpgm
244 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
245 %neg = sub i32 0, %x.i
246 %shl = shl i32 %neg, 2
247 %add = add i32 123, %shl
248 %ptr = inttoptr i32 %add to i32 addrspace(3)*
249 store volatile i32 13, i32 addrspace(3)* %ptr
250 store volatile i32 13, i32 addrspace(3)* %ptr
; Stores the i64 value 123 with only 4-byte alignment to the address
; 1019 + ((0 - workitem.id.x) << 2), built as an i32 and converted with
; inttoptr to an address space 3 pointer.
; Expected output for bonaire, gfx900 and gfx1010: 0x3fb (1019) is the constant
; operand of the subtract, and the store is a ds_write2_b32 with offset1:1
; writing the two 32-bit halves 0x7b and 0.
254 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
255 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
257 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
258 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
259 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
260 ; CI-NEXT: v_mov_b32_e32 v2, 0
261 ; CI-NEXT: s_mov_b32 m0, -1
262 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
265 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
267 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
268 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
269 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
270 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
271 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
272 ; GFX9-NEXT: s_endpgm
274 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
276 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
277 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
278 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
279 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0
280 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
281 ; GFX10-NEXT: s_endpgm
282 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
283 %neg = sub i32 0, %x.i
284 %shl = shl i32 %neg, 2
285 %add = add i32 1019, %shl
286 %ptr = inttoptr i32 %add to i64 addrspace(3)*
287 store i64 123, i64 addrspace(3)* %ptr, align 4
; Performs the same align-4 i64 store as
; @add_x_shl_neg_to_sub_misaligned_i64_max_offset and additionally calls
; llvm.amdgcn.div.fmas.f32 with %dummy.val as all three float operands and a
; false i1 operand, storing the result volatile through the null address
; space 1 pointer.
; In the expected bonaire output the subtract writes vcc, and vcc is then set
; to 0 before v_div_fmas_f32 is issued.
; NOTE(review): "clamp_bit" presumably refers to the clamp operand of the
; selected subtract instruction -- confirm against the change that added
; this test.
291 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
292 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
294 ; CI-NEXT: s_load_dword s0, s[0:1], 0x9
295 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
296 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
297 ; CI-NEXT: s_mov_b64 vcc, 0
298 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
299 ; CI-NEXT: s_waitcnt lgkmcnt(0)
300 ; CI-NEXT: v_mov_b32_e32 v1, s0
301 ; CI-NEXT: s_mov_b32 s0, 0
302 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
303 ; CI-NEXT: v_mov_b32_e32 v3, 0
304 ; CI-NEXT: s_mov_b32 m0, -1
305 ; CI-NEXT: s_mov_b32 s3, 0xf000
306 ; CI-NEXT: s_mov_b32 s2, -1
307 ; CI-NEXT: s_mov_b32 s1, s0
308 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
309 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
310 ; CI-NEXT: s_waitcnt vmcnt(0)
313 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
315 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
316 ; GFX9-NEXT: s_mov_b64 vcc, 0
317 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
318 ; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
319 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
320 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
321 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
322 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
323 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
324 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
325 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
326 ; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
327 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
328 ; GFX9-NEXT: s_waitcnt vmcnt(0)
329 ; GFX9-NEXT: s_endpgm
331 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
333 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
334 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
335 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
336 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
337 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
338 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
339 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
340 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
341 ; GFX10-NEXT: ds_write2_b32 v2, v3, v4 offset1:1
342 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
343 ; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
344 ; GFX10-NEXT: global_store_dword v[0:1], v5, off
345 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
346 ; GFX10-NEXT: s_endpgm
347 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
348 %neg = sub i32 0, %x.i
349 %shl = shl i32 %neg, 2
350 %add = add i32 1019, %shl
351 %ptr = inttoptr i32 %add to i64 addrspace(3)*
352 store i64 123, i64 addrspace(3)* %ptr, align 4
353 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
354 store volatile float %fmas, float addrspace(1)* null
; Same align-4 i64 store as @add_x_shl_neg_to_sub_misaligned_i64_max_offset
; but with the constant 1020 instead of 1019.
; Expected output for bonaire, gfx900 and gfx1010: 0x3fc (1020) is the constant
; operand of the subtract, and the store is a ds_write2_b32 with offset1:1.
358 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
359 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
361 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
362 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fc, v0
363 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
364 ; CI-NEXT: v_mov_b32_e32 v2, 0
365 ; CI-NEXT: s_mov_b32 m0, -1
366 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
369 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
371 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
372 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fc, v0
373 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
374 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
375 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
376 ; GFX9-NEXT: s_endpgm
378 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
380 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
381 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
382 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
383 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0
384 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
385 ; GFX10-NEXT: s_endpgm
386 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
387 %neg = sub i32 0, %x.i
388 %shl = shl i32 %neg, 2
389 %add = add i32 1020, %shl
390 %ptr = inttoptr i32 %add to i64 addrspace(3)*
391 store i64 123, i64 addrspace(3)* %ptr, align 4
; Intrinsic called by the two kernels whose names end in _clamp_bit; declared
; here without an attribute group.
395 declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
; Attribute groups referenced as #0 and #1 by the declarations, definitions
; and call sites in this file.
397 attributes #0 = { nounwind readnone }
398 attributes #1 = { nounwind }
; NOTE(review): #2 is not referenced by any line visible in this view --
; confirm whether it is still needed.
399 attributes #2 = { nounwind convergent }