1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; Intrinsic whose result every kernel below uses as the variable part of the
; store address.
7 declare i32 @llvm.amdgcn.workitem.id.x() #0
; 256 x i32 array in addrspace(3), 4-byte aligned, with an undef initializer.
; The first three kernels index into it with getelementptr.
9 @lds.obj = addrspace(3) global [256 x i32] undef, align 4
; Kernel under test. Stores 123 to @lds.obj at element index
; (0 - workitem.id.x), then 3 further i32 elements on (12 bytes).
; Every check prefix expects the id to be scaled with a shift by 2 and then
; subtracted from 0, and the constant 12 to sit in the DS store's offset field.
11 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
12 ; CI-LABEL: write_ds_sub0_offset0_global:
13 ; CI: ; %bb.0: ; %entry
14 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
15 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
16 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
17 ; CI-NEXT: s_mov_b32 m0, -1
18 ; CI-NEXT: ds_write_b32 v0, v1 offset:12
21 ; GFX9-LABEL: write_ds_sub0_offset0_global:
22 ; GFX9: ; %bb.0: ; %entry
23 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
24 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
25 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
26 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
29 ; GFX10-LABEL: write_ds_sub0_offset0_global:
30 ; GFX10: ; %bb.0: ; %entry
31 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
32 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
33 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
34 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
35 ; GFX10-NEXT: s_endpgm
37 ; GFX11-LABEL: write_ds_sub0_offset0_global:
38 ; GFX11: ; %bb.0: ; %entry
39 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
40 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
41 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
42 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:12
43 ; GFX11-NEXT: s_endpgm
45 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
46 %sub1 = sub i32 0, %x.i
47 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
48 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
49 store i32 123, ptr addrspace(3) %arrayidx
; Same LDS store as @write_ds_sub0_offset0_global, followed by a div.fmas on
; the kernel argument whose result is stored (volatile) to a null addrspace(1)
; pointer. The checks still expect offset 12 on the DS store, and they show
; VCC being zeroed ahead of v_div_fmas_f32.
; NOTE(review): what "clamp_bit" in the name guards against is not visible
; here; presumably a past miscompile involving the subtract. Confirm.
53 define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
54 ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
55 ; CI: ; %bb.0: ; %entry
56 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
57 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
58 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
59 ; CI-NEXT: s_mov_b64 vcc, 0
60 ; CI-NEXT: s_waitcnt lgkmcnt(0)
61 ; CI-NEXT: v_mov_b32_e32 v1, s0
62 ; CI-NEXT: s_mov_b32 s0, 0
63 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
64 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
65 ; CI-NEXT: s_mov_b32 m0, -1
66 ; CI-NEXT: s_mov_b32 s3, 0xf000
67 ; CI-NEXT: s_mov_b32 s2, -1
68 ; CI-NEXT: s_mov_b32 s1, s0
69 ; CI-NEXT: ds_write_b32 v0, v2 offset:12
70 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
71 ; CI-NEXT: s_waitcnt vmcnt(0)
74 ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
75 ; GFX9: ; %bb.0: ; %entry
76 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
77 ; GFX9-NEXT: s_mov_b64 vcc, 0
78 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
79 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
80 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
81 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
83 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
84 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
85 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
86 ; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
87 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
88 ; GFX9-NEXT: s_waitcnt vmcnt(0)
91 ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
92 ; GFX10: ; %bb.0: ; %entry
93 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
94 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
95 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
96 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
97 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
98 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
99 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
100 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
101 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
103 ; GFX10-NEXT: global_store_dword v[0:1], v4, off
104 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
105 ; GFX10-NEXT: s_endpgm
107 ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
108 ; GFX11: ; %bb.0: ; %entry
109 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
110 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
111 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
112 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
113 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0
114 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
115 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
116 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:12
117 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
119 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
120 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
121 ; GFX11-NEXT: s_nop 0
122 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
123 ; GFX11-NEXT: s_endpgm
125 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
126 %sub1 = sub i32 0, %x.i
127 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
128 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
129 store i32 123, ptr addrspace(3) %arrayidx
130 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
131 store volatile float %fmas, ptr addrspace(1) null
; Variant with element index (-1 - workitem.id.x) plus a constant 16383
; elements. The checks expect the -1 - x to become v_not_b32, the scaling to be
; a shift by 2, and 16383 * 4 = 65532 to be folded into the DS store's offset
; field. The tail is the same div.fmas plus volatile global store as in
; @write_ds_sub0_offset0_global_clamp_bit.
135 define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
136 ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
138 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
139 ; CI-NEXT: s_mov_b64 vcc, 0
140 ; CI-NEXT: v_not_b32_e32 v0, v0
141 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
142 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
143 ; CI-NEXT: s_waitcnt lgkmcnt(0)
144 ; CI-NEXT: v_mov_b32_e32 v1, s0
145 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
146 ; CI-NEXT: s_mov_b32 s0, 0
147 ; CI-NEXT: s_mov_b32 m0, -1
148 ; CI-NEXT: s_mov_b32 s3, 0xf000
149 ; CI-NEXT: s_mov_b32 s2, -1
150 ; CI-NEXT: s_mov_b32 s1, s0
151 ; CI-NEXT: ds_write_b32 v0, v2 offset:65532
152 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
153 ; CI-NEXT: s_waitcnt vmcnt(0)
156 ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
158 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
159 ; GFX9-NEXT: s_mov_b64 vcc, 0
160 ; GFX9-NEXT: v_not_b32_e32 v0, v0
161 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 2, v0
162 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
163 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
164 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
165 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
166 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
167 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
168 ; GFX9-NEXT: ds_write_b32 v3, v4 offset:65532
169 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
171 ; GFX9-NEXT: s_endpgm
173 ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
175 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
176 ; GFX10-NEXT: v_not_b32_e32 v0, v0
177 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
178 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
179 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
180 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
181 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
182 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:65532
183 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
185 ; GFX10-NEXT: global_store_dword v[0:1], v4, off
186 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
187 ; GFX10-NEXT: s_endpgm
189 ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
191 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
192 ; GFX11-NEXT: v_not_b32_e32 v0, v0
193 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
194 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
195 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0
196 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
197 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
198 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:65532
199 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
201 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
202 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
203 ; GFX11-NEXT: s_nop 0
204 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
205 ; GFX11-NEXT: s_endpgm
206 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
207 %sub1 = sub i32 -1, %x.i
208 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
209 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
210 store i32 123, ptr addrspace(3) %arrayidx
211 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
212 store volatile float %fmas, ptr addrspace(1) null
; Address is (workitem.id.x << 4) + 65535, built with integer ops and inttoptr
; rather than a GEP. The checks expect only the shift to remain as a VALU op,
; with 65535 carried in the byte store's offset field.
216 define amdgpu_kernel void @add_x_shl_max_offset() #1 {
217 ; CI-LABEL: add_x_shl_max_offset:
219 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
220 ; CI-NEXT: v_mov_b32_e32 v1, 13
221 ; CI-NEXT: s_mov_b32 m0, -1
222 ; CI-NEXT: ds_write_b8 v0, v1 offset:65535
225 ; GFX9-LABEL: add_x_shl_max_offset:
227 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
228 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
229 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
230 ; GFX9-NEXT: s_endpgm
232 ; GFX10-LABEL: add_x_shl_max_offset:
234 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
235 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
236 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
237 ; GFX10-NEXT: s_endpgm
239 ; GFX11-LABEL: add_x_shl_max_offset:
241 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0
242 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
243 ; GFX11-NEXT: s_endpgm
244 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
245 %shl = shl i32 %x.i, 4
246 %add = add i32 %shl, 65535
247 %z = zext i32 %add to i64
248 %ptr = inttoptr i64 %z to ptr addrspace(3)
249 store i8 13, ptr addrspace(3) %ptr, align 1
253 ; this could have the offset transform, but sub became xor
; Address is (workitem.id.x * -4) + 65535. The checks expect a shift by 2
; followed by v_xor_b32 with 0xffff and a DS byte store with no offset field,
; i.e. the constant is not folded into the store instruction.
255 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
256 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
258 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
259 ; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
260 ; CI-NEXT: v_mov_b32_e32 v1, 13
261 ; CI-NEXT: s_mov_b32 m0, -1
262 ; CI-NEXT: ds_write_b8 v0, v1
265 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
267 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
268 ; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
269 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
270 ; GFX9-NEXT: ds_write_b8 v0, v1
271 ; GFX9-NEXT: s_endpgm
273 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
275 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
276 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
277 ; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
278 ; GFX10-NEXT: ds_write_b8 v0, v1
279 ; GFX10-NEXT: s_endpgm
281 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
283 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
284 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
285 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
286 ; GFX11-NEXT: ds_store_b8 v0, v1
287 ; GFX11-NEXT: s_endpgm
288 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
289 %.neg = mul i32 %x.i, -4
290 %add = add i32 %.neg, 65535
291 %z = zext i32 %add to i64
292 %ptr = inttoptr i64 %z to ptr addrspace(3)
293 store i8 13, ptr addrspace(3) %ptr, align 1
297 ; this could have the offset transform, but sub became xor
; Same address as @add_x_shl_neg_to_sub_max_offset_alt, but spelled as
; 65535 + ((0 - workitem.id.x) << 2) with the constant as the first add
; operand and an i32 inttoptr. The expected code is identical, a shift then an
; xor with 0xffff, with no offset on the store.
299 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
300 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
302 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
303 ; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
304 ; CI-NEXT: v_mov_b32_e32 v1, 13
305 ; CI-NEXT: s_mov_b32 m0, -1
306 ; CI-NEXT: ds_write_b8 v0, v1
309 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
311 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
312 ; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
313 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
314 ; GFX9-NEXT: ds_write_b8 v0, v1
315 ; GFX9-NEXT: s_endpgm
317 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
319 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
320 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
321 ; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
322 ; GFX10-NEXT: ds_write_b8 v0, v1
323 ; GFX10-NEXT: s_endpgm
325 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
327 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
328 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
329 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
330 ; GFX11-NEXT: ds_store_b8 v0, v1
331 ; GFX11-NEXT: s_endpgm
332 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
333 %neg = sub i32 0, %x.i
334 %shl = shl i32 %neg, 2
335 %add = add i32 65535, %shl
336 %ptr = inttoptr i32 %add to ptr addrspace(3)
337 store i8 13, ptr addrspace(3) %ptr
; Address is 65536 + ((0 - workitem.id.x) << 2). The constant is one more than
; the 65535 used by @add_x_shl_max_offset. The checks expect it as the literal
; 0x10000 in the subtract, and the DS byte store to carry no offset.
341 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
342 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
344 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
345 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x10000, v0
346 ; CI-NEXT: v_mov_b32_e32 v1, 13
347 ; CI-NEXT: s_mov_b32 m0, -1
348 ; CI-NEXT: ds_write_b8 v0, v1
351 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
353 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
354 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x10000, v0
355 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
356 ; GFX9-NEXT: ds_write_b8 v0, v1
357 ; GFX9-NEXT: s_endpgm
359 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
361 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
362 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
363 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
364 ; GFX10-NEXT: ds_write_b8 v0, v1
365 ; GFX10-NEXT: s_endpgm
367 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
369 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
370 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
371 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
372 ; GFX11-NEXT: ds_store_b8 v0, v1
373 ; GFX11-NEXT: s_endpgm
374 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
375 %neg = sub i32 0, %x.i
376 %shl = shl i32 %neg, 2
377 %add = add i32 65536, %shl
378 %ptr = inttoptr i32 %add to ptr addrspace(3)
379 store i8 13, ptr addrspace(3) %ptr
; The negated, shifted id feeds two adds (123 and 456), each used by one
; volatile i32 store. The checks expect a single subtract from 0 shared by both
; stores, with 123 and 456 as the two offset fields.
383 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
384 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
386 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
387 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
388 ; CI-NEXT: v_mov_b32_e32 v1, 13
389 ; CI-NEXT: s_mov_b32 m0, -1
390 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
391 ; CI-NEXT: ds_write_b32 v0, v1 offset:456
394 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
396 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
397 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
398 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
399 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
400 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:456
401 ; GFX9-NEXT: s_endpgm
403 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
405 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
406 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
407 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
408 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
409 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
410 ; GFX10-NEXT: s_endpgm
412 ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
414 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
415 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
416 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
417 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
418 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456
419 ; GFX11-NEXT: s_endpgm
420 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
421 %neg = sub i32 0, %x.i
422 %shl = shl i32 %neg, 2
423 %add0 = add i32 123, %shl
424 %add1 = add i32 456, %shl
425 %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
426 store volatile i32 13, ptr addrspace(3) %ptr0
427 %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
428 store volatile i32 13, ptr addrspace(3) %ptr1
; One address, ((0 - workitem.id.x) << 2) plus 123, is stored to twice with
; volatile i32 stores. The checks expect one subtract from 0 and two DS stores,
; both with offset 123.
432 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
433 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
435 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
436 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
437 ; CI-NEXT: v_mov_b32_e32 v1, 13
438 ; CI-NEXT: s_mov_b32 m0, -1
439 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
440 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
443 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
445 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
446 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
447 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
448 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
449 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
450 ; GFX9-NEXT: s_endpgm
452 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
454 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
455 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
456 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
457 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
458 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
459 ; GFX10-NEXT: s_endpgm
461 ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
463 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
464 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
465 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
466 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
467 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
468 ; GFX11-NEXT: s_endpgm
469 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
470 %neg = sub i32 0, %x.i
471 %shl = shl i32 %neg, 2
472 %add = add i32 123, %shl
473 %ptr = inttoptr i32 %add to ptr addrspace(3)
474 store volatile i32 13, ptr addrspace(3) %ptr
475 store volatile i32 13, ptr addrspace(3) %ptr
; Stores an i64 (value 123) with only 4-byte alignment at
; 1019 + ((0 - workitem.id.x) << 2).
; The CI, GFX9 and GFX11 checks expect 1019 (0x3fb) inside the subtract and one
; two-dword store with offset1 set to 1. The GFX10 checks instead expect a
; subtract from 0 and two separate dword stores at offsets 1019 and 1023.
479 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
480 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
482 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
483 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
484 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
485 ; CI-NEXT: v_mov_b32_e32 v2, 0
486 ; CI-NEXT: s_mov_b32 m0, -1
487 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
490 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
492 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
493 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
494 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
495 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
496 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
497 ; GFX9-NEXT: s_endpgm
499 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
501 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
502 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
503 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
504 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
505 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023
506 ; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019
507 ; GFX10-NEXT: s_endpgm
509 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
511 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
512 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
513 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
514 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0
515 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
516 ; GFX11-NEXT: s_endpgm
517 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
518 %neg = sub i32 0, %x.i
519 %shl = shl i32 %neg, 2
520 %add = add i32 1019, %shl
521 %ptr = inttoptr i32 %add to ptr addrspace(3)
522 store i64 123, ptr addrspace(3) %ptr, align 4
; Same misaligned i64 store as
; @add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed by a div.fmas on
; the kernel argument whose result is stored (volatile) to a null addrspace(1)
; pointer. The expected DS code matches that kernel, with the GFX10 checks
; again using two separate dword stores.
; NOTE(review): the intent behind "clamp_bit" in the name is not visible here;
; confirm against the change that introduced this test.
526 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
527 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
529 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
530 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
531 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
532 ; CI-NEXT: s_mov_b64 vcc, 0
533 ; CI-NEXT: s_waitcnt lgkmcnt(0)
534 ; CI-NEXT: v_mov_b32_e32 v1, s0
535 ; CI-NEXT: s_mov_b32 s0, 0
536 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
537 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
538 ; CI-NEXT: v_mov_b32_e32 v3, 0
539 ; CI-NEXT: s_mov_b32 m0, -1
540 ; CI-NEXT: s_mov_b32 s3, 0xf000
541 ; CI-NEXT: s_mov_b32 s2, -1
542 ; CI-NEXT: s_mov_b32 s1, s0
543 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
544 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
545 ; CI-NEXT: s_waitcnt vmcnt(0)
548 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
550 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
551 ; GFX9-NEXT: s_mov_b64 vcc, 0
552 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
553 ; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
554 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
555 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
556 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
557 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
558 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
559 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
560 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
561 ; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
562 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
563 ; GFX9-NEXT: s_waitcnt vmcnt(0)
564 ; GFX9-NEXT: s_endpgm
566 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
568 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
569 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
570 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
571 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
572 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7b
573 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
574 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
575 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
576 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:1023
577 ; GFX10-NEXT: ds_write_b32 v2, v4 offset:1019
578 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
580 ; GFX10-NEXT: global_store_dword v[0:1], v5, off
581 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
582 ; GFX10-NEXT: s_endpgm
584 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
586 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
587 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
588 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
589 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
590 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
591 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
592 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
593 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
594 ; GFX11-NEXT: ds_store_2addr_b32 v2, v3, v4 offset1:1
595 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
596 ; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0
597 ; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc
598 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
599 ; GFX11-NEXT: s_nop 0
600 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
601 ; GFX11-NEXT: s_endpgm
602 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
603 %neg = sub i32 0, %x.i
604 %shl = shl i32 %neg, 2
605 %add = add i32 1019, %shl
606 %ptr = inttoptr i32 %add to ptr addrspace(3)
607 store i64 123, ptr addrspace(3) %ptr, align 4
608 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
609 store volatile float %fmas, ptr addrspace(1) null
; As @add_x_shl_neg_to_sub_misaligned_i64_max_offset but with constant 1020.
; The CI, GFX9 and GFX11 checks expect 0x3fc inside the subtract and one
; two-dword store with offset1 set to 1. The GFX10 checks expect a subtract
; from 0, an add of 0x200, then one two-dword store with offset0 127 and
; offset1 128.
613 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
614 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
616 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
617 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fc, v0
618 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
619 ; CI-NEXT: v_mov_b32_e32 v2, 0
620 ; CI-NEXT: s_mov_b32 m0, -1
621 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
624 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
626 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
627 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fc, v0
628 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
629 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
630 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
631 ; GFX9-NEXT: s_endpgm
633 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
635 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
636 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
637 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
638 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
639 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
640 ; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
641 ; GFX10-NEXT: s_endpgm
643 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
645 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
646 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
647 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
648 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0
649 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
650 ; GFX11-NEXT: s_endpgm
651 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
652 %neg = sub i32 0, %x.i
653 %shl = shl i32 %neg, 2
654 %add = add i32 1020, %shl
655 %ptr = inttoptr i32 %add to ptr addrspace(3)
656 store i64 123, ptr addrspace(3) %ptr, align 4
; Called by the three *_clamp_bit kernels above.
660 declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
; Attribute groups #0 and #1 are referenced by the definitions and call sites
; above.
662 attributes #0 = { nounwind readnone }
663 attributes #1 = { nounwind }
; NOTE(review): #2 has no user in the visible part of this file.
664 attributes #2 = { nounwind convergent }