; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

declare i32 @llvm.amdgcn.workitem.id.x() #0

@lds.obj = addrspace(3) global [256 x i32] undef, align 4
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:12
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:12
; GFX11-NEXT: s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  ret void
}

define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v0, v2 offset:12
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_b32 v2, v3 offset:12
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_div_fmas_f32 v0, v0, v0, v0
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v2, v1
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: ds_write_b32 v4, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v0, v0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_b32 v3, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1 offset:65535
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  %add = add i32 %shl, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; this could have the offset transform, but sub became xor
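; (Sketch of why, not checked output: workitem.id.x fits in 10 bits, so the
; address here is 0xffff - (id << 2) with id << 2 at most 0xffc. Subtracting
; from the all-ones constant 0xffff never borrows, so 0xffff - y == 0xffff ^ y,
; and the combiner appears to prefer the xor form, which the DS base-plus-offset
; matcher does not treat as a sub.)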
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: ds_store_b8 v0, v1
; GFX11-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %.neg = mul i32 %x.i, -4
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; this could have the offset transform, but sub became xor
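; (Same arithmetic as the previous test, just written as sub+shl rather than a
; mul by -4; the value is still 0xffff - (id << 2), so the same xor rewrite
; shows up in the checks below.)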
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: ds_store_b8 v0, v1
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX11-NEXT: ds_store_b8 v0, v1
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: ds_write_b32 v0, v1 offset:456
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: ds_write_b32 v0, v1 offset:456
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:456
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr0
  %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr1
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr
  store volatile i32 13, ptr addrspace(3) %ptr
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0
; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT: ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v5, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_store_2addr_b32 v2, v3, v4 offset1:1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0
; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind convergent }