1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; Intrinsic that yields the x component of the workitem id; every kernel in
; this file derives its store address from it.
7 declare i32 @llvm.amdgcn.workitem.id.x() #0
; 256 x i32 array in address space 3, used as the base object for the
; GEP-based tests below.
9 @lds.obj = addrspace(3) global [256 x i32] undef, align 4
; Stores 123 to element ((0 - workitem.id.x) + 3) of @lds.obj.
; The assertions for every prefix expect the scaled id to be negated with a
; subtract-from-zero and the constant 3-element (12 byte) displacement to be
; carried in the DS instruction's offset field.
11 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
12 ; CI-LABEL: write_ds_sub0_offset0_global:
13 ; CI: ; %bb.0: ; %entry
14 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
15 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
16 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
17 ; CI-NEXT: s_mov_b32 m0, -1
18 ; CI-NEXT: ds_write_b32 v0, v1 offset:12
21 ; GFX9-LABEL: write_ds_sub0_offset0_global:
22 ; GFX9: ; %bb.0: ; %entry
23 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
24 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
25 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
26 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
29 ; GFX10-LABEL: write_ds_sub0_offset0_global:
30 ; GFX10: ; %bb.0: ; %entry
31 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
32 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
33 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
34 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
35 ; GFX10-NEXT: s_endpgm
37 ; GFX11-LABEL: write_ds_sub0_offset0_global:
38 ; GFX11: ; %bb.0: ; %entry
39 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
40 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
42 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
43 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:12
44 ; GFX11-NEXT: s_endpgm
46 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
47 %sub1 = sub i32 0, %x.i
48 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
49 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
50 store i32 123, ptr addrspace(3) %arrayidx
; Same address computation and store as write_ds_sub0_offset0_global, followed
; by a call to llvm.amdgcn.div.fmas.f32 on the kernel argument (with a constant
; false i1 operand) whose result is stored volatile to a null address-space-1
; pointer.
; The assertions expect the DS write to keep its offset:12 form and expect vcc
; (vcc_lo on GFX10/GFX11) to be set to 0 ahead of v_div_fmas_f32.
54 define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
55 ; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
56 ; CI: ; %bb.0: ; %entry
57 ; CI-NEXT: s_load_dword s0, s[2:3], 0x0
58 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
60 ; CI-NEXT: s_mov_b64 vcc, 0
61 ; CI-NEXT: s_waitcnt lgkmcnt(0)
62 ; CI-NEXT: v_mov_b32_e32 v1, s0
63 ; CI-NEXT: s_mov_b32 s0, 0
64 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
65 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
66 ; CI-NEXT: s_mov_b32 m0, -1
67 ; CI-NEXT: s_mov_b32 s3, 0xf000
68 ; CI-NEXT: s_mov_b32 s2, -1
69 ; CI-NEXT: s_mov_b32 s1, s0
70 ; CI-NEXT: ds_write_b32 v0, v2 offset:12
71 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
72 ; CI-NEXT: s_waitcnt vmcnt(0)
75 ; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
76 ; GFX9: ; %bb.0: ; %entry
77 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
78 ; GFX9-NEXT: s_mov_b64 vcc, 0
79 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
80 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
81 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
82 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
83 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
84 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
85 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
86 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
87 ; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
88 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
89 ; GFX9-NEXT: s_waitcnt vmcnt(0)
92 ; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
93 ; GFX10: ; %bb.0: ; %entry
94 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
95 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
96 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
97 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
98 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
99 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
100 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
101 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
102 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
103 ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
104 ; GFX10-NEXT: global_store_dword v[0:1], v4, off
105 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
106 ; GFX10-NEXT: s_endpgm
108 ; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
109 ; GFX11: ; %bb.0: ; %entry
110 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
111 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
112 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
113 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
114 ; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
115 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0
116 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
117 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
118 ; GFX11-NEXT: ds_store_b32 v2, v3 offset:12
119 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
121 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
122 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
123 ; GFX11-NEXT: s_nop 0
124 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
125 ; GFX11-NEXT: s_endpgm
127 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
128 %sub1 = sub i32 0, %x.i
129 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
130 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
131 store i32 123, ptr addrspace(3) %arrayidx
132 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
133 store volatile float %fmas, ptr addrspace(1) null
; Computes the address of element ((-1 - workitem.id.x) + 16383) of @lds.obj,
; stores 123 to an undef address-space-3 pointer, then calls
; llvm.amdgcn.div.fmas.f32 on the kernel argument and stores the result
; volatile to a null address-space-1 pointer.
;
; The i32 store deliberately does not go through %arrayidx: every assertion
; prefix below expects the DS write to use a base register that was set to 0,
; with no offset field and no shift/sub of v0. A store through %arrayidx would
; make the address depend on the workitem id and could not match these
; assertions.
; NOTE(review): if the store is ever meant to use %arrayidx again, regenerate
; the assertions with utils/update_llc_test_checks.py rather than editing them.
137 define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
138 ; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
140 ; CI-NEXT: s_load_dword s0, s[2:3], 0x0
141 ; CI-NEXT: s_mov_b64 vcc, 0
142 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
143 ; CI-NEXT: v_mov_b32_e32 v2, 0
144 ; CI-NEXT: s_mov_b32 m0, -1
145 ; CI-NEXT: s_waitcnt lgkmcnt(0)
146 ; CI-NEXT: v_mov_b32_e32 v0, s0
147 ; CI-NEXT: v_div_fmas_f32 v0, v0, v0, v0
148 ; CI-NEXT: s_mov_b32 s0, 0
149 ; CI-NEXT: s_mov_b32 s3, 0xf000
150 ; CI-NEXT: s_mov_b32 s2, -1
151 ; CI-NEXT: s_mov_b32 s1, s0
152 ; CI-NEXT: ds_write_b32 v2, v1
153 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
154 ; CI-NEXT: s_waitcnt vmcnt(0)
157 ; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
159 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
160 ; GFX9-NEXT: s_mov_b64 vcc, 0
161 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7b
162 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
163 ; GFX9-NEXT: ds_write_b32 v4, v3
164 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
166 ; GFX9-NEXT: v_div_fmas_f32 v2, v0, v0, v0
167 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
168 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
169 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
171 ; GFX9-NEXT: s_endpgm
173 ; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
175 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
176 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
177 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
178 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
179 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
180 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
181 ; GFX10-NEXT: ds_write_b32 v3, v2
182 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
184 ; GFX10-NEXT: global_store_dword v[0:1], v4, off
185 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
186 ; GFX10-NEXT: s_endpgm
188 ; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
190 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
191 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
192 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
193 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
194 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
195 ; GFX11-NEXT: ds_store_b32 v3, v2
196 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
197 ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0
198 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
199 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
200 ; GFX11-NEXT: s_nop 0
201 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
202 ; GFX11-NEXT: s_endpgm
203 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
204 %sub1 = sub i32 -1, %x.i
205 %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
206 %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
207 store i32 123, ptr addrspace(3) undef
208 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
209 store volatile float %fmas, ptr addrspace(1) null
; Stores the byte 13 to the address (workitem.id.x << 4) + 65535, built with
; integer arithmetic and converted to an address-space-3 pointer via
; zext + inttoptr.
; The assertions expect only the shift to be materialised and the constant
; 65535 to be carried in the DS instruction's offset field.
213 define amdgpu_kernel void @add_x_shl_max_offset() #1 {
214 ; CI-LABEL: add_x_shl_max_offset:
216 ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
217 ; CI-NEXT: v_mov_b32_e32 v1, 13
218 ; CI-NEXT: s_mov_b32 m0, -1
219 ; CI-NEXT: ds_write_b8 v0, v1 offset:65535
222 ; GFX9-LABEL: add_x_shl_max_offset:
224 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
225 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
226 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
227 ; GFX9-NEXT: s_endpgm
229 ; GFX10-LABEL: add_x_shl_max_offset:
231 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
232 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
233 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
234 ; GFX10-NEXT: s_endpgm
236 ; GFX11-LABEL: add_x_shl_max_offset:
238 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
239 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
240 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
241 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535
242 ; GFX11-NEXT: s_endpgm
243 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
244 %shl = shl i32 %x.i, 4
245 %add = add i32 %shl, 65535
246 %z = zext i32 %add to i64
247 %ptr = inttoptr i64 %z to ptr addrspace(3)
248 store i8 13, ptr addrspace(3) %ptr, align 1
252 ; This could use the offset transform, but the sub has been turned into an xor.
; Stores the byte 13 to the address (workitem.id.x * -4) + 65535, converted to
; an address-space-3 pointer via zext + inttoptr.
; The assertions expect the address to be formed as (id << 2) xor 0xffff and
; the DS write to carry no offset field.
254 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
255 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
257 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
258 ; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
259 ; CI-NEXT: v_mov_b32_e32 v1, 13
260 ; CI-NEXT: s_mov_b32 m0, -1
261 ; CI-NEXT: ds_write_b8 v0, v1
264 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
266 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
267 ; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
268 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
269 ; GFX9-NEXT: ds_write_b8 v0, v1
270 ; GFX9-NEXT: s_endpgm
272 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
274 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
275 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
276 ; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
277 ; GFX10-NEXT: ds_write_b8 v0, v1
278 ; GFX10-NEXT: s_endpgm
280 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
282 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
283 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
284 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
285 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
286 ; GFX11-NEXT: ds_store_b8 v0, v1
287 ; GFX11-NEXT: s_endpgm
288 %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
289 %.neg = mul i32 %x.i, -4
290 %add = add i32 %.neg, 65535
291 %z = zext i32 %add to i64
292 %ptr = inttoptr i64 %z to ptr addrspace(3)
293 store i8 13, ptr addrspace(3) %ptr, align 1
297 ; This could use the offset transform, but the sub has been turned into an xor.
; Stores the byte 13 to the address 65535 + ((0 - workitem.id.x) << 2), with
; the constant written as the first add operand, converted directly from i32
; to an address-space-3 pointer.
; The assertions expect the same output as the _alt variant: (id << 2) xor
; 0xffff as the address and no offset field on the DS write.
299 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
300 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
302 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
303 ; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
304 ; CI-NEXT: v_mov_b32_e32 v1, 13
305 ; CI-NEXT: s_mov_b32 m0, -1
306 ; CI-NEXT: ds_write_b8 v0, v1
309 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
311 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
312 ; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
313 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
314 ; GFX9-NEXT: ds_write_b8 v0, v1
315 ; GFX9-NEXT: s_endpgm
317 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
319 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
320 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
321 ; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
322 ; GFX10-NEXT: ds_write_b8 v0, v1
323 ; GFX10-NEXT: s_endpgm
325 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
327 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
328 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
329 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
330 ; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0
331 ; GFX11-NEXT: ds_store_b8 v0, v1
332 ; GFX11-NEXT: s_endpgm
333 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
334 %neg = sub i32 0, %x.i
335 %shl = shl i32 %neg, 2
336 %add = add i32 65535, %shl
337 %ptr = inttoptr i32 %add to ptr addrspace(3)
338 store i8 13, ptr addrspace(3) %ptr
; Stores the byte 13 to the address 65536 + ((0 - workitem.id.x) << 2), i.e.
; the same shape as the _not_canonical variant with the constant raised by one.
; The assertions expect the constant to stay in the subtract (0x10000 - (id <<
; 2)) and the DS write to carry no offset field.
342 define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
343 ; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
345 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
346 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x10000, v0
347 ; CI-NEXT: v_mov_b32_e32 v1, 13
348 ; CI-NEXT: s_mov_b32 m0, -1
349 ; CI-NEXT: ds_write_b8 v0, v1
352 ; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
354 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
355 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x10000, v0
356 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
357 ; GFX9-NEXT: ds_write_b8 v0, v1
358 ; GFX9-NEXT: s_endpgm
360 ; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
362 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
363 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
364 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
365 ; GFX10-NEXT: ds_write_b8 v0, v1
366 ; GFX10-NEXT: s_endpgm
368 ; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
370 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
371 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
372 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
374 ; GFX11-NEXT: ds_store_b8 v0, v1
375 ; GFX11-NEXT: s_endpgm
376 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
377 %neg = sub i32 0, %x.i
378 %shl = shl i32 %neg, 2
379 %add = add i32 65536, %shl
380 %ptr = inttoptr i32 %add to ptr addrspace(3)
381 store i8 13, ptr addrspace(3) %ptr
; Uses one negated, shifted workitem id ((0 - id) << 2) as the base of two
; different addresses, base + 123 and base + 456, and stores 13 (volatile i32)
; to each.
; The assertions expect a single subtract-from-zero shared by both DS writes,
; with 123 and 456 carried in the respective offset fields.
385 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
386 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
388 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
389 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
390 ; CI-NEXT: v_mov_b32_e32 v1, 13
391 ; CI-NEXT: s_mov_b32 m0, -1
392 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
393 ; CI-NEXT: ds_write_b32 v0, v1 offset:456
396 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
398 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
399 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
400 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
401 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
402 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:456
403 ; GFX9-NEXT: s_endpgm
405 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
407 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
408 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
409 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
410 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
411 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
412 ; GFX10-NEXT: s_endpgm
414 ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
416 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
417 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
418 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
419 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
420 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
421 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:456
422 ; GFX11-NEXT: s_endpgm
423 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
424 %neg = sub i32 0, %x.i
425 %shl = shl i32 %neg, 2
426 %add0 = add i32 123, %shl
427 %add1 = add i32 456, %shl
428 %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
429 store volatile i32 13, ptr addrspace(3) %ptr0
430 %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
431 store volatile i32 13, ptr addrspace(3) %ptr1
; Computes a single address 123 + ((0 - workitem.id.x) << 2) and stores 13
; (volatile i32) through it twice.
; The assertions expect one subtract-from-zero and two DS writes that both use
; offset:123.
435 define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
436 ; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
438 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
439 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
440 ; CI-NEXT: v_mov_b32_e32 v1, 13
441 ; CI-NEXT: s_mov_b32 m0, -1
442 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
443 ; CI-NEXT: ds_write_b32 v0, v1 offset:123
446 ; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
448 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
449 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
450 ; GFX9-NEXT: v_mov_b32_e32 v1, 13
451 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
452 ; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
453 ; GFX9-NEXT: s_endpgm
455 ; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
457 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
458 ; GFX10-NEXT: v_mov_b32_e32 v1, 13
459 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
460 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
461 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
462 ; GFX10-NEXT: s_endpgm
464 ; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
466 ; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
467 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
468 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
469 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
470 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
471 ; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
472 ; GFX11-NEXT: s_endpgm
473 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
474 %neg = sub i32 0, %x.i
475 %shl = shl i32 %neg, 2
476 %add = add i32 123, %shl
477 %ptr = inttoptr i32 %add to ptr addrspace(3)
478 store volatile i32 13, ptr addrspace(3) %ptr
479 store volatile i32 13, ptr addrspace(3) %ptr
; Stores the i64 value 123 with 4-byte alignment to the address
; 1019 + ((0 - workitem.id.x) << 2).
; Expected output differs by target:
;   - CI, GFX9 and GFX11 keep the constant in the subtract (0x3fb - (id << 2))
;     and use one two-dword DS write with offset1:1.
;   - GFX10 subtracts from zero and emits two single-dword DS writes with
;     offsets 1019 (low dword, 0x7b) and 1023 (high dword, 0).
483 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
484 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
486 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
487 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
488 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
489 ; CI-NEXT: v_mov_b32_e32 v2, 0
490 ; CI-NEXT: s_mov_b32 m0, -1
491 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
494 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
496 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
497 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
498 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
499 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
500 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
501 ; GFX9-NEXT: s_endpgm
503 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
505 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
506 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
507 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
508 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
509 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023
510 ; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019
511 ; GFX10-NEXT: s_endpgm
513 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
515 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
516 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
517 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
518 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
519 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0
520 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
521 ; GFX11-NEXT: s_endpgm
522 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
523 %neg = sub i32 0, %x.i
524 %shl = shl i32 %neg, 2
525 %add = add i32 1019, %shl
526 %ptr = inttoptr i32 %add to ptr addrspace(3)
527 store i64 123, ptr addrspace(3) %ptr, align 4
; Same 4-byte-aligned i64 store to 1019 + ((0 - workitem.id.x) << 2) as
; add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed by a call to
; llvm.amdgcn.div.fmas.f32 on the kernel argument (with a constant false i1
; operand) whose result is stored volatile to a null address-space-1 pointer.
; The assertions expect the same per-target DS write forms as the variant
; without the div.fmas call, with vcc (vcc_lo on GFX10/GFX11) set to 0 ahead of
; v_div_fmas_f32.
531 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
532 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
534 ; CI-NEXT: s_load_dword s0, s[2:3], 0x0
535 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
536 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
537 ; CI-NEXT: s_mov_b64 vcc, 0
538 ; CI-NEXT: s_waitcnt lgkmcnt(0)
539 ; CI-NEXT: v_mov_b32_e32 v1, s0
540 ; CI-NEXT: s_mov_b32 s0, 0
541 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
542 ; CI-NEXT: v_mov_b32_e32 v2, 0x7b
543 ; CI-NEXT: v_mov_b32_e32 v3, 0
544 ; CI-NEXT: s_mov_b32 m0, -1
545 ; CI-NEXT: s_mov_b32 s3, 0xf000
546 ; CI-NEXT: s_mov_b32 s2, -1
547 ; CI-NEXT: s_mov_b32 s1, s0
548 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
549 ; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
550 ; CI-NEXT: s_waitcnt vmcnt(0)
553 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
555 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
556 ; GFX9-NEXT: s_mov_b64 vcc, 0
557 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
558 ; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
559 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
560 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
562 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
563 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
564 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
565 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
566 ; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
567 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
568 ; GFX9-NEXT: s_waitcnt vmcnt(0)
569 ; GFX9-NEXT: s_endpgm
571 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
573 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
574 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
575 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0
576 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
577 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x7b
578 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
579 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
580 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
581 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:1023
582 ; GFX10-NEXT: ds_write_b32 v2, v4 offset:1019
583 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
585 ; GFX10-NEXT: global_store_dword v[0:1], v5, off
586 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
587 ; GFX10-NEXT: s_endpgm
589 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
591 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
592 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
593 ; GFX11-NEXT: s_mov_b32 vcc_lo, 0
594 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b
595 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
596 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
597 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0
598 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
599 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
600 ; GFX11-NEXT: ds_store_2addr_b32 v2, v3, v4 offset1:1
601 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
602 ; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0
603 ; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc
604 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
605 ; GFX11-NEXT: s_nop 0
606 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
607 ; GFX11-NEXT: s_endpgm
608 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
609 %neg = sub i32 0, %x.i
610 %shl = shl i32 %neg, 2
611 %add = add i32 1019, %shl
612 %ptr = inttoptr i32 %add to ptr addrspace(3)
613 store i64 123, ptr addrspace(3) %ptr, align 4
614 %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
615 store volatile float %fmas, ptr addrspace(1) null
; Stores the i64 value 123 with 4-byte alignment to the address
; 1020 + ((0 - workitem.id.x) << 2), i.e. the misaligned_i64_max_offset test
; with the constant raised by one.
; Expected output differs by target:
;   - CI, GFX9 and GFX11 keep the constant in the subtract (0x3fc - (id << 2))
;     and use one two-dword DS write with offset1:1.
;   - GFX10 subtracts from zero, adds 0x200, and uses one two-dword DS write
;     with offset0:127 offset1:128.
619 define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
620 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
622 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
623 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fc, v0
624 ; CI-NEXT: v_mov_b32_e32 v1, 0x7b
625 ; CI-NEXT: v_mov_b32_e32 v2, 0
626 ; CI-NEXT: s_mov_b32 m0, -1
627 ; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
630 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
632 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
633 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fc, v0
634 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
635 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
636 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
637 ; GFX9-NEXT: s_endpgm
639 ; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
641 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
642 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
643 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
644 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
645 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
646 ; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
647 ; GFX10-NEXT: s_endpgm
649 ; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
651 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
652 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
653 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
654 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0
656 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1
657 ; GFX11-NEXT: s_endpgm
658 %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
659 %neg = sub i32 0, %x.i
660 %shl = shl i32 %neg, 2
661 %add = add i32 1020, %shl
662 %ptr = inttoptr i32 %add to ptr addrspace(3)
663 store i64 123, ptr addrspace(3) %ptr, align 4
; Intrinsic called by the *_clamp_bit kernels; its last operand is an i1.
667 declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
; Attribute groups referenced by the declarations, definitions and call sites
; above.
669 attributes #0 = { nounwind readnone }
670 attributes #1 = { nounwind }
; NOTE(review): #2 is not referenced by any line visible in this file section.
671 attributes #2 = { nounwind convergent }