1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
5 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s
7 ; Test using saddr addressing mode of global_* flat atomic instructions.
; i32 xchg whose result is unused. %sbase is an inreg pointer and %voffset is
; zero-extended before the byte GEP. Every checked target is expected to use the
; saddr form (s[2:3] base plus v0 offset) of the swap with no destination
; register.
9 define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
10 ; GFX9-LABEL: global_xchg_saddr_i32_nortn:
12 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3]
13 ; GFX9-NEXT: s_waitcnt vmcnt(0)
14 ; GFX9-NEXT: buffer_wbinvl1
17 ; GFX10-LABEL: global_xchg_saddr_i32_nortn:
19 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3]
20 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
21 ; GFX10-NEXT: buffer_gl1_inv
22 ; GFX10-NEXT: buffer_gl0_inv
23 ; GFX10-NEXT: s_endpgm
25 ; GFX11-LABEL: global_xchg_saddr_i32_nortn:
27 ; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3]
28 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
29 ; GFX11-NEXT: buffer_gl1_inv
30 ; GFX11-NEXT: buffer_gl0_inv
31 ; GFX11-NEXT: s_endpgm
33 ; GFX12-LABEL: global_xchg_saddr_i32_nortn:
35 ; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV
36 ; GFX12-NEXT: s_wait_storecnt 0x0
37 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
38 ; GFX12-NEXT: s_endpgm
39 %zext.offset = zext i32 %voffset to i64
40 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
41 %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
45 ; Maximum positive offset on gfx10
; The constant 2047 added by the second GEP is expected to fold into the
; instruction as offset:2047 on every checked target, keeping the saddr form.
46 define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
47 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
49 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047
50 ; GFX9-NEXT: s_waitcnt vmcnt(0)
51 ; GFX9-NEXT: buffer_wbinvl1
54 ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
56 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047
57 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
58 ; GFX10-NEXT: buffer_gl1_inv
59 ; GFX10-NEXT: buffer_gl0_inv
60 ; GFX10-NEXT: s_endpgm
62 ; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
64 ; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047
65 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
66 ; GFX11-NEXT: buffer_gl1_inv
67 ; GFX11-NEXT: buffer_gl0_inv
68 ; GFX11-NEXT: s_endpgm
70 ; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
72 ; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV
73 ; GFX12-NEXT: s_wait_storecnt 0x0
74 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
75 ; GFX12-NEXT: s_endpgm
76 %zext.offset = zext i32 %voffset to i64
77 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
78 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
79 %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
83 ; Maximum negative offset on gfx10
; The constant -2048 added by the second GEP is expected to fold into the
; instruction as offset:-2048 on every checked target, keeping the saddr form.
84 define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
85 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
87 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048
88 ; GFX9-NEXT: s_waitcnt vmcnt(0)
89 ; GFX9-NEXT: buffer_wbinvl1
92 ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
94 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048
95 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
96 ; GFX10-NEXT: buffer_gl1_inv
97 ; GFX10-NEXT: buffer_gl0_inv
98 ; GFX10-NEXT: s_endpgm
100 ; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
102 ; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048
103 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
104 ; GFX11-NEXT: buffer_gl1_inv
105 ; GFX11-NEXT: buffer_gl0_inv
106 ; GFX11-NEXT: s_endpgm
108 ; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
110 ; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV
111 ; GFX12-NEXT: s_wait_storecnt 0x0
112 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
113 ; GFX12-NEXT: s_endpgm
114 %zext.offset = zext i32 %voffset to i64
115 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
116 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
117 %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; Returning i32 xchg: the old value is bitcast to float. The checks expect the
; saddr form with a v0 destination, marked glc on gfx9/gfx10/gfx11 and
; th:TH_ATOMIC_RETURN on gfx12.
121 define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
122 ; GFX9-LABEL: global_xchg_saddr_i32_rtn:
124 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc
125 ; GFX9-NEXT: s_waitcnt vmcnt(0)
126 ; GFX9-NEXT: buffer_wbinvl1
127 ; GFX9-NEXT: ; return to shader part epilog
129 ; GFX10-LABEL: global_xchg_saddr_i32_rtn:
131 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc
132 ; GFX10-NEXT: s_waitcnt vmcnt(0)
133 ; GFX10-NEXT: buffer_gl1_inv
134 ; GFX10-NEXT: buffer_gl0_inv
135 ; GFX10-NEXT: ; return to shader part epilog
137 ; GFX11-LABEL: global_xchg_saddr_i32_rtn:
139 ; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] glc
140 ; GFX11-NEXT: s_waitcnt vmcnt(0)
141 ; GFX11-NEXT: buffer_gl1_inv
142 ; GFX11-NEXT: buffer_gl0_inv
143 ; GFX11-NEXT: ; return to shader part epilog
145 ; GFX12-LABEL: global_xchg_saddr_i32_rtn:
147 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
148 ; GFX12-NEXT: s_wait_loadcnt 0x0
149 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
150 ; GFX12-NEXT: ; return to shader part epilog
151 %zext.offset = zext i32 %voffset to i64
152 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
153 %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
154 %cast.rtn = bitcast i32 %rtn to float
; Returning i32 xchg with a constant offset of 2048. The gfx9, gfx11 and gfx12
; checks expect it folded as offset:2048 in the saddr form. The gfx10 checks
; instead expect the full address to be built with v_add_co_u32 /
; v_add_co_ci_u32 pairs (including the + 0x800) and the atomic to take a VGPR
; address v[2:3] with "off" in place of the SGPR base.
158 define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
159 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048:
161 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:2048 glc
162 ; GFX9-NEXT: s_waitcnt vmcnt(0)
163 ; GFX9-NEXT: buffer_wbinvl1
164 ; GFX9-NEXT: ; return to shader part epilog
166 ; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048:
168 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
169 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
170 ; GFX10-NEXT: v_add_co_u32 v2, vcc, 0x800, v0
171 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
172 ; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
173 ; GFX10-NEXT: s_waitcnt vmcnt(0)
174 ; GFX10-NEXT: buffer_gl1_inv
175 ; GFX10-NEXT: buffer_gl0_inv
176 ; GFX10-NEXT: ; return to shader part epilog
178 ; GFX11-LABEL: global_xchg_saddr_i32_rtn_2048:
180 ; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 glc
181 ; GFX11-NEXT: s_waitcnt vmcnt(0)
182 ; GFX11-NEXT: buffer_gl1_inv
183 ; GFX11-NEXT: buffer_gl0_inv
184 ; GFX11-NEXT: ; return to shader part epilog
186 ; GFX12-LABEL: global_xchg_saddr_i32_rtn_2048:
188 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
189 ; GFX12-NEXT: s_wait_loadcnt 0x0
190 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
191 ; GFX12-NEXT: ; return to shader part epilog
192 %zext.offset = zext i32 %voffset to i64
193 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
194 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
195 %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
196 %cast.rtn = bitcast i32 %rtn to float
; Returning i32 xchg with a constant offset of -2048. All four checked targets
; are expected to fold it as offset:-2048 while keeping the saddr form.
200 define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
201 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048:
203 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc
204 ; GFX9-NEXT: s_waitcnt vmcnt(0)
205 ; GFX9-NEXT: buffer_wbinvl1
206 ; GFX9-NEXT: ; return to shader part epilog
208 ; GFX10-LABEL: global_xchg_saddr_i32_rtn_neg2048:
210 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc
211 ; GFX10-NEXT: s_waitcnt vmcnt(0)
212 ; GFX10-NEXT: buffer_gl1_inv
213 ; GFX10-NEXT: buffer_gl0_inv
214 ; GFX10-NEXT: ; return to shader part epilog
216 ; GFX11-LABEL: global_xchg_saddr_i32_rtn_neg2048:
218 ; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 glc
219 ; GFX11-NEXT: s_waitcnt vmcnt(0)
220 ; GFX11-NEXT: buffer_gl1_inv
221 ; GFX11-NEXT: buffer_gl0_inv
222 ; GFX11-NEXT: ; return to shader part epilog
224 ; GFX12-LABEL: global_xchg_saddr_i32_rtn_neg2048:
226 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
227 ; GFX12-NEXT: s_wait_loadcnt 0x0
228 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
229 ; GFX12-NEXT: ; return to shader part epilog
230 %zext.offset = zext i32 %voffset to i64
231 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
232 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
233 %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
234 %cast.rtn = bitcast i32 %rtn to float
238 ; --------------------------------------------------------------------------------
239 ; Uniformity edge cases
240 ; --------------------------------------------------------------------------------
; LDS (addrspace(3)) global that the uniform-pointer tests below load their
; global base pointer from. The placeholder initializer is poison rather than
; the deprecated undef.
242 @ptr.in.lds = internal addrspace(3) global ptr addrspace(1) poison
244 ; Base pointer is uniform, but also in VGPRs
; %sbase is loaded from the LDS global @ptr.in.lds instead of being passed
; inreg. The checks expect the loaded pair v[2:3] to be copied into s[0:1] with
; v_readfirstlane_b32 so that the returning swap can still use the saddr form.
245 define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
246 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
248 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
249 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
250 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
252 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
254 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
255 ; GFX9-NEXT: s_waitcnt vmcnt(0)
256 ; GFX9-NEXT: buffer_wbinvl1
257 ; GFX9-NEXT: ; return to shader part epilog
259 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
261 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
262 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
263 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
265 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
266 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
267 ; GFX10-NEXT: s_waitcnt vmcnt(0)
268 ; GFX10-NEXT: buffer_gl1_inv
269 ; GFX10-NEXT: buffer_gl0_inv
270 ; GFX10-NEXT: ; return to shader part epilog
272 ; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
274 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
275 ; GFX11-NEXT: ds_load_b64 v[2:3], v2
276 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
277 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2
278 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3
279 ; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] glc
280 ; GFX11-NEXT: s_waitcnt vmcnt(0)
281 ; GFX11-NEXT: buffer_gl1_inv
282 ; GFX11-NEXT: buffer_gl0_inv
283 ; GFX11-NEXT: ; return to shader part epilog
285 ; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
287 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
288 ; GFX12-NEXT: ds_load_b64 v[2:3], v2
289 ; GFX12-NEXT: s_wait_dscnt 0x0
290 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2
291 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3
292 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
293 ; GFX12-NEXT: s_wait_loadcnt 0x0
294 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
295 ; GFX12-NEXT: ; return to shader part epilog
296 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
297 %zext.offset = zext i32 %voffset to i64
298 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
299 %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
300 %cast.rtn = bitcast i32 %rtn to float
304 ; Base pointer is uniform, but also in VGPRs, with imm offset
; %sbase is loaded from @ptr.in.lds. After the expected v_readfirstlane_b32
; copies into s[0:1], the constant 42 from the second GEP is expected to fold
; as offset:42 on the returning swap.
305 define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) {
306 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
308 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
309 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
310 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
312 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
314 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
315 ; GFX9-NEXT: s_waitcnt vmcnt(0)
316 ; GFX9-NEXT: buffer_wbinvl1
317 ; GFX9-NEXT: ; return to shader part epilog
319 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
321 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
322 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
323 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
324 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
325 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
326 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
327 ; GFX10-NEXT: s_waitcnt vmcnt(0)
328 ; GFX10-NEXT: buffer_gl1_inv
329 ; GFX10-NEXT: buffer_gl0_inv
330 ; GFX10-NEXT: ; return to shader part epilog
332 ; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
334 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
335 ; GFX11-NEXT: ds_load_b64 v[2:3], v2
336 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2
338 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3
339 ; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 glc
340 ; GFX11-NEXT: s_waitcnt vmcnt(0)
341 ; GFX11-NEXT: buffer_gl1_inv
342 ; GFX11-NEXT: buffer_gl0_inv
343 ; GFX11-NEXT: ; return to shader part epilog
345 ; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
347 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
348 ; GFX12-NEXT: ds_load_b64 v[2:3], v2
349 ; GFX12-NEXT: s_wait_dscnt 0x0
350 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2
351 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3
352 ; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
353 ; GFX12-NEXT: s_wait_loadcnt 0x0
354 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
355 ; GFX12-NEXT: ; return to shader part epilog
356 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
357 %zext.offset = zext i32 %voffset to i64
358 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
359 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
360 %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
361 %cast.rtn = bitcast i32 %rtn to float
365 ; Base pointer is uniform, but also in VGPRs
; Variant whose xchg result is unused: the base loaded from @ptr.in.lds is
; expected to be copied into s[0:1] with v_readfirstlane_b32, and the swap is
; expected in its saddr form with no destination register.
366 define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) {
367 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
369 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
370 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
371 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
373 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
375 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1]
376 ; GFX9-NEXT: s_waitcnt vmcnt(0)
377 ; GFX9-NEXT: buffer_wbinvl1
378 ; GFX9-NEXT: s_endpgm
380 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
382 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
383 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
384 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
386 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
387 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1]
388 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
389 ; GFX10-NEXT: buffer_gl1_inv
390 ; GFX10-NEXT: buffer_gl0_inv
391 ; GFX10-NEXT: s_endpgm
393 ; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
395 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
396 ; GFX11-NEXT: ds_load_b64 v[2:3], v2
397 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
398 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2
399 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3
400 ; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
401 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
402 ; GFX11-NEXT: buffer_gl1_inv
403 ; GFX11-NEXT: buffer_gl0_inv
404 ; GFX11-NEXT: s_endpgm
406 ; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
408 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
409 ; GFX12-NEXT: ds_load_b64 v[2:3], v2
410 ; GFX12-NEXT: s_wait_dscnt 0x0
411 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2
412 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3
413 ; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
414 ; GFX12-NEXT: s_wait_storecnt 0x0
415 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
416 ; GFX12-NEXT: s_endpgm
417 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
418 %zext.offset = zext i32 %voffset to i64
419 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
420 %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
424 ; Base pointer is uniform, but also in VGPRs, with imm offset
; Variant whose xchg result is unused: the base loaded from @ptr.in.lds is
; expected to be copied into s[0:1] with v_readfirstlane_b32, and the constant
; 42 from the second GEP is expected to fold as offset:42.
425 define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) {
426 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
428 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
429 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
430 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
432 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
434 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
435 ; GFX9-NEXT: s_waitcnt vmcnt(0)
436 ; GFX9-NEXT: buffer_wbinvl1
437 ; GFX9-NEXT: s_endpgm
439 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
441 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
442 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
443 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
445 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
446 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
447 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
448 ; GFX10-NEXT: buffer_gl1_inv
449 ; GFX10-NEXT: buffer_gl0_inv
450 ; GFX10-NEXT: s_endpgm
452 ; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
454 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
455 ; GFX11-NEXT: ds_load_b64 v[2:3], v2
456 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX11-NEXT: v_readfirstlane_b32 s0, v2
458 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3
459 ; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42
460 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
461 ; GFX11-NEXT: buffer_gl1_inv
462 ; GFX11-NEXT: buffer_gl0_inv
463 ; GFX11-NEXT: s_endpgm
465 ; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
467 ; GFX12-NEXT: v_mov_b32_e32 v2, 0
468 ; GFX12-NEXT: ds_load_b64 v[2:3], v2
469 ; GFX12-NEXT: s_wait_dscnt 0x0
470 ; GFX12-NEXT: v_readfirstlane_b32 s0, v2
471 ; GFX12-NEXT: v_readfirstlane_b32 s1, v3
472 ; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV
473 ; GFX12-NEXT: s_wait_storecnt 0x0
474 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
475 ; GFX12-NEXT: s_endpgm
476 %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
477 %zext.offset = zext i32 %voffset to i64
478 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
479 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
480 %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
484 ; --------------------------------------------------------------------------------
; atomicrmw ops
486 ; --------------------------------------------------------------------------------
488 ; --------------------------------------------------------------------------------
; atomicrmw xchg (i64)
490 ; --------------------------------------------------------------------------------
; Returning i64 xchg; the old value is returned as <2 x float>. The checks
; expect the 64-bit swap (global_atomic_swap_x2 on gfx9/gfx10,
; global_atomic_swap_b64 on gfx11/gfx12) in saddr form, with the data in v[1:2]
; and the result in v[0:1].
492 define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
493 ; GFX9-LABEL: global_xchg_saddr_i64_rtn:
495 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc
496 ; GFX9-NEXT: s_waitcnt vmcnt(0)
497 ; GFX9-NEXT: buffer_wbinvl1
498 ; GFX9-NEXT: ; return to shader part epilog
500 ; GFX10-LABEL: global_xchg_saddr_i64_rtn:
502 ; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc
503 ; GFX10-NEXT: s_waitcnt vmcnt(0)
504 ; GFX10-NEXT: buffer_gl1_inv
505 ; GFX10-NEXT: buffer_gl0_inv
506 ; GFX10-NEXT: ; return to shader part epilog
508 ; GFX11-LABEL: global_xchg_saddr_i64_rtn:
510 ; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] glc
511 ; GFX11-NEXT: s_waitcnt vmcnt(0)
512 ; GFX11-NEXT: buffer_gl1_inv
513 ; GFX11-NEXT: buffer_gl0_inv
514 ; GFX11-NEXT: ; return to shader part epilog
516 ; GFX12-LABEL: global_xchg_saddr_i64_rtn:
518 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
519 ; GFX12-NEXT: s_wait_loadcnt 0x0
520 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
521 ; GFX12-NEXT: ; return to shader part epilog
522 %zext.offset = zext i32 %voffset to i64
523 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
524 %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
525 %cast.rtn = bitcast i64 %rtn to <2 x float>
526 ret <2 x float> %cast.rtn
; Returning i64 xchg with a constant offset of -128 from the second GEP. Every
; checked target is expected to fold it as offset:-128 on the saddr-form 64-bit
; swap.
529 define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
530 ; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128:
532 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
533 ; GFX9-NEXT: s_waitcnt vmcnt(0)
534 ; GFX9-NEXT: buffer_wbinvl1
535 ; GFX9-NEXT: ; return to shader part epilog
537 ; GFX10-LABEL: global_xchg_saddr_i64_rtn_neg128:
539 ; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
540 ; GFX10-NEXT: s_waitcnt vmcnt(0)
541 ; GFX10-NEXT: buffer_gl1_inv
542 ; GFX10-NEXT: buffer_gl0_inv
543 ; GFX10-NEXT: ; return to shader part epilog
545 ; GFX11-LABEL: global_xchg_saddr_i64_rtn_neg128:
547 ; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
548 ; GFX11-NEXT: s_waitcnt vmcnt(0)
549 ; GFX11-NEXT: buffer_gl1_inv
550 ; GFX11-NEXT: buffer_gl0_inv
551 ; GFX11-NEXT: ; return to shader part epilog
553 ; GFX12-LABEL: global_xchg_saddr_i64_rtn_neg128:
555 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
556 ; GFX12-NEXT: s_wait_loadcnt 0x0
557 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
558 ; GFX12-NEXT: ; return to shader part epilog
559 %zext.offset = zext i32 %voffset to i64
560 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
561 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
562 %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
563 %cast.rtn = bitcast i64 %rtn to <2 x float>
564 ret <2 x float> %cast.rtn
; i64 xchg whose result is unused. The checks expect the saddr-form 64-bit swap
; with the data in v[1:2], the offset in v0 and no destination register.
567 define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
568 ; GFX9-LABEL: global_xchg_saddr_i64_nortn:
570 ; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3]
571 ; GFX9-NEXT: s_waitcnt vmcnt(0)
572 ; GFX9-NEXT: buffer_wbinvl1
573 ; GFX9-NEXT: s_endpgm
575 ; GFX10-LABEL: global_xchg_saddr_i64_nortn:
577 ; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3]
578 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
579 ; GFX10-NEXT: buffer_gl1_inv
580 ; GFX10-NEXT: buffer_gl0_inv
581 ; GFX10-NEXT: s_endpgm
583 ; GFX11-LABEL: global_xchg_saddr_i64_nortn:
585 ; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3]
586 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
587 ; GFX11-NEXT: buffer_gl1_inv
588 ; GFX11-NEXT: buffer_gl0_inv
589 ; GFX11-NEXT: s_endpgm
591 ; GFX12-LABEL: global_xchg_saddr_i64_nortn:
593 ; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
594 ; GFX12-NEXT: s_wait_storecnt 0x0
595 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
596 ; GFX12-NEXT: s_endpgm
597 %zext.offset = zext i32 %voffset to i64
598 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
599 %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; i64 xchg whose result is unused, with a constant offset of -128 from the
; second GEP. Every checked target is expected to fold it as offset:-128 on the
; saddr-form 64-bit swap.
603 define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
604 ; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128:
606 ; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128
607 ; GFX9-NEXT: s_waitcnt vmcnt(0)
608 ; GFX9-NEXT: buffer_wbinvl1
609 ; GFX9-NEXT: s_endpgm
611 ; GFX10-LABEL: global_xchg_saddr_i64_nortn_neg128:
613 ; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128
614 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
615 ; GFX10-NEXT: buffer_gl1_inv
616 ; GFX10-NEXT: buffer_gl0_inv
617 ; GFX10-NEXT: s_endpgm
619 ; GFX11-LABEL: global_xchg_saddr_i64_nortn_neg128:
621 ; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128
622 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
623 ; GFX11-NEXT: buffer_gl1_inv
624 ; GFX11-NEXT: buffer_gl0_inv
625 ; GFX11-NEXT: s_endpgm
627 ; GFX12-LABEL: global_xchg_saddr_i64_nortn_neg128:
629 ; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
630 ; GFX12-NEXT: s_wait_storecnt 0x0
631 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
632 ; GFX12-NEXT: s_endpgm
633 %zext.offset = zext i32 %voffset to i64
634 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
635 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
636 %unused = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
640 ; --------------------------------------------------------------------------------
; atomicrmw add
642 ; --------------------------------------------------------------------------------
; Returning i32 atomic add: the old value is bitcast to float. The checks
; expect the saddr form (global_atomic_add on gfx9/gfx10, global_atomic_add_u32
; on gfx11/gfx12) with a v0 destination, marked glc before gfx12 and
; th:TH_ATOMIC_RETURN on gfx12.
644 define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
645 ; GFX9-LABEL: global_add_saddr_i32_rtn:
647 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc
648 ; GFX9-NEXT: s_waitcnt vmcnt(0)
649 ; GFX9-NEXT: buffer_wbinvl1
650 ; GFX9-NEXT: ; return to shader part epilog
652 ; GFX10-LABEL: global_add_saddr_i32_rtn:
654 ; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc
655 ; GFX10-NEXT: s_waitcnt vmcnt(0)
656 ; GFX10-NEXT: buffer_gl1_inv
657 ; GFX10-NEXT: buffer_gl0_inv
658 ; GFX10-NEXT: ; return to shader part epilog
660 ; GFX11-LABEL: global_add_saddr_i32_rtn:
662 ; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] glc
663 ; GFX11-NEXT: s_waitcnt vmcnt(0)
664 ; GFX11-NEXT: buffer_gl1_inv
665 ; GFX11-NEXT: buffer_gl0_inv
666 ; GFX11-NEXT: ; return to shader part epilog
668 ; GFX12-LABEL: global_add_saddr_i32_rtn:
670 ; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
671 ; GFX12-NEXT: s_wait_loadcnt 0x0
672 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
673 ; GFX12-NEXT: ; return to shader part epilog
674 %zext.offset = zext i32 %voffset to i64
675 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
676 %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
677 %cast.rtn = bitcast i32 %rtn to float
; Returning i32 atomic add with a constant offset of -128 from the second GEP.
; Every checked target is expected to fold it as offset:-128 on the saddr-form
; add.
681 define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
682 ; GFX9-LABEL: global_add_saddr_i32_rtn_neg128:
684 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc
685 ; GFX9-NEXT: s_waitcnt vmcnt(0)
686 ; GFX9-NEXT: buffer_wbinvl1
687 ; GFX9-NEXT: ; return to shader part epilog
689 ; GFX10-LABEL: global_add_saddr_i32_rtn_neg128:
691 ; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc
692 ; GFX10-NEXT: s_waitcnt vmcnt(0)
693 ; GFX10-NEXT: buffer_gl1_inv
694 ; GFX10-NEXT: buffer_gl0_inv
695 ; GFX10-NEXT: ; return to shader part epilog
697 ; GFX11-LABEL: global_add_saddr_i32_rtn_neg128:
699 ; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 glc
700 ; GFX11-NEXT: s_waitcnt vmcnt(0)
701 ; GFX11-NEXT: buffer_gl1_inv
702 ; GFX11-NEXT: buffer_gl0_inv
703 ; GFX11-NEXT: ; return to shader part epilog
705 ; GFX12-LABEL: global_add_saddr_i32_rtn_neg128:
707 ; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
708 ; GFX12-NEXT: s_wait_loadcnt 0x0
709 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
710 ; GFX12-NEXT: ; return to shader part epilog
711 %zext.offset = zext i32 %voffset to i64
712 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
713 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
714 %rtn = atomicrmw add ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
715 %cast.rtn = bitcast i32 %rtn to float
; i32 atomic add whose result is unused. The checks expect the saddr form
; (s[2:3] base plus v0 offset) with the data in v1 and no destination register.
719 define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
720 ; GFX9-LABEL: global_add_saddr_i32_nortn:
722 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3]
723 ; GFX9-NEXT: s_waitcnt vmcnt(0)
724 ; GFX9-NEXT: buffer_wbinvl1
725 ; GFX9-NEXT: s_endpgm
727 ; GFX10-LABEL: global_add_saddr_i32_nortn:
729 ; GFX10-NEXT: global_atomic_add v0, v1, s[2:3]
730 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
731 ; GFX10-NEXT: buffer_gl1_inv
732 ; GFX10-NEXT: buffer_gl0_inv
733 ; GFX10-NEXT: s_endpgm
735 ; GFX11-LABEL: global_add_saddr_i32_nortn:
737 ; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3]
738 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
739 ; GFX11-NEXT: buffer_gl1_inv
740 ; GFX11-NEXT: buffer_gl0_inv
741 ; GFX11-NEXT: s_endpgm
743 ; GFX12-LABEL: global_add_saddr_i32_nortn:
745 ; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV
746 ; GFX12-NEXT: s_wait_storecnt 0x0
747 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
748 ; GFX12-NEXT: s_endpgm
749 %zext.offset = zext i32 %voffset to i64
750 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
751 %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; i32 atomic add whose result is unused, with a constant offset of -128 from
; the second GEP. Every checked target is expected to fold it as offset:-128 on
; the saddr-form add.
755 define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
756 ; GFX9-LABEL: global_add_saddr_i32_nortn_neg128:
758 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128
759 ; GFX9-NEXT: s_waitcnt vmcnt(0)
760 ; GFX9-NEXT: buffer_wbinvl1
761 ; GFX9-NEXT: s_endpgm
763 ; GFX10-LABEL: global_add_saddr_i32_nortn_neg128:
765 ; GFX10-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128
766 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
767 ; GFX10-NEXT: buffer_gl1_inv
768 ; GFX10-NEXT: buffer_gl0_inv
769 ; GFX10-NEXT: s_endpgm
771 ; GFX11-LABEL: global_add_saddr_i32_nortn_neg128:
773 ; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128
774 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
775 ; GFX11-NEXT: buffer_gl1_inv
776 ; GFX11-NEXT: buffer_gl0_inv
777 ; GFX11-NEXT: s_endpgm
779 ; GFX12-LABEL: global_add_saddr_i32_nortn_neg128:
781 ; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
782 ; GFX12-NEXT: s_wait_storecnt 0x0
783 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
784 ; GFX12-NEXT: s_endpgm
785 %zext.offset = zext i32 %voffset to i64
786 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
787 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
788 %unused = atomicrmw add ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; Returning i64 atomic add; the old value is returned as <2 x float>. The
; checks expect the 64-bit add (global_atomic_add_x2 on gfx9/gfx10,
; global_atomic_add_u64 on gfx11/gfx12) in saddr form, with the data in v[1:2]
; and the result in v[0:1].
792 define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
793 ; GFX9-LABEL: global_add_saddr_i64_rtn:
795 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc
796 ; GFX9-NEXT: s_waitcnt vmcnt(0)
797 ; GFX9-NEXT: buffer_wbinvl1
798 ; GFX9-NEXT: ; return to shader part epilog
800 ; GFX10-LABEL: global_add_saddr_i64_rtn:
802 ; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc
803 ; GFX10-NEXT: s_waitcnt vmcnt(0)
804 ; GFX10-NEXT: buffer_gl1_inv
805 ; GFX10-NEXT: buffer_gl0_inv
806 ; GFX10-NEXT: ; return to shader part epilog
808 ; GFX11-LABEL: global_add_saddr_i64_rtn:
810 ; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] glc
811 ; GFX11-NEXT: s_waitcnt vmcnt(0)
812 ; GFX11-NEXT: buffer_gl1_inv
813 ; GFX11-NEXT: buffer_gl0_inv
814 ; GFX11-NEXT: ; return to shader part epilog
816 ; GFX12-LABEL: global_add_saddr_i64_rtn:
818 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
819 ; GFX12-NEXT: s_wait_loadcnt 0x0
820 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
821 ; GFX12-NEXT: ; return to shader part epilog
822 %zext.offset = zext i32 %voffset to i64
823 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
824 %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
825 %cast.rtn = bitcast i64 %rtn to <2 x float>
826 ret <2 x float> %cast.rtn
; i64 atomicrmw add, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; <2 x float> and returned.
829 define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
830 ; GFX9-LABEL: global_add_saddr_i64_rtn_neg128:
832 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
833 ; GFX9-NEXT: s_waitcnt vmcnt(0)
834 ; GFX9-NEXT: buffer_wbinvl1
835 ; GFX9-NEXT: ; return to shader part epilog
837 ; GFX10-LABEL: global_add_saddr_i64_rtn_neg128:
839 ; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
840 ; GFX10-NEXT: s_waitcnt vmcnt(0)
841 ; GFX10-NEXT: buffer_gl1_inv
842 ; GFX10-NEXT: buffer_gl0_inv
843 ; GFX10-NEXT: ; return to shader part epilog
845 ; GFX11-LABEL: global_add_saddr_i64_rtn_neg128:
847 ; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
848 ; GFX11-NEXT: s_waitcnt vmcnt(0)
849 ; GFX11-NEXT: buffer_gl1_inv
850 ; GFX11-NEXT: buffer_gl0_inv
851 ; GFX11-NEXT: ; return to shader part epilog
853 ; GFX12-LABEL: global_add_saddr_i64_rtn_neg128:
855 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
856 ; GFX12-NEXT: s_wait_loadcnt 0x0
857 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
858 ; GFX12-NEXT: ; return to shader part epilog
859 %zext.offset = zext i32 %voffset to i64
860 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
861 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
862 %rtn = atomicrmw add ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
863 %cast.rtn = bitcast i64 %rtn to <2 x float>
864 ret <2 x float> %cast.rtn
; i64 atomicrmw add, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
867 define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
868 ; GFX9-LABEL: global_add_saddr_i64_nortn:
870 ; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3]
871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
872 ; GFX9-NEXT: buffer_wbinvl1
873 ; GFX9-NEXT: s_endpgm
875 ; GFX10-LABEL: global_add_saddr_i64_nortn:
877 ; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3]
878 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
879 ; GFX10-NEXT: buffer_gl1_inv
880 ; GFX10-NEXT: buffer_gl0_inv
881 ; GFX10-NEXT: s_endpgm
883 ; GFX11-LABEL: global_add_saddr_i64_nortn:
885 ; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3]
886 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
887 ; GFX11-NEXT: buffer_gl1_inv
888 ; GFX11-NEXT: buffer_gl0_inv
889 ; GFX11-NEXT: s_endpgm
891 ; GFX12-LABEL: global_add_saddr_i64_nortn:
893 ; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
894 ; GFX12-NEXT: s_wait_storecnt 0x0
895 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
896 ; GFX12-NEXT: s_endpgm
897 %zext.offset = zext i32 %voffset to i64
898 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
899 %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; i64 atomicrmw add, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
903 define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
904 ; GFX9-LABEL: global_add_saddr_i64_nortn_neg128:
906 ; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128
907 ; GFX9-NEXT: s_waitcnt vmcnt(0)
908 ; GFX9-NEXT: buffer_wbinvl1
909 ; GFX9-NEXT: s_endpgm
911 ; GFX10-LABEL: global_add_saddr_i64_nortn_neg128:
913 ; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128
914 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
915 ; GFX10-NEXT: buffer_gl1_inv
916 ; GFX10-NEXT: buffer_gl0_inv
917 ; GFX10-NEXT: s_endpgm
919 ; GFX11-LABEL: global_add_saddr_i64_nortn_neg128:
921 ; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128
922 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
923 ; GFX11-NEXT: buffer_gl1_inv
924 ; GFX11-NEXT: buffer_gl0_inv
925 ; GFX11-NEXT: s_endpgm
927 ; GFX12-LABEL: global_add_saddr_i64_nortn_neg128:
929 ; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
930 ; GFX12-NEXT: s_wait_storecnt 0x0
931 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
932 ; GFX12-NEXT: s_endpgm
933 %zext.offset = zext i32 %voffset to i64
934 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
935 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
936 %unused = atomicrmw add ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
940 ; --------------------------------------------------------------------------------
942 ; --------------------------------------------------------------------------------
; i32 atomicrmw sub, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset. The old value is bitcast to float for the
; function's float result.
944 define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
945 ; GFX9-LABEL: global_sub_saddr_i32_rtn:
947 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc
948 ; GFX9-NEXT: s_waitcnt vmcnt(0)
949 ; GFX9-NEXT: buffer_wbinvl1
950 ; GFX9-NEXT: ; return to shader part epilog
952 ; GFX10-LABEL: global_sub_saddr_i32_rtn:
954 ; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc
955 ; GFX10-NEXT: s_waitcnt vmcnt(0)
956 ; GFX10-NEXT: buffer_gl1_inv
957 ; GFX10-NEXT: buffer_gl0_inv
958 ; GFX10-NEXT: ; return to shader part epilog
960 ; GFX11-LABEL: global_sub_saddr_i32_rtn:
962 ; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] glc
963 ; GFX11-NEXT: s_waitcnt vmcnt(0)
964 ; GFX11-NEXT: buffer_gl1_inv
965 ; GFX11-NEXT: buffer_gl0_inv
966 ; GFX11-NEXT: ; return to shader part epilog
968 ; GFX12-LABEL: global_sub_saddr_i32_rtn:
970 ; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
971 ; GFX12-NEXT: s_wait_loadcnt 0x0
972 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
973 ; GFX12-NEXT: ; return to shader part epilog
974 %zext.offset = zext i32 %voffset to i64
975 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
976 %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
977 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw sub, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; float for the function's float result.
981 define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
982 ; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128:
984 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc
985 ; GFX9-NEXT: s_waitcnt vmcnt(0)
986 ; GFX9-NEXT: buffer_wbinvl1
987 ; GFX9-NEXT: ; return to shader part epilog
989 ; GFX10-LABEL: global_sub_saddr_i32_rtn_neg128:
991 ; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc
992 ; GFX10-NEXT: s_waitcnt vmcnt(0)
993 ; GFX10-NEXT: buffer_gl1_inv
994 ; GFX10-NEXT: buffer_gl0_inv
995 ; GFX10-NEXT: ; return to shader part epilog
997 ; GFX11-LABEL: global_sub_saddr_i32_rtn_neg128:
999 ; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 glc
1000 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1001 ; GFX11-NEXT: buffer_gl1_inv
1002 ; GFX11-NEXT: buffer_gl0_inv
1003 ; GFX11-NEXT: ; return to shader part epilog
1005 ; GFX12-LABEL: global_sub_saddr_i32_rtn_neg128:
1007 ; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1008 ; GFX12-NEXT: s_wait_loadcnt 0x0
1009 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1010 ; GFX12-NEXT: ; return to shader part epilog
1011 %zext.offset = zext i32 %voffset to i64
1012 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1013 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1014 %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
1015 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw sub, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
1019 define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1020 ; GFX9-LABEL: global_sub_saddr_i32_nortn:
1022 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3]
1023 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1024 ; GFX9-NEXT: buffer_wbinvl1
1025 ; GFX9-NEXT: s_endpgm
1027 ; GFX10-LABEL: global_sub_saddr_i32_nortn:
1029 ; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3]
1030 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1031 ; GFX10-NEXT: buffer_gl1_inv
1032 ; GFX10-NEXT: buffer_gl0_inv
1033 ; GFX10-NEXT: s_endpgm
1035 ; GFX11-LABEL: global_sub_saddr_i32_nortn:
1037 ; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3]
1038 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1039 ; GFX11-NEXT: buffer_gl1_inv
1040 ; GFX11-NEXT: buffer_gl0_inv
1041 ; GFX11-NEXT: s_endpgm
1043 ; GFX12-LABEL: global_sub_saddr_i32_nortn:
1045 ; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV
1046 ; GFX12-NEXT: s_wait_storecnt 0x0
1047 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1048 ; GFX12-NEXT: s_endpgm
1049 %zext.offset = zext i32 %voffset to i64
1050 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1051 %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; i32 atomicrmw sub, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
1055 define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1056 ; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128:
1058 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128
1059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX9-NEXT: buffer_wbinvl1
1061 ; GFX9-NEXT: s_endpgm
1063 ; GFX10-LABEL: global_sub_saddr_i32_nortn_neg128:
1065 ; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128
1066 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1067 ; GFX10-NEXT: buffer_gl1_inv
1068 ; GFX10-NEXT: buffer_gl0_inv
1069 ; GFX10-NEXT: s_endpgm
1071 ; GFX11-LABEL: global_sub_saddr_i32_nortn_neg128:
1073 ; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128
1074 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1075 ; GFX11-NEXT: buffer_gl1_inv
1076 ; GFX11-NEXT: buffer_gl0_inv
1077 ; GFX11-NEXT: s_endpgm
1079 ; GFX12-LABEL: global_sub_saddr_i32_nortn_neg128:
1081 ; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
1082 ; GFX12-NEXT: s_wait_storecnt 0x0
1083 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1084 ; GFX12-NEXT: s_endpgm
1085 %zext.offset = zext i32 %voffset to i64
1086 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1087 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1088 %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; i64 atomicrmw sub, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset. The old value is bitcast to <2 x float> and returned.
1092 define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1093 ; GFX9-LABEL: global_sub_saddr_i64_rtn:
1095 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc
1096 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX9-NEXT: buffer_wbinvl1
1098 ; GFX9-NEXT: ; return to shader part epilog
1100 ; GFX10-LABEL: global_sub_saddr_i64_rtn:
1102 ; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc
1103 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1104 ; GFX10-NEXT: buffer_gl1_inv
1105 ; GFX10-NEXT: buffer_gl0_inv
1106 ; GFX10-NEXT: ; return to shader part epilog
1108 ; GFX11-LABEL: global_sub_saddr_i64_rtn:
1110 ; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] glc
1111 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1112 ; GFX11-NEXT: buffer_gl1_inv
1113 ; GFX11-NEXT: buffer_gl0_inv
1114 ; GFX11-NEXT: ; return to shader part epilog
1116 ; GFX12-LABEL: global_sub_saddr_i64_rtn:
1118 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1119 ; GFX12-NEXT: s_wait_loadcnt 0x0
1120 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1121 ; GFX12-NEXT: ; return to shader part epilog
1122 %zext.offset = zext i32 %voffset to i64
1123 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1124 %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
1125 %cast.rtn = bitcast i64 %rtn to <2 x float>
1126 ret <2 x float> %cast.rtn
; i64 atomicrmw sub, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; <2 x float> and returned.
1129 define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1130 ; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128:
1132 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1134 ; GFX9-NEXT: buffer_wbinvl1
1135 ; GFX9-NEXT: ; return to shader part epilog
1137 ; GFX10-LABEL: global_sub_saddr_i64_rtn_neg128:
1139 ; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1140 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1141 ; GFX10-NEXT: buffer_gl1_inv
1142 ; GFX10-NEXT: buffer_gl0_inv
1143 ; GFX10-NEXT: ; return to shader part epilog
1145 ; GFX11-LABEL: global_sub_saddr_i64_rtn_neg128:
1147 ; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1148 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1149 ; GFX11-NEXT: buffer_gl1_inv
1150 ; GFX11-NEXT: buffer_gl0_inv
1151 ; GFX11-NEXT: ; return to shader part epilog
1153 ; GFX12-LABEL: global_sub_saddr_i64_rtn_neg128:
1155 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1156 ; GFX12-NEXT: s_wait_loadcnt 0x0
1157 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1158 ; GFX12-NEXT: ; return to shader part epilog
1159 %zext.offset = zext i32 %voffset to i64
1160 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1161 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1162 %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
1163 %cast.rtn = bitcast i64 %rtn to <2 x float>
1164 ret <2 x float> %cast.rtn
; i64 atomicrmw sub, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
1167 define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1168 ; GFX9-LABEL: global_sub_saddr_i64_nortn:
1170 ; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3]
1171 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1172 ; GFX9-NEXT: buffer_wbinvl1
1173 ; GFX9-NEXT: s_endpgm
1175 ; GFX10-LABEL: global_sub_saddr_i64_nortn:
1177 ; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3]
1178 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1179 ; GFX10-NEXT: buffer_gl1_inv
1180 ; GFX10-NEXT: buffer_gl0_inv
1181 ; GFX10-NEXT: s_endpgm
1183 ; GFX11-LABEL: global_sub_saddr_i64_nortn:
1185 ; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3]
1186 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1187 ; GFX11-NEXT: buffer_gl1_inv
1188 ; GFX11-NEXT: buffer_gl0_inv
1189 ; GFX11-NEXT: s_endpgm
1191 ; GFX12-LABEL: global_sub_saddr_i64_nortn:
1193 ; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
1194 ; GFX12-NEXT: s_wait_storecnt 0x0
1195 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1196 ; GFX12-NEXT: s_endpgm
1197 %zext.offset = zext i32 %voffset to i64
1198 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1199 %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; i64 atomicrmw sub, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
1203 define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1204 ; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128:
1206 ; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128
1207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1208 ; GFX9-NEXT: buffer_wbinvl1
1209 ; GFX9-NEXT: s_endpgm
1211 ; GFX10-LABEL: global_sub_saddr_i64_nortn_neg128:
1213 ; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128
1214 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1215 ; GFX10-NEXT: buffer_gl1_inv
1216 ; GFX10-NEXT: buffer_gl0_inv
1217 ; GFX10-NEXT: s_endpgm
1219 ; GFX11-LABEL: global_sub_saddr_i64_nortn_neg128:
1221 ; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128
1222 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1223 ; GFX11-NEXT: buffer_gl1_inv
1224 ; GFX11-NEXT: buffer_gl0_inv
1225 ; GFX11-NEXT: s_endpgm
1227 ; GFX12-LABEL: global_sub_saddr_i64_nortn_neg128:
1229 ; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
1230 ; GFX12-NEXT: s_wait_storecnt 0x0
1231 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1232 ; GFX12-NEXT: s_endpgm
1233 %zext.offset = zext i32 %voffset to i64
1234 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1235 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1236 %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
1240 ; --------------------------------------------------------------------------------
1242 ; --------------------------------------------------------------------------------
; i32 atomicrmw and, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset. The old value is bitcast to float for the
; function's float result.
1244 define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1245 ; GFX9-LABEL: global_and_saddr_i32_rtn:
1247 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc
1248 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1249 ; GFX9-NEXT: buffer_wbinvl1
1250 ; GFX9-NEXT: ; return to shader part epilog
1252 ; GFX10-LABEL: global_and_saddr_i32_rtn:
1254 ; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc
1255 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1256 ; GFX10-NEXT: buffer_gl1_inv
1257 ; GFX10-NEXT: buffer_gl0_inv
1258 ; GFX10-NEXT: ; return to shader part epilog
1260 ; GFX11-LABEL: global_and_saddr_i32_rtn:
1262 ; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] glc
1263 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1264 ; GFX11-NEXT: buffer_gl1_inv
1265 ; GFX11-NEXT: buffer_gl0_inv
1266 ; GFX11-NEXT: ; return to shader part epilog
1268 ; GFX12-LABEL: global_and_saddr_i32_rtn:
1270 ; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1271 ; GFX12-NEXT: s_wait_loadcnt 0x0
1272 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1273 ; GFX12-NEXT: ; return to shader part epilog
1274 %zext.offset = zext i32 %voffset to i64
1275 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1276 %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
1277 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw and, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; float for the function's float result.
1281 define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1282 ; GFX9-LABEL: global_and_saddr_i32_rtn_neg128:
1284 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc
1285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1286 ; GFX9-NEXT: buffer_wbinvl1
1287 ; GFX9-NEXT: ; return to shader part epilog
1289 ; GFX10-LABEL: global_and_saddr_i32_rtn_neg128:
1291 ; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc
1292 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1293 ; GFX10-NEXT: buffer_gl1_inv
1294 ; GFX10-NEXT: buffer_gl0_inv
1295 ; GFX10-NEXT: ; return to shader part epilog
1297 ; GFX11-LABEL: global_and_saddr_i32_rtn_neg128:
1299 ; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 glc
1300 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1301 ; GFX11-NEXT: buffer_gl1_inv
1302 ; GFX11-NEXT: buffer_gl0_inv
1303 ; GFX11-NEXT: ; return to shader part epilog
1305 ; GFX12-LABEL: global_and_saddr_i32_rtn_neg128:
1307 ; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1308 ; GFX12-NEXT: s_wait_loadcnt 0x0
1309 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1310 ; GFX12-NEXT: ; return to shader part epilog
1311 %zext.offset = zext i32 %voffset to i64
1312 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1313 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1314 %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
1315 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw and, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
1319 define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1320 ; GFX9-LABEL: global_and_saddr_i32_nortn:
1322 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3]
1323 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1324 ; GFX9-NEXT: buffer_wbinvl1
1325 ; GFX9-NEXT: s_endpgm
1327 ; GFX10-LABEL: global_and_saddr_i32_nortn:
1329 ; GFX10-NEXT: global_atomic_and v0, v1, s[2:3]
1330 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1331 ; GFX10-NEXT: buffer_gl1_inv
1332 ; GFX10-NEXT: buffer_gl0_inv
1333 ; GFX10-NEXT: s_endpgm
1335 ; GFX11-LABEL: global_and_saddr_i32_nortn:
1337 ; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3]
1338 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1339 ; GFX11-NEXT: buffer_gl1_inv
1340 ; GFX11-NEXT: buffer_gl0_inv
1341 ; GFX11-NEXT: s_endpgm
1343 ; GFX12-LABEL: global_and_saddr_i32_nortn:
1345 ; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV
1346 ; GFX12-NEXT: s_wait_storecnt 0x0
1347 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1348 ; GFX12-NEXT: s_endpgm
1349 %zext.offset = zext i32 %voffset to i64
1350 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1351 %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; i32 atomicrmw and, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
1355 define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1356 ; GFX9-LABEL: global_and_saddr_i32_nortn_neg128:
1358 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128
1359 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1360 ; GFX9-NEXT: buffer_wbinvl1
1361 ; GFX9-NEXT: s_endpgm
1363 ; GFX10-LABEL: global_and_saddr_i32_nortn_neg128:
1365 ; GFX10-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128
1366 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1367 ; GFX10-NEXT: buffer_gl1_inv
1368 ; GFX10-NEXT: buffer_gl0_inv
1369 ; GFX10-NEXT: s_endpgm
1371 ; GFX11-LABEL: global_and_saddr_i32_nortn_neg128:
1373 ; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128
1374 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1375 ; GFX11-NEXT: buffer_gl1_inv
1376 ; GFX11-NEXT: buffer_gl0_inv
1377 ; GFX11-NEXT: s_endpgm
1379 ; GFX12-LABEL: global_and_saddr_i32_nortn_neg128:
1381 ; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
1382 ; GFX12-NEXT: s_wait_storecnt 0x0
1383 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1384 ; GFX12-NEXT: s_endpgm
1385 %zext.offset = zext i32 %voffset to i64
1386 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1387 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1388 %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; i64 atomicrmw and, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset. The old value is bitcast to <2 x float> and returned.
1392 define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1393 ; GFX9-LABEL: global_and_saddr_i64_rtn:
1395 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc
1396 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1397 ; GFX9-NEXT: buffer_wbinvl1
1398 ; GFX9-NEXT: ; return to shader part epilog
1400 ; GFX10-LABEL: global_and_saddr_i64_rtn:
1402 ; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc
1403 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1404 ; GFX10-NEXT: buffer_gl1_inv
1405 ; GFX10-NEXT: buffer_gl0_inv
1406 ; GFX10-NEXT: ; return to shader part epilog
1408 ; GFX11-LABEL: global_and_saddr_i64_rtn:
1410 ; GFX11-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] glc
1411 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1412 ; GFX11-NEXT: buffer_gl1_inv
1413 ; GFX11-NEXT: buffer_gl0_inv
1414 ; GFX11-NEXT: ; return to shader part epilog
1416 ; GFX12-LABEL: global_and_saddr_i64_rtn:
1418 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1419 ; GFX12-NEXT: s_wait_loadcnt 0x0
1420 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1421 ; GFX12-NEXT: ; return to shader part epilog
1422 %zext.offset = zext i32 %voffset to i64
1423 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1424 %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
1425 %cast.rtn = bitcast i64 %rtn to <2 x float>
1426 ret <2 x float> %cast.rtn
; i64 atomicrmw and, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; <2 x float> and returned.
1429 define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1430 ; GFX9-LABEL: global_and_saddr_i64_rtn_neg128:
1432 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1433 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1434 ; GFX9-NEXT: buffer_wbinvl1
1435 ; GFX9-NEXT: ; return to shader part epilog
1437 ; GFX10-LABEL: global_and_saddr_i64_rtn_neg128:
1439 ; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1440 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1441 ; GFX10-NEXT: buffer_gl1_inv
1442 ; GFX10-NEXT: buffer_gl0_inv
1443 ; GFX10-NEXT: ; return to shader part epilog
1445 ; GFX11-LABEL: global_and_saddr_i64_rtn_neg128:
1447 ; GFX11-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1448 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1449 ; GFX11-NEXT: buffer_gl1_inv
1450 ; GFX11-NEXT: buffer_gl0_inv
1451 ; GFX11-NEXT: ; return to shader part epilog
1453 ; GFX12-LABEL: global_and_saddr_i64_rtn_neg128:
1455 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1456 ; GFX12-NEXT: s_wait_loadcnt 0x0
1457 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1458 ; GFX12-NEXT: ; return to shader part epilog
1459 %zext.offset = zext i32 %voffset to i64
1460 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1461 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1462 %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
1463 %cast.rtn = bitcast i64 %rtn to <2 x float>
1464 ret <2 x float> %cast.rtn
; i64 atomicrmw and, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
1467 define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1468 ; GFX9-LABEL: global_and_saddr_i64_nortn:
1470 ; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3]
1471 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1472 ; GFX9-NEXT: buffer_wbinvl1
1473 ; GFX9-NEXT: s_endpgm
1475 ; GFX10-LABEL: global_and_saddr_i64_nortn:
1477 ; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3]
1478 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1479 ; GFX10-NEXT: buffer_gl1_inv
1480 ; GFX10-NEXT: buffer_gl0_inv
1481 ; GFX10-NEXT: s_endpgm
1483 ; GFX11-LABEL: global_and_saddr_i64_nortn:
1485 ; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3]
1486 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1487 ; GFX11-NEXT: buffer_gl1_inv
1488 ; GFX11-NEXT: buffer_gl0_inv
1489 ; GFX11-NEXT: s_endpgm
1491 ; GFX12-LABEL: global_and_saddr_i64_nortn:
1493 ; GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
1494 ; GFX12-NEXT: s_wait_storecnt 0x0
1495 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1496 ; GFX12-NEXT: s_endpgm
1497 %zext.offset = zext i32 %voffset to i64
1498 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1499 %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; i64 atomicrmw and, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
1503 define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1504 ; GFX9-LABEL: global_and_saddr_i64_nortn_neg128:
1506 ; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128
1507 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1508 ; GFX9-NEXT: buffer_wbinvl1
1509 ; GFX9-NEXT: s_endpgm
1511 ; GFX10-LABEL: global_and_saddr_i64_nortn_neg128:
1513 ; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128
1514 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1515 ; GFX10-NEXT: buffer_gl1_inv
1516 ; GFX10-NEXT: buffer_gl0_inv
1517 ; GFX10-NEXT: s_endpgm
1519 ; GFX11-LABEL: global_and_saddr_i64_nortn_neg128:
1521 ; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128
1522 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1523 ; GFX11-NEXT: buffer_gl1_inv
1524 ; GFX11-NEXT: buffer_gl0_inv
1525 ; GFX11-NEXT: s_endpgm
1527 ; GFX12-LABEL: global_and_saddr_i64_nortn_neg128:
1529 ; GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
1530 ; GFX12-NEXT: s_wait_storecnt 0x0
1531 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1532 ; GFX12-NEXT: s_endpgm
1533 %zext.offset = zext i32 %voffset to i64
1534 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1535 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1536 %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
1540 ; --------------------------------------------------------------------------------
1542 ; --------------------------------------------------------------------------------
; i32 atomicrmw or, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset. The old value is bitcast to float for the
; function's float result.
1544 define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1545 ; GFX9-LABEL: global_or_saddr_i32_rtn:
1547 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc
1548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1549 ; GFX9-NEXT: buffer_wbinvl1
1550 ; GFX9-NEXT: ; return to shader part epilog
1552 ; GFX10-LABEL: global_or_saddr_i32_rtn:
1554 ; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc
1555 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1556 ; GFX10-NEXT: buffer_gl1_inv
1557 ; GFX10-NEXT: buffer_gl0_inv
1558 ; GFX10-NEXT: ; return to shader part epilog
1560 ; GFX11-LABEL: global_or_saddr_i32_rtn:
1562 ; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] glc
1563 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1564 ; GFX11-NEXT: buffer_gl1_inv
1565 ; GFX11-NEXT: buffer_gl0_inv
1566 ; GFX11-NEXT: ; return to shader part epilog
1568 ; GFX12-LABEL: global_or_saddr_i32_rtn:
1570 ; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1571 ; GFX12-NEXT: s_wait_loadcnt 0x0
1572 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1573 ; GFX12-NEXT: ; return to shader part epilog
1574 %zext.offset = zext i32 %voffset to i64
1575 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1576 %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
1577 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw or, syncscope("agent") seq_cst, on %sbase (inreg) plus the
; zero-extended i32 %voffset plus a constant byte offset of -128. The expected
; instructions carry the constant as offset:-128. The old value is bitcast to
; float for the function's float result.
1581 define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1582 ; GFX9-LABEL: global_or_saddr_i32_rtn_neg128:
1584 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc
1585 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1586 ; GFX9-NEXT: buffer_wbinvl1
1587 ; GFX9-NEXT: ; return to shader part epilog
1589 ; GFX10-LABEL: global_or_saddr_i32_rtn_neg128:
1591 ; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc
1592 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1593 ; GFX10-NEXT: buffer_gl1_inv
1594 ; GFX10-NEXT: buffer_gl0_inv
1595 ; GFX10-NEXT: ; return to shader part epilog
1597 ; GFX11-LABEL: global_or_saddr_i32_rtn_neg128:
1599 ; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 glc
1600 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1601 ; GFX11-NEXT: buffer_gl1_inv
1602 ; GFX11-NEXT: buffer_gl0_inv
1603 ; GFX11-NEXT: ; return to shader part epilog
1605 ; GFX12-LABEL: global_or_saddr_i32_rtn_neg128:
1607 ; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1608 ; GFX12-NEXT: s_wait_loadcnt 0x0
1609 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1610 ; GFX12-NEXT: ; return to shader part epilog
1611 %zext.offset = zext i32 %voffset to i64
1612 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1613 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1614 %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
1615 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw or, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset, with no constant offset.
1619 define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1620 ; GFX9-LABEL: global_or_saddr_i32_nortn:
1622 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3]
1623 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1624 ; GFX9-NEXT: buffer_wbinvl1
1625 ; GFX9-NEXT: s_endpgm
1627 ; GFX10-LABEL: global_or_saddr_i32_nortn:
1629 ; GFX10-NEXT: global_atomic_or v0, v1, s[2:3]
1630 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1631 ; GFX10-NEXT: buffer_gl1_inv
1632 ; GFX10-NEXT: buffer_gl0_inv
1633 ; GFX10-NEXT: s_endpgm
1635 ; GFX11-LABEL: global_or_saddr_i32_nortn:
1637 ; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3]
1638 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1639 ; GFX11-NEXT: buffer_gl1_inv
1640 ; GFX11-NEXT: buffer_gl0_inv
1641 ; GFX11-NEXT: s_endpgm
1643 ; GFX12-LABEL: global_or_saddr_i32_nortn:
1645 ; GFX12-NEXT: global_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV
1646 ; GFX12-NEXT: s_wait_storecnt 0x0
1647 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1648 ; GFX12-NEXT: s_endpgm
1649 %zext.offset = zext i32 %voffset to i64
1650 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1651 %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; i32 atomicrmw or, syncscope("agent") seq_cst, in a void function: the old
; value is bound to %unused. The address is %sbase (inreg) plus the zero-extended
; i32 %voffset plus a constant byte offset of -128, which the expected
; instructions carry as offset:-128.
1655 define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1656 ; GFX9-LABEL: global_or_saddr_i32_nortn_neg128:
1658 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128
1659 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1660 ; GFX9-NEXT: buffer_wbinvl1
1661 ; GFX9-NEXT: s_endpgm
1663 ; GFX10-LABEL: global_or_saddr_i32_nortn_neg128:
1665 ; GFX10-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128
1666 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1667 ; GFX10-NEXT: buffer_gl1_inv
1668 ; GFX10-NEXT: buffer_gl0_inv
1669 ; GFX10-NEXT: s_endpgm
1671 ; GFX11-LABEL: global_or_saddr_i32_nortn_neg128:
1673 ; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128
1674 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1675 ; GFX11-NEXT: buffer_gl1_inv
1676 ; GFX11-NEXT: buffer_gl0_inv
1677 ; GFX11-NEXT: s_endpgm
1679 ; GFX12-LABEL: global_or_saddr_i32_nortn_neg128:
1681 ; GFX12-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
1682 ; GFX12-NEXT: s_wait_storecnt 0x0
1683 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1684 ; GFX12-NEXT: s_endpgm
1685 %zext.offset = zext i32 %voffset to i64
1686 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1687 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1688 %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; saddr test: 64-bit `atomicrmw or` with syncscope("agent") and seq_cst ordering
; whose result is returned as <2 x float>. The address is %sbase plus the
; zero-extended %voffset. The checks expect the 64-bit mnemonic (_x2 on
; gfx9/gfx10, _b64 on gfx11/gfx12) with a v[1:2] data pair and a v[0:1] result,
; in the returning form: glc before gfx12, th:TH_ATOMIC_RETURN on gfx12.
1692 define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1693 ; GFX9-LABEL: global_or_saddr_i64_rtn:
1695 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc
1696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1697 ; GFX9-NEXT: buffer_wbinvl1
1698 ; GFX9-NEXT: ; return to shader part epilog
1700 ; GFX10-LABEL: global_or_saddr_i64_rtn:
1702 ; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc
1703 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1704 ; GFX10-NEXT: buffer_gl1_inv
1705 ; GFX10-NEXT: buffer_gl0_inv
1706 ; GFX10-NEXT: ; return to shader part epilog
1708 ; GFX11-LABEL: global_or_saddr_i64_rtn:
1710 ; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] glc
1711 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1712 ; GFX11-NEXT: buffer_gl1_inv
1713 ; GFX11-NEXT: buffer_gl0_inv
1714 ; GFX11-NEXT: ; return to shader part epilog
1716 ; GFX12-LABEL: global_or_saddr_i64_rtn:
1718 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1719 ; GFX12-NEXT: s_wait_loadcnt 0x0
1720 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1721 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1722 %zext.offset = zext i32 %voffset to i64
1723 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1724 %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
1725 %cast.rtn = bitcast i64 %rtn to <2 x float>
1726 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw or` with syncscope("agent") and seq_cst ordering
; whose result is returned as <2 x float>, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the returning 64-bit form (glc before gfx12,
; th:TH_ATOMIC_RETURN on gfx12).
1729 define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1730 ; GFX9-LABEL: global_or_saddr_i64_rtn_neg128:
1732 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1733 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1734 ; GFX9-NEXT: buffer_wbinvl1
1735 ; GFX9-NEXT: ; return to shader part epilog
1737 ; GFX10-LABEL: global_or_saddr_i64_rtn_neg128:
1739 ; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1740 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1741 ; GFX10-NEXT: buffer_gl1_inv
1742 ; GFX10-NEXT: buffer_gl0_inv
1743 ; GFX10-NEXT: ; return to shader part epilog
1745 ; GFX11-LABEL: global_or_saddr_i64_rtn_neg128:
1747 ; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1748 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1749 ; GFX11-NEXT: buffer_gl1_inv
1750 ; GFX11-NEXT: buffer_gl0_inv
1751 ; GFX11-NEXT: ; return to shader part epilog
1753 ; GFX12-LABEL: global_or_saddr_i64_rtn_neg128:
1755 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1756 ; GFX12-NEXT: s_wait_loadcnt 0x0
1757 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1758 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1759 %zext.offset = zext i32 %voffset to i64
1760 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
1761 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1762 %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
1763 %cast.rtn = bitcast i64 %rtn to <2 x float>
1764 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw or` with syncscope("agent") and seq_cst ordering
; whose result is never used. The address is %sbase plus the zero-extended
; %voffset. The checks expect the 64-bit no-return form (_x2 on gfx9/gfx10,
; _b64 on gfx11/gfx12, v[1:2] data pair, no glc / TH_ATOMIC_RETURN), then a
; wait, a cache invalidate and s_endpgm.
1767 define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1768 ; GFX9-LABEL: global_or_saddr_i64_nortn:
1770 ; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3]
1771 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1772 ; GFX9-NEXT: buffer_wbinvl1
1773 ; GFX9-NEXT: s_endpgm
1775 ; GFX10-LABEL: global_or_saddr_i64_nortn:
1777 ; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3]
1778 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1779 ; GFX10-NEXT: buffer_gl1_inv
1780 ; GFX10-NEXT: buffer_gl0_inv
1781 ; GFX10-NEXT: s_endpgm
1783 ; GFX11-LABEL: global_or_saddr_i64_nortn:
1785 ; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3]
1786 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1787 ; GFX11-NEXT: buffer_gl1_inv
1788 ; GFX11-NEXT: buffer_gl0_inv
1789 ; GFX11-NEXT: s_endpgm
1791 ; GFX12-LABEL: global_or_saddr_i64_nortn:
1793 ; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
1794 ; GFX12-NEXT: s_wait_storecnt 0x0
1795 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1796 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1797 %zext.offset = zext i32 %voffset to i64
1798 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read, which is what makes this the no-return variant.
1799 %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; saddr test: 64-bit `atomicrmw or` with syncscope("agent") and seq_cst ordering
; whose result is never used, addressed at %sbase + zext(%voffset) - 128. The
; checks expect the constant as the immediate `offset:-128` on the 64-bit
; no-return form, followed by a wait, a cache invalidate and s_endpgm.
1803 define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1804 ; GFX9-LABEL: global_or_saddr_i64_nortn_neg128:
1806 ; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128
1807 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1808 ; GFX9-NEXT: buffer_wbinvl1
1809 ; GFX9-NEXT: s_endpgm
1811 ; GFX10-LABEL: global_or_saddr_i64_nortn_neg128:
1813 ; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128
1814 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1815 ; GFX10-NEXT: buffer_gl1_inv
1816 ; GFX10-NEXT: buffer_gl0_inv
1817 ; GFX10-NEXT: s_endpgm
1819 ; GFX11-LABEL: global_or_saddr_i64_nortn_neg128:
1821 ; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128
1822 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1823 ; GFX11-NEXT: buffer_gl1_inv
1824 ; GFX11-NEXT: buffer_gl0_inv
1825 ; GFX11-NEXT: s_endpgm
1827 ; GFX12-LABEL: global_or_saddr_i64_nortn_neg128:
1829 ; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
1830 ; GFX12-NEXT: s_wait_storecnt 0x0
1831 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1832 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1833 %zext.offset = zext i32 %voffset to i64
1834 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
1835 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read, which is what makes this the no-return variant.
1836 %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
1840 ; --------------------------------------------------------------------------------
1842 ; --------------------------------------------------------------------------------
; saddr test: 32-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase plus the zero-extended %voffset. The checks expect the
; returning form (glc before gfx12, th:TH_ATOMIC_RETURN on gfx12) with the
; result in v0, then a wait and a cache invalidate before the shader epilog.
1844 define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1845 ; GFX9-LABEL: global_xor_saddr_i32_rtn:
1847 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc
1848 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1849 ; GFX9-NEXT: buffer_wbinvl1
1850 ; GFX9-NEXT: ; return to shader part epilog
1852 ; GFX10-LABEL: global_xor_saddr_i32_rtn:
1854 ; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc
1855 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1856 ; GFX10-NEXT: buffer_gl1_inv
1857 ; GFX10-NEXT: buffer_gl0_inv
1858 ; GFX10-NEXT: ; return to shader part epilog
1860 ; GFX11-LABEL: global_xor_saddr_i32_rtn:
1862 ; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] glc
1863 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1864 ; GFX11-NEXT: buffer_gl1_inv
1865 ; GFX11-NEXT: buffer_gl0_inv
1866 ; GFX11-NEXT: ; return to shader part epilog
1868 ; GFX12-LABEL: global_xor_saddr_i32_rtn:
1870 ; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1871 ; GFX12-NEXT: s_wait_loadcnt 0x0
1872 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1873 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1874 %zext.offset = zext i32 %voffset to i64
1875 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1876 %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
1877 %cast.rtn = bitcast i32 %rtn to float
; saddr test: 32-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase + zext(%voffset) - 128, and the checks expect the constant
; as the immediate `offset:-128` on the returning form (glc before gfx12,
; th:TH_ATOMIC_RETURN on gfx12).
1881 define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1882 ; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128:
1884 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc
1885 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1886 ; GFX9-NEXT: buffer_wbinvl1
1887 ; GFX9-NEXT: ; return to shader part epilog
1889 ; GFX10-LABEL: global_xor_saddr_i32_rtn_neg128:
1891 ; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc
1892 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1893 ; GFX10-NEXT: buffer_gl1_inv
1894 ; GFX10-NEXT: buffer_gl0_inv
1895 ; GFX10-NEXT: ; return to shader part epilog
1897 ; GFX11-LABEL: global_xor_saddr_i32_rtn_neg128:
1899 ; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 glc
1900 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1901 ; GFX11-NEXT: buffer_gl1_inv
1902 ; GFX11-NEXT: buffer_gl0_inv
1903 ; GFX11-NEXT: ; return to shader part epilog
1905 ; GFX12-LABEL: global_xor_saddr_i32_rtn_neg128:
1907 ; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1908 ; GFX12-NEXT: s_wait_loadcnt 0x0
1909 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1910 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1911 %zext.offset = zext i32 %voffset to i64
1912 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
1913 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1914 %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
1915 %cast.rtn = bitcast i32 %rtn to float
; saddr test: 32-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is never used. The address is %sbase plus the
; zero-extended %voffset, with no constant displacement. Every check prefix
; expects the no-return encoding (no glc, no TH_ATOMIC_RETURN), a wait followed
; by a cache invalidate, and then s_endpgm.
1919 define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1920 ; GFX9-LABEL: global_xor_saddr_i32_nortn:
1922 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3]
1923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1924 ; GFX9-NEXT: buffer_wbinvl1
1925 ; GFX9-NEXT: s_endpgm
1927 ; GFX10-LABEL: global_xor_saddr_i32_nortn:
1929 ; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3]
1930 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1931 ; GFX10-NEXT: buffer_gl1_inv
1932 ; GFX10-NEXT: buffer_gl0_inv
1933 ; GFX10-NEXT: s_endpgm
1935 ; GFX11-LABEL: global_xor_saddr_i32_nortn:
1937 ; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3]
1938 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1939 ; GFX11-NEXT: buffer_gl1_inv
1940 ; GFX11-NEXT: buffer_gl0_inv
1941 ; GFX11-NEXT: s_endpgm
1943 ; GFX12-LABEL: global_xor_saddr_i32_nortn:
1945 ; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV
1946 ; GFX12-NEXT: s_wait_storecnt 0x0
1947 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1948 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1949 %zext.offset = zext i32 %voffset to i64
1950 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read, which is what makes this the no-return variant.
1951 %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
; saddr test: 32-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is never used, addressed at
; %sbase + zext(%voffset) - 128. Every check prefix expects the constant as the
; immediate `offset:-128` on the no-return encoding, followed by a wait, a
; cache invalidate and s_endpgm.
1955 define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1956 ; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128:
1958 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128
1959 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1960 ; GFX9-NEXT: buffer_wbinvl1
1961 ; GFX9-NEXT: s_endpgm
1963 ; GFX10-LABEL: global_xor_saddr_i32_nortn_neg128:
1965 ; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128
1966 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1967 ; GFX10-NEXT: buffer_gl1_inv
1968 ; GFX10-NEXT: buffer_gl0_inv
1969 ; GFX10-NEXT: s_endpgm
1971 ; GFX11-LABEL: global_xor_saddr_i32_nortn_neg128:
1973 ; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128
1974 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1975 ; GFX11-NEXT: buffer_gl1_inv
1976 ; GFX11-NEXT: buffer_gl0_inv
1977 ; GFX11-NEXT: s_endpgm
1979 ; GFX12-LABEL: global_xor_saddr_i32_nortn_neg128:
1981 ; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
1982 ; GFX12-NEXT: s_wait_storecnt 0x0
1983 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1984 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
1985 %zext.offset = zext i32 %voffset to i64
1986 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
1987 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read, which is what makes this the no-return variant.
1988 %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
; saddr test: 64-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is returned as <2 x float>. The address is %sbase plus
; the zero-extended %voffset. The checks expect the 64-bit mnemonic (_x2 on
; gfx9/gfx10, _b64 on gfx11/gfx12) with a v[1:2] data pair and a v[0:1] result,
; in the returning form: glc before gfx12, th:TH_ATOMIC_RETURN on gfx12.
1992 define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1993 ; GFX9-LABEL: global_xor_saddr_i64_rtn:
1995 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc
1996 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1997 ; GFX9-NEXT: buffer_wbinvl1
1998 ; GFX9-NEXT: ; return to shader part epilog
2000 ; GFX10-LABEL: global_xor_saddr_i64_rtn:
2002 ; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc
2003 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2004 ; GFX10-NEXT: buffer_gl1_inv
2005 ; GFX10-NEXT: buffer_gl0_inv
2006 ; GFX10-NEXT: ; return to shader part epilog
2008 ; GFX11-LABEL: global_xor_saddr_i64_rtn:
2010 ; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] glc
2011 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2012 ; GFX11-NEXT: buffer_gl1_inv
2013 ; GFX11-NEXT: buffer_gl0_inv
2014 ; GFX11-NEXT: ; return to shader part epilog
2016 ; GFX12-LABEL: global_xor_saddr_i64_rtn:
2018 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2019 ; GFX12-NEXT: s_wait_loadcnt 0x0
2020 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2021 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2022 %zext.offset = zext i32 %voffset to i64
2023 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2024 %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
2025 %cast.rtn = bitcast i64 %rtn to <2 x float>
2026 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is returned as <2 x float>, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the returning 64-bit form (glc before gfx12,
; th:TH_ATOMIC_RETURN on gfx12).
2029 define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2030 ; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128:
2032 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2033 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2034 ; GFX9-NEXT: buffer_wbinvl1
2035 ; GFX9-NEXT: ; return to shader part epilog
2037 ; GFX10-LABEL: global_xor_saddr_i64_rtn_neg128:
2039 ; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2040 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2041 ; GFX10-NEXT: buffer_gl1_inv
2042 ; GFX10-NEXT: buffer_gl0_inv
2043 ; GFX10-NEXT: ; return to shader part epilog
2045 ; GFX11-LABEL: global_xor_saddr_i64_rtn_neg128:
2047 ; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2048 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2049 ; GFX11-NEXT: buffer_gl1_inv
2050 ; GFX11-NEXT: buffer_gl0_inv
2051 ; GFX11-NEXT: ; return to shader part epilog
2053 ; GFX12-LABEL: global_xor_saddr_i64_rtn_neg128:
2055 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
2056 ; GFX12-NEXT: s_wait_loadcnt 0x0
2057 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2058 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2059 %zext.offset = zext i32 %voffset to i64
2060 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2061 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2062 %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
2063 %cast.rtn = bitcast i64 %rtn to <2 x float>
2064 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is never used. The address is %sbase plus the
; zero-extended %voffset. The checks expect the 64-bit no-return form (_x2 on
; gfx9/gfx10, _b64 on gfx11/gfx12, v[1:2] data pair, no glc /
; TH_ATOMIC_RETURN), then a wait, a cache invalidate and s_endpgm.
2067 define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2068 ; GFX9-LABEL: global_xor_saddr_i64_nortn:
2070 ; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3]
2071 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2072 ; GFX9-NEXT: buffer_wbinvl1
2073 ; GFX9-NEXT: s_endpgm
2075 ; GFX10-LABEL: global_xor_saddr_i64_nortn:
2077 ; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3]
2078 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2079 ; GFX10-NEXT: buffer_gl1_inv
2080 ; GFX10-NEXT: buffer_gl0_inv
2081 ; GFX10-NEXT: s_endpgm
2083 ; GFX11-LABEL: global_xor_saddr_i64_nortn:
2085 ; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3]
2086 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2087 ; GFX11-NEXT: buffer_gl1_inv
2088 ; GFX11-NEXT: buffer_gl0_inv
2089 ; GFX11-NEXT: s_endpgm
2091 ; GFX12-LABEL: global_xor_saddr_i64_nortn:
2093 ; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
2094 ; GFX12-NEXT: s_wait_storecnt 0x0
2095 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2096 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2097 %zext.offset = zext i32 %voffset to i64
2098 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read, which is what makes this the no-return variant.
2099 %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
; saddr test: 64-bit `atomicrmw xor` with syncscope("agent") and seq_cst
; ordering whose result is never used, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the 64-bit no-return form, followed by a wait, a cache
; invalidate and s_endpgm.
2103 define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2104 ; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128:
2106 ; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128
2107 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2108 ; GFX9-NEXT: buffer_wbinvl1
2109 ; GFX9-NEXT: s_endpgm
2111 ; GFX10-LABEL: global_xor_saddr_i64_nortn_neg128:
2113 ; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128
2114 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2115 ; GFX10-NEXT: buffer_gl1_inv
2116 ; GFX10-NEXT: buffer_gl0_inv
2117 ; GFX10-NEXT: s_endpgm
2119 ; GFX11-LABEL: global_xor_saddr_i64_nortn_neg128:
2121 ; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128
2122 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2123 ; GFX11-NEXT: buffer_gl1_inv
2124 ; GFX11-NEXT: buffer_gl0_inv
2125 ; GFX11-NEXT: s_endpgm
2127 ; GFX12-LABEL: global_xor_saddr_i64_nortn_neg128:
2129 ; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
2130 ; GFX12-NEXT: s_wait_storecnt 0x0
2131 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2132 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2133 %zext.offset = zext i32 %voffset to i64
2134 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2135 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read, which is what makes this the no-return variant.
2136 %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
2140 ; --------------------------------------------------------------------------------
2142 ; --------------------------------------------------------------------------------
; saddr test: 32-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase plus the zero-extended %voffset. The checks expect the
; signed-max mnemonic (smax on gfx9/gfx10, max_i32 on gfx11/gfx12) in the
; returning form. At workgroup scope the gfx9 checks have no cache invalidate
; after the wait, gfx10/gfx11 have only buffer_gl0_inv, and gfx12 uses
; scope:SCOPE_SE.
2144 define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2145 ; GFX9-LABEL: global_max_saddr_i32_rtn:
2147 ; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
2148 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2149 ; GFX9-NEXT: ; return to shader part epilog
2151 ; GFX10-LABEL: global_max_saddr_i32_rtn:
2153 ; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
2154 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2155 ; GFX10-NEXT: buffer_gl0_inv
2156 ; GFX10-NEXT: ; return to shader part epilog
2158 ; GFX11-LABEL: global_max_saddr_i32_rtn:
2160 ; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] glc
2161 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2162 ; GFX11-NEXT: buffer_gl0_inv
2163 ; GFX11-NEXT: ; return to shader part epilog
2165 ; GFX12-LABEL: global_max_saddr_i32_rtn:
2167 ; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2168 ; GFX12-NEXT: s_wait_loadcnt 0x0
2169 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2170 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2171 %zext.offset = zext i32 %voffset to i64
2172 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2173 %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
2174 %cast.rtn = bitcast i32 %rtn to float
; saddr test: 32-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase + zext(%voffset) - 128, and the checks expect the constant
; as the immediate `offset:-128` on the returning signed-max instruction. At
; workgroup scope the gfx9 checks have no cache invalidate after the wait,
; gfx10/gfx11 have only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2178 define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2179 ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
2181 ; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
2182 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2183 ; GFX9-NEXT: ; return to shader part epilog
2185 ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
2187 ; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
2188 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2189 ; GFX10-NEXT: buffer_gl0_inv
2190 ; GFX10-NEXT: ; return to shader part epilog
2192 ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
2194 ; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc
2195 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2196 ; GFX11-NEXT: buffer_gl0_inv
2197 ; GFX11-NEXT: ; return to shader part epilog
2199 ; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
2201 ; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2202 ; GFX12-NEXT: s_wait_loadcnt 0x0
2203 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2204 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2205 %zext.offset = zext i32 %voffset to i64
2206 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2207 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2208 %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
2209 %cast.rtn = bitcast i32 %rtn to float
; saddr test: 32-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is never used. The address is %sbase plus the
; zero-extended %voffset. The checks expect the signed-max mnemonic in its
; no-return form. At workgroup scope the gfx9 checks go straight from the
; atomic to s_endpgm, gfx10/gfx11 wait and then issue only buffer_gl0_inv, and
; gfx12 uses scope:SCOPE_SE for both the atomic and the invalidate.
2213 define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2214 ; GFX9-LABEL: global_max_saddr_i32_nortn:
2216 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3]
2217 ; GFX9-NEXT: s_endpgm
2219 ; GFX10-LABEL: global_max_saddr_i32_nortn:
2221 ; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3]
2222 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2223 ; GFX10-NEXT: buffer_gl0_inv
2224 ; GFX10-NEXT: s_endpgm
2226 ; GFX11-LABEL: global_max_saddr_i32_nortn:
2228 ; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3]
2229 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2230 ; GFX11-NEXT: buffer_gl0_inv
2231 ; GFX11-NEXT: s_endpgm
2233 ; GFX12-LABEL: global_max_saddr_i32_nortn:
2235 ; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE
2236 ; GFX12-NEXT: s_wait_storecnt 0x0
2237 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2238 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2239 %zext.offset = zext i32 %voffset to i64
2240 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read, which is what makes this the no-return variant.
2241 %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; saddr test: 32-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is never used, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the no-return signed-max instruction. At workgroup scope the
; gfx9 checks go straight from the atomic to s_endpgm, gfx10/gfx11 wait and
; then issue only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2245 define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2246 ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
2248 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
2249 ; GFX9-NEXT: s_endpgm
2251 ; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
2253 ; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
2254 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2255 ; GFX10-NEXT: buffer_gl0_inv
2256 ; GFX10-NEXT: s_endpgm
2258 ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
2260 ; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128
2261 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2262 ; GFX11-NEXT: buffer_gl0_inv
2263 ; GFX11-NEXT: s_endpgm
2265 ; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
2267 ; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
2268 ; GFX12-NEXT: s_wait_storecnt 0x0
2269 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2270 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2271 %zext.offset = zext i32 %voffset to i64
2272 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2273 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read, which is what makes this the no-return variant.
2274 %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
; saddr test: 64-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is returned as <2 x float>. The address is %sbase plus
; the zero-extended %voffset. The checks expect the 64-bit signed-max mnemonic
; (smax_x2 on gfx9/gfx10, max_i64 on gfx11/gfx12) in the returning form with a
; v[1:2] data pair and a v[0:1] result. At workgroup scope the gfx9 checks have
; no cache invalidate after the wait, gfx10/gfx11 have only buffer_gl0_inv, and
; gfx12 uses scope:SCOPE_SE.
2278 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2279 ; GFX9-LABEL: global_max_saddr_i64_rtn:
2281 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2283 ; GFX9-NEXT: ; return to shader part epilog
2285 ; GFX10-LABEL: global_max_saddr_i64_rtn:
2287 ; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2288 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2289 ; GFX10-NEXT: buffer_gl0_inv
2290 ; GFX10-NEXT: ; return to shader part epilog
2292 ; GFX11-LABEL: global_max_saddr_i64_rtn:
2294 ; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc
2295 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2296 ; GFX11-NEXT: buffer_gl0_inv
2297 ; GFX11-NEXT: ; return to shader part epilog
2299 ; GFX12-LABEL: global_max_saddr_i64_rtn:
2301 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2302 ; GFX12-NEXT: s_wait_loadcnt 0x0
2303 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2304 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2305 %zext.offset = zext i32 %voffset to i64
2306 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2307 %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
2308 %cast.rtn = bitcast i64 %rtn to <2 x float>
2309 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is returned as <2 x float>, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the returning 64-bit signed-max instruction. At workgroup
; scope the gfx9 checks have no cache invalidate after the wait, gfx10/gfx11
; have only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2312 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2313 ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
2315 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2316 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2317 ; GFX9-NEXT: ; return to shader part epilog
2319 ; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
2321 ; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2322 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2323 ; GFX10-NEXT: buffer_gl0_inv
2324 ; GFX10-NEXT: ; return to shader part epilog
2326 ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
2328 ; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2329 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2330 ; GFX11-NEXT: buffer_gl0_inv
2331 ; GFX11-NEXT: ; return to shader part epilog
2333 ; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
2335 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2336 ; GFX12-NEXT: s_wait_loadcnt 0x0
2337 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2338 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2339 %zext.offset = zext i32 %voffset to i64
2340 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2341 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2342 %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
; Returning the bitcast result keeps %rtn live, so the returning form is required.
2343 %cast.rtn = bitcast i64 %rtn to <2 x float>
2344 ret <2 x float> %cast.rtn
; saddr test: 64-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is never used. The address is %sbase plus the
; zero-extended %voffset. The checks expect the 64-bit signed-max mnemonic in
; its no-return form with a v[1:2] data pair. At workgroup scope the gfx9
; checks go straight from the atomic to s_endpgm, gfx10/gfx11 wait and then
; issue only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2347 define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2348 ; GFX9-LABEL: global_max_saddr_i64_nortn:
2350 ; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
2351 ; GFX9-NEXT: s_endpgm
2353 ; GFX10-LABEL: global_max_saddr_i64_nortn:
2355 ; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
2356 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2357 ; GFX10-NEXT: buffer_gl0_inv
2358 ; GFX10-NEXT: s_endpgm
2360 ; GFX11-LABEL: global_max_saddr_i64_nortn:
2362 ; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3]
2363 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2364 ; GFX11-NEXT: buffer_gl0_inv
2365 ; GFX11-NEXT: s_endpgm
2367 ; GFX12-LABEL: global_max_saddr_i64_nortn:
2369 ; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
2370 ; GFX12-NEXT: s_wait_storecnt 0x0
2371 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2372 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2373 %zext.offset = zext i32 %voffset to i64
2374 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; %unused is never read, which is what makes this the no-return variant.
2375 %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
; saddr test: 64-bit `atomicrmw max` with syncscope("workgroup") and seq_cst
; ordering whose result is never used, addressed at
; %sbase + zext(%voffset) - 128. The checks expect the constant as the immediate
; `offset:-128` on the no-return 64-bit signed-max instruction. At workgroup
; scope the gfx9 checks go straight from the atomic to s_endpgm, gfx10/gfx11
; wait and then issue only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2379 define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2380 ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
2382 ; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
2383 ; GFX9-NEXT: s_endpgm
2385 ; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
2387 ; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
2388 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2389 ; GFX10-NEXT: buffer_gl0_inv
2390 ; GFX10-NEXT: s_endpgm
2392 ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
2394 ; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128
2395 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2396 ; GFX11-NEXT: buffer_gl0_inv
2397 ; GFX11-NEXT: s_endpgm
2399 ; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
2401 ; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
2402 ; GFX12-NEXT: s_wait_storecnt 0x0
2403 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2404 ; GFX12-NEXT: s_endpgm
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2405 %zext.offset = zext i32 %voffset to i64
2406 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2407 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
; %unused is never read, which is what makes this the no-return variant.
2408 %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
2412 ; --------------------------------------------------------------------------------
2414 ; --------------------------------------------------------------------------------
; saddr test: 32-bit `atomicrmw min` with syncscope("workgroup") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase plus the zero-extended %voffset. The checks expect the
; signed-min mnemonic (smin on gfx9/gfx10, min_i32 on gfx11/gfx12) in the
; returning form. At workgroup scope the gfx9 checks have no cache invalidate
; after the wait, gfx10/gfx11 have only buffer_gl0_inv, and gfx12 uses
; scope:SCOPE_SE.
2416 define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2417 ; GFX9-LABEL: global_min_saddr_i32_rtn:
2419 ; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
2420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2421 ; GFX9-NEXT: ; return to shader part epilog
2423 ; GFX10-LABEL: global_min_saddr_i32_rtn:
2425 ; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
2426 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2427 ; GFX10-NEXT: buffer_gl0_inv
2428 ; GFX10-NEXT: ; return to shader part epilog
2430 ; GFX11-LABEL: global_min_saddr_i32_rtn:
2432 ; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] glc
2433 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2434 ; GFX11-NEXT: buffer_gl0_inv
2435 ; GFX11-NEXT: ; return to shader part epilog
2437 ; GFX12-LABEL: global_min_saddr_i32_rtn:
2439 ; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2440 ; GFX12-NEXT: s_wait_loadcnt 0x0
2441 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2442 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2443 %zext.offset = zext i32 %voffset to i64
2444 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2445 %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
2446 %cast.rtn = bitcast i32 %rtn to float
; saddr test: 32-bit `atomicrmw min` with syncscope("workgroup") and seq_cst
; ordering whose result is bitcast to float, the declared return type. The
; address is %sbase + zext(%voffset) - 128, and the checks expect the constant
; as the immediate `offset:-128` on the returning signed-min instruction. At
; workgroup scope the gfx9 checks have no cache invalidate after the wait,
; gfx10/gfx11 have only buffer_gl0_inv, and gfx12 uses scope:SCOPE_SE.
2450 define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2451 ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
2453 ; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
2454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2455 ; GFX9-NEXT: ; return to shader part epilog
2457 ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
2459 ; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
2460 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2461 ; GFX10-NEXT: buffer_gl0_inv
2462 ; GFX10-NEXT: ; return to shader part epilog
2464 ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
2466 ; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc
2467 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2468 ; GFX11-NEXT: buffer_gl0_inv
2469 ; GFX11-NEXT: ; return to shader part epilog
2471 ; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
2473 ; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2474 ; GFX12-NEXT: s_wait_loadcnt 0x0
2475 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2476 ; GFX12-NEXT: ; return to shader part epilog
; The offset is widened with zext (not sext) and applied as a byte offset (i8 GEP).
2477 %zext.offset = zext i32 %voffset to i64
2478 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP adds the constant -128 bytes on top of the variable offset.
2479 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2480 %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
; The bitcast consumes %rtn, so the atomic's result is live here.
2481 %cast.rtn = bitcast i32 %rtn to float
; Signed 32-bit atomic min (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the no-return instruction
; form: no destination VGPR and no glc / TH_ATOMIC_RETURN modifier.
2485 define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2486 ; GFX9-LABEL: global_min_saddr_i32_nortn:
2488 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3]
2489 ; GFX9-NEXT: s_endpgm
2491 ; GFX10-LABEL: global_min_saddr_i32_nortn:
2493 ; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3]
2494 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2495 ; GFX10-NEXT: buffer_gl0_inv
2496 ; GFX10-NEXT: s_endpgm
2498 ; GFX11-LABEL: global_min_saddr_i32_nortn:
2500 ; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3]
2501 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2502 ; GFX11-NEXT: buffer_gl0_inv
2503 ; GFX11-NEXT: s_endpgm
2505 ; GFX12-LABEL: global_min_saddr_i32_nortn:
2507 ; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE
2508 ; GFX12-NEXT: s_wait_storecnt 0x0
2509 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2510 ; GFX12-NEXT: s_endpgm
2511 %zext.offset = zext i32 %voffset to i64
2512 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2513 %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; Signed 32-bit atomic min (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
2517 define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2518 ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
2520 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
2521 ; GFX9-NEXT: s_endpgm
2523 ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
2525 ; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
2526 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2527 ; GFX10-NEXT: buffer_gl0_inv
2528 ; GFX10-NEXT: s_endpgm
2530 ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
2532 ; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128
2533 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2534 ; GFX11-NEXT: buffer_gl0_inv
2535 ; GFX11-NEXT: s_endpgm
2537 ; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
2539 ; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
2540 ; GFX12-NEXT: s_wait_storecnt 0x0
2541 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2542 ; GFX12-NEXT: s_endpgm
2543 %zext.offset = zext i32 %voffset to i64
2544 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2545 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2546 %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
; Signed 64-bit atomic min (workgroup scope, seq_cst) whose old value is
; returned as <2 x float> via a bitcast. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit returning form:
; data in v[1:2], result in v[0:1], base in s[2:3].
2550 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2551 ; GFX9-LABEL: global_min_saddr_i64_rtn:
2553 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
2554 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2555 ; GFX9-NEXT: ; return to shader part epilog
2557 ; GFX10-LABEL: global_min_saddr_i64_rtn:
2559 ; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
2560 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2561 ; GFX10-NEXT: buffer_gl0_inv
2562 ; GFX10-NEXT: ; return to shader part epilog
2564 ; GFX11-LABEL: global_min_saddr_i64_rtn:
2566 ; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc
2567 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2568 ; GFX11-NEXT: buffer_gl0_inv
2569 ; GFX11-NEXT: ; return to shader part epilog
2571 ; GFX12-LABEL: global_min_saddr_i64_rtn:
2573 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2574 ; GFX12-NEXT: s_wait_loadcnt 0x0
2575 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2576 ; GFX12-NEXT: ; return to shader part epilog
2577 %zext.offset = zext i32 %voffset to i64
2578 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2579 %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
2580 %cast.rtn = bitcast i64 %rtn to <2 x float>
2581 ret <2 x float> %cast.rtn
; Signed 64-bit atomic min (workgroup scope, seq_cst) returning the old value
; as <2 x float>, with an extra constant -128 byte offset on top of the inreg
; base and zero-extended 32-bit offset. The checks expect the returning form
; with the constant carried in the instruction's offset field.
2584 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2585 ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
2587 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2588 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2589 ; GFX9-NEXT: ; return to shader part epilog
2591 ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
2593 ; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2594 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2595 ; GFX10-NEXT: buffer_gl0_inv
2596 ; GFX10-NEXT: ; return to shader part epilog
2598 ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
2600 ; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2601 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2602 ; GFX11-NEXT: buffer_gl0_inv
2603 ; GFX11-NEXT: ; return to shader part epilog
2605 ; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
2607 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2608 ; GFX12-NEXT: s_wait_loadcnt 0x0
2609 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2610 ; GFX12-NEXT: ; return to shader part epilog
2611 %zext.offset = zext i32 %voffset to i64
2612 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2613 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2614 %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
2615 %cast.rtn = bitcast i64 %rtn to <2 x float>
2616 ret <2 x float> %cast.rtn
; Signed 64-bit atomic min (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit no-return form:
; no destination register pair and no glc / TH_ATOMIC_RETURN modifier.
2619 define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2620 ; GFX9-LABEL: global_min_saddr_i64_nortn:
2622 ; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
2623 ; GFX9-NEXT: s_endpgm
2625 ; GFX10-LABEL: global_min_saddr_i64_nortn:
2627 ; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
2628 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2629 ; GFX10-NEXT: buffer_gl0_inv
2630 ; GFX10-NEXT: s_endpgm
2632 ; GFX11-LABEL: global_min_saddr_i64_nortn:
2634 ; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3]
2635 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2636 ; GFX11-NEXT: buffer_gl0_inv
2637 ; GFX11-NEXT: s_endpgm
2639 ; GFX12-LABEL: global_min_saddr_i64_nortn:
2641 ; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE
2642 ; GFX12-NEXT: s_wait_storecnt 0x0
2643 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2644 ; GFX12-NEXT: s_endpgm
2645 %zext.offset = zext i32 %voffset to i64
2646 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2647 %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
; Signed 64-bit atomic min (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
2651 define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2652 ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
2654 ; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
2655 ; GFX9-NEXT: s_endpgm
2657 ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
2659 ; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
2660 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2661 ; GFX10-NEXT: buffer_gl0_inv
2662 ; GFX10-NEXT: s_endpgm
2664 ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
2666 ; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128
2667 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2668 ; GFX11-NEXT: buffer_gl0_inv
2669 ; GFX11-NEXT: s_endpgm
2671 ; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
2673 ; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
2674 ; GFX12-NEXT: s_wait_storecnt 0x0
2675 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2676 ; GFX12-NEXT: s_endpgm
2677 %zext.offset = zext i32 %voffset to i64
2678 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2679 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2680 %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
2684 ; --------------------------------------------------------------------------------
2686 ; --------------------------------------------------------------------------------
; Unsigned 32-bit atomic max (workgroup scope, seq_cst) whose old value is
; used: it is bitcast to float, the function's return type. The address is the
; inreg base plus a zero-extended 32-bit offset. The checks expect the
; returning form (glc, or TH_ATOMIC_RETURN on gfx12) with s[2:3] as the base.
2688 define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2689 ; GFX9-LABEL: global_umax_saddr_i32_rtn:
2691 ; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
2692 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2693 ; GFX9-NEXT: ; return to shader part epilog
2695 ; GFX10-LABEL: global_umax_saddr_i32_rtn:
2697 ; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
2698 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2699 ; GFX10-NEXT: buffer_gl0_inv
2700 ; GFX10-NEXT: ; return to shader part epilog
2702 ; GFX11-LABEL: global_umax_saddr_i32_rtn:
2704 ; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] glc
2705 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2706 ; GFX11-NEXT: buffer_gl0_inv
2707 ; GFX11-NEXT: ; return to shader part epilog
2709 ; GFX12-LABEL: global_umax_saddr_i32_rtn:
2711 ; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2712 ; GFX12-NEXT: s_wait_loadcnt 0x0
2713 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2714 ; GFX12-NEXT: ; return to shader part epilog
2715 %zext.offset = zext i32 %voffset to i64
2716 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2717 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
2718 %cast.rtn = bitcast i32 %rtn to float
; Unsigned 32-bit atomic max (workgroup scope, seq_cst) whose old value is
; bitcast to float, with an extra constant -128 byte offset on top of the
; inreg base and zero-extended 32-bit offset. The checks expect the returning
; form with the constant carried in the instruction's offset field.
2722 define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2723 ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
2725 ; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
2726 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2727 ; GFX9-NEXT: ; return to shader part epilog
2729 ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
2731 ; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
2732 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2733 ; GFX10-NEXT: buffer_gl0_inv
2734 ; GFX10-NEXT: ; return to shader part epilog
2736 ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
2738 ; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc
2739 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2740 ; GFX11-NEXT: buffer_gl0_inv
2741 ; GFX11-NEXT: ; return to shader part epilog
2743 ; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
2745 ; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2746 ; GFX12-NEXT: s_wait_loadcnt 0x0
2747 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2748 ; GFX12-NEXT: ; return to shader part epilog
2749 %zext.offset = zext i32 %voffset to i64
2750 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2751 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2752 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
2753 %cast.rtn = bitcast i32 %rtn to float
; Unsigned 32-bit atomic max (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the no-return instruction
; form: no destination VGPR and no glc / TH_ATOMIC_RETURN modifier.
2757 define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2758 ; GFX9-LABEL: global_umax_saddr_i32_nortn:
2760 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3]
2761 ; GFX9-NEXT: s_endpgm
2763 ; GFX10-LABEL: global_umax_saddr_i32_nortn:
2765 ; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3]
2766 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2767 ; GFX10-NEXT: buffer_gl0_inv
2768 ; GFX10-NEXT: s_endpgm
2770 ; GFX11-LABEL: global_umax_saddr_i32_nortn:
2772 ; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3]
2773 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2774 ; GFX11-NEXT: buffer_gl0_inv
2775 ; GFX11-NEXT: s_endpgm
2777 ; GFX12-LABEL: global_umax_saddr_i32_nortn:
2779 ; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE
2780 ; GFX12-NEXT: s_wait_storecnt 0x0
2781 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2782 ; GFX12-NEXT: s_endpgm
2783 %zext.offset = zext i32 %voffset to i64
2784 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2785 %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; Unsigned 32-bit atomic max (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
2789 define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2790 ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
2792 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
2793 ; GFX9-NEXT: s_endpgm
2795 ; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
2797 ; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
2798 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2799 ; GFX10-NEXT: buffer_gl0_inv
2800 ; GFX10-NEXT: s_endpgm
2802 ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
2804 ; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128
2805 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2806 ; GFX11-NEXT: buffer_gl0_inv
2807 ; GFX11-NEXT: s_endpgm
2809 ; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
2811 ; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
2812 ; GFX12-NEXT: s_wait_storecnt 0x0
2813 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2814 ; GFX12-NEXT: s_endpgm
2815 %zext.offset = zext i32 %voffset to i64
2816 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2817 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2818 %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
; Unsigned 64-bit atomic max (workgroup scope, seq_cst) whose old value is
; returned as <2 x float> via a bitcast. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit returning form:
; data in v[1:2], result in v[0:1], base in s[2:3].
2822 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2823 ; GFX9-LABEL: global_umax_saddr_i64_rtn:
2825 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2827 ; GFX9-NEXT: ; return to shader part epilog
2829 ; GFX10-LABEL: global_umax_saddr_i64_rtn:
2831 ; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2832 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2833 ; GFX10-NEXT: buffer_gl0_inv
2834 ; GFX10-NEXT: ; return to shader part epilog
2836 ; GFX11-LABEL: global_umax_saddr_i64_rtn:
2838 ; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc
2839 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2840 ; GFX11-NEXT: buffer_gl0_inv
2841 ; GFX11-NEXT: ; return to shader part epilog
2843 ; GFX12-LABEL: global_umax_saddr_i64_rtn:
2845 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2846 ; GFX12-NEXT: s_wait_loadcnt 0x0
2847 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2848 ; GFX12-NEXT: ; return to shader part epilog
2849 %zext.offset = zext i32 %voffset to i64
2850 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2851 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
2852 %cast.rtn = bitcast i64 %rtn to <2 x float>
2853 ret <2 x float> %cast.rtn
; Unsigned 64-bit atomic max (workgroup scope, seq_cst) returning the old
; value as <2 x float>, with an extra constant -128 byte offset on top of the
; inreg base and zero-extended 32-bit offset. The checks expect the returning
; form with the constant carried in the instruction's offset field.
2856 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2857 ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
2859 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2860 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2861 ; GFX9-NEXT: ; return to shader part epilog
2863 ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
2865 ; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2866 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2867 ; GFX10-NEXT: buffer_gl0_inv
2868 ; GFX10-NEXT: ; return to shader part epilog
2870 ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
2872 ; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2873 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2874 ; GFX11-NEXT: buffer_gl0_inv
2875 ; GFX11-NEXT: ; return to shader part epilog
2877 ; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
2879 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2880 ; GFX12-NEXT: s_wait_loadcnt 0x0
2881 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2882 ; GFX12-NEXT: ; return to shader part epilog
2883 %zext.offset = zext i32 %voffset to i64
2884 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2885 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2886 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
2887 %cast.rtn = bitcast i64 %rtn to <2 x float>
2888 ret <2 x float> %cast.rtn
; Unsigned 64-bit atomic max (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit no-return form:
; no destination register pair and no glc / TH_ATOMIC_RETURN modifier.
2891 define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2892 ; GFX9-LABEL: global_umax_saddr_i64_nortn:
2894 ; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
2895 ; GFX9-NEXT: s_endpgm
2897 ; GFX10-LABEL: global_umax_saddr_i64_nortn:
2899 ; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
2900 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2901 ; GFX10-NEXT: buffer_gl0_inv
2902 ; GFX10-NEXT: s_endpgm
2904 ; GFX11-LABEL: global_umax_saddr_i64_nortn:
2906 ; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3]
2907 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2908 ; GFX11-NEXT: buffer_gl0_inv
2909 ; GFX11-NEXT: s_endpgm
2911 ; GFX12-LABEL: global_umax_saddr_i64_nortn:
2913 ; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
2914 ; GFX12-NEXT: s_wait_storecnt 0x0
2915 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2916 ; GFX12-NEXT: s_endpgm
2917 %zext.offset = zext i32 %voffset to i64
2918 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2919 %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
; Unsigned 64-bit atomic max (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
2923 define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2924 ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
2926 ; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
2927 ; GFX9-NEXT: s_endpgm
2929 ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
2931 ; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
2932 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2933 ; GFX10-NEXT: buffer_gl0_inv
2934 ; GFX10-NEXT: s_endpgm
2936 ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
2938 ; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128
2939 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2940 ; GFX11-NEXT: buffer_gl0_inv
2941 ; GFX11-NEXT: s_endpgm
2943 ; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
2945 ; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
2946 ; GFX12-NEXT: s_wait_storecnt 0x0
2947 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2948 ; GFX12-NEXT: s_endpgm
2949 %zext.offset = zext i32 %voffset to i64
2950 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2951 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2952 %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
2956 ; --------------------------------------------------------------------------------
2958 ; --------------------------------------------------------------------------------
; Unsigned 32-bit atomic min (workgroup scope, seq_cst) whose old value is
; used: it is bitcast to float, the function's return type. The address is the
; inreg base plus a zero-extended 32-bit offset. The checks expect the
; returning form (glc, or TH_ATOMIC_RETURN on gfx12) with s[2:3] as the base.
2960 define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2961 ; GFX9-LABEL: global_umin_saddr_i32_rtn:
2963 ; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
2964 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2965 ; GFX9-NEXT: ; return to shader part epilog
2967 ; GFX10-LABEL: global_umin_saddr_i32_rtn:
2969 ; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
2970 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2971 ; GFX10-NEXT: buffer_gl0_inv
2972 ; GFX10-NEXT: ; return to shader part epilog
2974 ; GFX11-LABEL: global_umin_saddr_i32_rtn:
2976 ; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] glc
2977 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2978 ; GFX11-NEXT: buffer_gl0_inv
2979 ; GFX11-NEXT: ; return to shader part epilog
2981 ; GFX12-LABEL: global_umin_saddr_i32_rtn:
2983 ; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
2984 ; GFX12-NEXT: s_wait_loadcnt 0x0
2985 ; GFX12-NEXT: global_inv scope:SCOPE_SE
2986 ; GFX12-NEXT: ; return to shader part epilog
2987 %zext.offset = zext i32 %voffset to i64
2988 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2989 %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
2990 %cast.rtn = bitcast i32 %rtn to float
; Unsigned 32-bit atomic min (workgroup scope, seq_cst) whose old value is
; bitcast to float, with an extra constant -128 byte offset on top of the
; inreg base and zero-extended 32-bit offset. The checks expect the returning
; form with the constant carried in the instruction's offset field.
2994 define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2995 ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
2997 ; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
2998 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2999 ; GFX9-NEXT: ; return to shader part epilog
3001 ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
3003 ; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
3004 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3005 ; GFX10-NEXT: buffer_gl0_inv
3006 ; GFX10-NEXT: ; return to shader part epilog
3008 ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
3010 ; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc
3011 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3012 ; GFX11-NEXT: buffer_gl0_inv
3013 ; GFX11-NEXT: ; return to shader part epilog
3015 ; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
3017 ; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
3018 ; GFX12-NEXT: s_wait_loadcnt 0x0
3019 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3020 ; GFX12-NEXT: ; return to shader part epilog
3021 %zext.offset = zext i32 %voffset to i64
3022 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3023 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3024 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
3025 %cast.rtn = bitcast i32 %rtn to float
; Unsigned 32-bit atomic min (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the no-return instruction
; form: no destination VGPR and no glc / TH_ATOMIC_RETURN modifier.
3029 define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3030 ; GFX9-LABEL: global_umin_saddr_i32_nortn:
3032 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3]
3033 ; GFX9-NEXT: s_endpgm
3035 ; GFX10-LABEL: global_umin_saddr_i32_nortn:
3037 ; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3]
3038 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3039 ; GFX10-NEXT: buffer_gl0_inv
3040 ; GFX10-NEXT: s_endpgm
3042 ; GFX11-LABEL: global_umin_saddr_i32_nortn:
3044 ; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3]
3045 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3046 ; GFX11-NEXT: buffer_gl0_inv
3047 ; GFX11-NEXT: s_endpgm
3049 ; GFX12-LABEL: global_umin_saddr_i32_nortn:
3051 ; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE
3052 ; GFX12-NEXT: s_wait_storecnt 0x0
3053 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3054 ; GFX12-NEXT: s_endpgm
3055 %zext.offset = zext i32 %voffset to i64
3056 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3057 %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
; Unsigned 32-bit atomic min (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
3061 define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3062 ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
3064 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
3065 ; GFX9-NEXT: s_endpgm
3067 ; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
3069 ; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
3070 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3071 ; GFX10-NEXT: buffer_gl0_inv
3072 ; GFX10-NEXT: s_endpgm
3074 ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
3076 ; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128
3077 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3078 ; GFX11-NEXT: buffer_gl0_inv
3079 ; GFX11-NEXT: s_endpgm
3081 ; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
3083 ; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
3084 ; GFX12-NEXT: s_wait_storecnt 0x0
3085 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3086 ; GFX12-NEXT: s_endpgm
3087 %zext.offset = zext i32 %voffset to i64
3088 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3089 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3090 %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
; Unsigned 64-bit atomic min (workgroup scope, seq_cst) whose old value is
; returned as <2 x float> via a bitcast. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit returning form:
; data in v[1:2], result in v[0:1], base in s[2:3].
3094 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3095 ; GFX9-LABEL: global_umin_saddr_i64_rtn:
3097 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
3098 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3099 ; GFX9-NEXT: ; return to shader part epilog
3101 ; GFX10-LABEL: global_umin_saddr_i64_rtn:
3103 ; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
3104 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3105 ; GFX10-NEXT: buffer_gl0_inv
3106 ; GFX10-NEXT: ; return to shader part epilog
3108 ; GFX11-LABEL: global_umin_saddr_i64_rtn:
3110 ; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc
3111 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3112 ; GFX11-NEXT: buffer_gl0_inv
3113 ; GFX11-NEXT: ; return to shader part epilog
3115 ; GFX12-LABEL: global_umin_saddr_i64_rtn:
3117 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
3118 ; GFX12-NEXT: s_wait_loadcnt 0x0
3119 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3120 ; GFX12-NEXT: ; return to shader part epilog
3121 %zext.offset = zext i32 %voffset to i64
3122 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3123 %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
3124 %cast.rtn = bitcast i64 %rtn to <2 x float>
3125 ret <2 x float> %cast.rtn
; Unsigned 64-bit atomic min (workgroup scope, seq_cst) returning the old
; value as <2 x float>, with an extra constant -128 byte offset on top of the
; inreg base and zero-extended 32-bit offset. The checks expect the returning
; form with the constant carried in the instruction's offset field.
3128 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3129 ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
3131 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3132 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3133 ; GFX9-NEXT: ; return to shader part epilog
3135 ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
3137 ; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3138 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3139 ; GFX10-NEXT: buffer_gl0_inv
3140 ; GFX10-NEXT: ; return to shader part epilog
3142 ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
3144 ; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3145 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3146 ; GFX11-NEXT: buffer_gl0_inv
3147 ; GFX11-NEXT: ; return to shader part epilog
3149 ; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
3151 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE
3152 ; GFX12-NEXT: s_wait_loadcnt 0x0
3153 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3154 ; GFX12-NEXT: ; return to shader part epilog
3155 %zext.offset = zext i32 %voffset to i64
3156 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3157 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3158 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
3159 %cast.rtn = bitcast i64 %rtn to <2 x float>
3160 ret <2 x float> %cast.rtn
; Unsigned 64-bit atomic min (workgroup scope, seq_cst) whose result, %unused,
; is not read in the visible body. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect the 64-bit no-return form:
; no destination register pair and no glc / TH_ATOMIC_RETURN modifier.
3163 define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3164 ; GFX9-LABEL: global_umin_saddr_i64_nortn:
3166 ; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
3167 ; GFX9-NEXT: s_endpgm
3169 ; GFX10-LABEL: global_umin_saddr_i64_nortn:
3171 ; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
3172 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3173 ; GFX10-NEXT: buffer_gl0_inv
3174 ; GFX10-NEXT: s_endpgm
3176 ; GFX11-LABEL: global_umin_saddr_i64_nortn:
3178 ; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3]
3179 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3180 ; GFX11-NEXT: buffer_gl0_inv
3181 ; GFX11-NEXT: s_endpgm
3183 ; GFX12-LABEL: global_umin_saddr_i64_nortn:
3185 ; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE
3186 ; GFX12-NEXT: s_wait_storecnt 0x0
3187 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3188 ; GFX12-NEXT: s_endpgm
3189 %zext.offset = zext i32 %voffset to i64
3190 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3191 %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
; Unsigned 64-bit atomic min (workgroup scope, seq_cst) with an unread result
; (%unused) and an extra constant -128 byte offset on top of the inreg base
; and zero-extended 32-bit offset. The checks expect the no-return form with
; the constant carried in the instruction's offset field.
3195 define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3196 ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
3198 ; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
3199 ; GFX9-NEXT: s_endpgm
3201 ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
3203 ; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
3204 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3205 ; GFX10-NEXT: buffer_gl0_inv
3206 ; GFX10-NEXT: s_endpgm
3208 ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
3210 ; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128
3211 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3212 ; GFX11-NEXT: buffer_gl0_inv
3213 ; GFX11-NEXT: s_endpgm
3215 ; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
3217 ; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE
3218 ; GFX12-NEXT: s_wait_storecnt 0x0
3219 ; GFX12-NEXT: global_inv scope:SCOPE_SE
3220 ; GFX12-NEXT: s_endpgm
3221 %zext.offset = zext i32 %voffset to i64
3222 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3223 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3224 %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
3228 ; --------------------------------------------------------------------------------
3230 ; --------------------------------------------------------------------------------
; 32-bit cmpxchg with seq_cst success and failure orderings and no explicit
; syncscope; the loaded value (field 0 of the result pair) is bitcast to
; float, the function's return type. The address is the inreg base plus a
; zero-extended 32-bit offset. The checks expect a v_mov_b32 copying v1 into
; v3 so that the 64-bit operand v[2:3] can feed the returning cmpswap, plus
; wider cache maintenance than the workgroup-scope tests in this file
; (buffer_wbinvl1 on gfx9, gl1+gl0 invalidates on gfx10/gfx11, and
; global_wb / global_inv at SCOPE_SYS on gfx12).
3232 define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
3233 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn:
3235 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
3236 ; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc
3237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3238 ; GFX9-NEXT: buffer_wbinvl1
3239 ; GFX9-NEXT: ; return to shader part epilog
3241 ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn:
3243 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
3244 ; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc
3245 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3246 ; GFX10-NEXT: buffer_gl1_inv
3247 ; GFX10-NEXT: buffer_gl0_inv
3248 ; GFX10-NEXT: ; return to shader part epilog
3250 ; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn:
3252 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
3253 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] glc
3254 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3255 ; GFX11-NEXT: buffer_gl1_inv
3256 ; GFX11-NEXT: buffer_gl0_inv
3257 ; GFX11-NEXT: ; return to shader part epilog
3259 ; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn:
3261 ; GFX12-NEXT: v_mov_b32_e32 v3, v1
3262 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3263 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3264 ; GFX12-NEXT: s_wait_loadcnt 0x0
3265 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3266 ; GFX12-NEXT: ; return to shader part epilog
3267 %zext.offset = zext i32 %voffset to i64
3268 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3269 %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst
3270 %rtn = extractvalue { i32, i1 } %cmpxchg, 0
3271 %cast.rtn = bitcast i32 %rtn to float
; 32-bit cmpxchg with seq_cst success and failure orderings and no explicit
; syncscope, whose loaded value is bitcast to float. The address adds a
; constant -128 bytes on top of the inreg base and zero-extended 32-bit
; offset; the checks expect that constant in the cmpswap's offset field,
; together with the v1 -> v3 copy that completes the v[2:3] operand and the
; SCOPE_SYS write-back / invalidate pair on gfx12.
3275 define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
3276 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
3278 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
3279 ; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc
3280 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3281 ; GFX9-NEXT: buffer_wbinvl1
3282 ; GFX9-NEXT: ; return to shader part epilog
3284 ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
3286 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
3287 ; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc
3288 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3289 ; GFX10-NEXT: buffer_gl1_inv
3290 ; GFX10-NEXT: buffer_gl0_inv
3291 ; GFX10-NEXT: ; return to shader part epilog
3293 ; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
3295 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
3296 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 glc
3297 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3298 ; GFX11-NEXT: buffer_gl1_inv
3299 ; GFX11-NEXT: buffer_gl0_inv
3300 ; GFX11-NEXT: ; return to shader part epilog
3302 ; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
3304 ; GFX12-NEXT: v_mov_b32_e32 v3, v1
3305 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3306 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3307 ; GFX12-NEXT: s_wait_loadcnt 0x0
3308 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3309 ; GFX12-NEXT: ; return to shader part epilog
3310 %zext.offset = zext i32 %voffset to i64
3311 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3312 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3313 %cmpxchg = cmpxchg ptr addrspace(1) %gep1, i32 %cmp, i32 %data seq_cst seq_cst
3314 %rtn = extractvalue { i32, i1 } %cmpxchg, 0
3315 %cast.rtn = bitcast i32 %rtn to float
; i32 cmpxchg (seq_cst/seq_cst, no explicit syncscope) at %sbase + zext(%voffset)
; whose result is never used. The checks expect the non-returning encoding (no glc,
; no TH_ATOMIC_RETURN); GFX10/GFX11 wait on vscnt and GFX12 on storecnt, while
; GFX9 still waits on vmcnt.
3319 define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
3320 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn:
3322 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
3323 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3]
3324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3325 ; GFX9-NEXT: buffer_wbinvl1
3326 ; GFX9-NEXT: s_endpgm
3328 ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn:
3330 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
3331 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3]
3332 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3333 ; GFX10-NEXT: buffer_gl1_inv
3334 ; GFX10-NEXT: buffer_gl0_inv
3335 ; GFX10-NEXT: s_endpgm
3337 ; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn:
3339 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
3340 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3]
3341 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3342 ; GFX11-NEXT: buffer_gl1_inv
3343 ; GFX11-NEXT: buffer_gl0_inv
3344 ; GFX11-NEXT: s_endpgm
3346 ; GFX12-LABEL: global_cmpxchg_saddr_i32_nortn:
3348 ; GFX12-NEXT: v_mov_b32_e32 v3, v1
3349 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3350 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS
3351 ; GFX12-NEXT: s_wait_storecnt 0x0
3352 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3353 ; GFX12-NEXT: s_endpgm
3354 %zext.offset = zext i32 %voffset to i64
3355 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3356 %unused = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst
; Result-unused i32 cmpxchg (seq_cst/seq_cst) at %sbase + zext(%voffset) - 128.
; The checks expect the non-returning encoding with the constant -128 carried in
; the instruction's offset field on every target.
3360 define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
3361 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
3363 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
3364 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128
3365 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3366 ; GFX9-NEXT: buffer_wbinvl1
3367 ; GFX9-NEXT: s_endpgm
3369 ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
3371 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
3372 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128
3373 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3374 ; GFX10-NEXT: buffer_gl1_inv
3375 ; GFX10-NEXT: buffer_gl0_inv
3376 ; GFX10-NEXT: s_endpgm
3378 ; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
3380 ; GFX11-NEXT: v_mov_b32_e32 v3, v1
3381 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128
3382 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3383 ; GFX11-NEXT: buffer_gl1_inv
3384 ; GFX11-NEXT: buffer_gl0_inv
3385 ; GFX11-NEXT: s_endpgm
3387 ; GFX12-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
3389 ; GFX12-NEXT: v_mov_b32_e32 v3, v1
3390 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3391 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS
3392 ; GFX12-NEXT: s_wait_storecnt 0x0
3393 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3394 ; GFX12-NEXT: s_endpgm
3395 %zext.offset = zext i32 %voffset to i64
3396 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3397 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3398 %unused = cmpxchg ptr addrspace(1) %gep1, i32 %cmp, i32 %data seq_cst seq_cst
; i64 cmpxchg (seq_cst/seq_cst, no explicit syncscope) at %sbase + zext(%voffset);
; the loaded i64 is returned as <2 x float>. The checks expect two v_mov copies
; (v2 -> v6, v1 -> v5) so that v[3:6] forms the 128-bit data operand of the
; 64-bit cmpswap, and the result to be written to v[0:1].
3402 define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
3403 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn:
3405 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
3406 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
3407 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc
3408 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3409 ; GFX9-NEXT: buffer_wbinvl1
3410 ; GFX9-NEXT: ; return to shader part epilog
3412 ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn:
3414 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
3415 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
3416 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc
3417 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3418 ; GFX10-NEXT: buffer_gl1_inv
3419 ; GFX10-NEXT: buffer_gl0_inv
3420 ; GFX10-NEXT: ; return to shader part epilog
3422 ; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn:
3424 ; GFX11-NEXT: v_mov_b32_e32 v6, v2
3425 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
3426 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] glc
3427 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3428 ; GFX11-NEXT: buffer_gl1_inv
3429 ; GFX11-NEXT: buffer_gl0_inv
3430 ; GFX11-NEXT: ; return to shader part epilog
3432 ; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn:
3434 ; GFX12-NEXT: v_mov_b32_e32 v6, v2
3435 ; GFX12-NEXT: v_mov_b32_e32 v5, v1
3436 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3437 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3438 ; GFX12-NEXT: s_wait_loadcnt 0x0
3439 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3440 ; GFX12-NEXT: ; return to shader part epilog
3441 %zext.offset = zext i32 %voffset to i64
3442 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3443 %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst
3444 %rtn = extractvalue { i64, i1 } %cmpxchg, 0
3445 %cast.rtn = bitcast i64 %rtn to <2 x float>
3446 ret <2 x float> %cast.rtn
; i64 cmpxchg (seq_cst/seq_cst) at %sbase + zext(%voffset) - 128, returning the
; loaded i64 as <2 x float>. The checks expect the returning 64-bit cmpswap with
; the -128 carried in the instruction's offset field on every target.
3449 define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
3450 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
3452 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
3453 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
3454 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
3455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3456 ; GFX9-NEXT: buffer_wbinvl1
3457 ; GFX9-NEXT: ; return to shader part epilog
3459 ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
3461 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
3462 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
3463 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
3464 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3465 ; GFX10-NEXT: buffer_gl1_inv
3466 ; GFX10-NEXT: buffer_gl0_inv
3467 ; GFX10-NEXT: ; return to shader part epilog
3469 ; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
3471 ; GFX11-NEXT: v_mov_b32_e32 v6, v2
3472 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
3473 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
3474 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3475 ; GFX11-NEXT: buffer_gl1_inv
3476 ; GFX11-NEXT: buffer_gl0_inv
3477 ; GFX11-NEXT: ; return to shader part epilog
3479 ; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
3481 ; GFX12-NEXT: v_mov_b32_e32 v6, v2
3482 ; GFX12-NEXT: v_mov_b32_e32 v5, v1
3483 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3484 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3485 ; GFX12-NEXT: s_wait_loadcnt 0x0
3486 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3487 ; GFX12-NEXT: ; return to shader part epilog
3488 %zext.offset = zext i32 %voffset to i64
3489 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3490 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3491 %cmpxchg = cmpxchg ptr addrspace(1) %gep1, i64 %cmp, i64 %data seq_cst seq_cst
3492 %rtn = extractvalue { i64, i1 } %cmpxchg, 0
3493 %cast.rtn = bitcast i64 %rtn to <2 x float>
3494 ret <2 x float> %cast.rtn
; Result-unused i64 cmpxchg (seq_cst/seq_cst, no explicit syncscope) at
; %sbase + zext(%voffset). The checks expect the non-returning 64-bit cmpswap
; (no destination register, no glc / TH_ATOMIC_RETURN) with v[3:6] as data.
3497 define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
3498 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn:
3500 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
3501 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
3502 ; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3]
3503 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3504 ; GFX9-NEXT: buffer_wbinvl1
3505 ; GFX9-NEXT: s_endpgm
3507 ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn:
3509 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
3510 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
3511 ; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3]
3512 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3513 ; GFX10-NEXT: buffer_gl1_inv
3514 ; GFX10-NEXT: buffer_gl0_inv
3515 ; GFX10-NEXT: s_endpgm
3517 ; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn:
3519 ; GFX11-NEXT: v_mov_b32_e32 v6, v2
3520 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
3521 ; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3]
3522 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3523 ; GFX11-NEXT: buffer_gl1_inv
3524 ; GFX11-NEXT: buffer_gl0_inv
3525 ; GFX11-NEXT: s_endpgm
3527 ; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn:
3529 ; GFX12-NEXT: v_mov_b32_e32 v6, v2
3530 ; GFX12-NEXT: v_mov_b32_e32 v5, v1
3531 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3532 ; GFX12-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] scope:SCOPE_SYS
3533 ; GFX12-NEXT: s_wait_storecnt 0x0
3534 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3535 ; GFX12-NEXT: s_endpgm
3536 %zext.offset = zext i32 %voffset to i64
3537 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3538 %unused = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst
; Result-unused i64 cmpxchg (seq_cst/seq_cst) at %sbase + zext(%voffset) - 128.
; The checks expect the non-returning 64-bit cmpswap with the -128 carried in
; the instruction's offset field on every target.
3542 define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
3543 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
3545 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
3546 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
3547 ; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128
3548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3549 ; GFX9-NEXT: buffer_wbinvl1
3550 ; GFX9-NEXT: s_endpgm
3552 ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
3554 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
3555 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
3556 ; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128
3557 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3558 ; GFX10-NEXT: buffer_gl1_inv
3559 ; GFX10-NEXT: buffer_gl0_inv
3560 ; GFX10-NEXT: s_endpgm
3562 ; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
3564 ; GFX11-NEXT: v_mov_b32_e32 v6, v2
3565 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
3566 ; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] offset:-128
3567 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3568 ; GFX11-NEXT: buffer_gl1_inv
3569 ; GFX11-NEXT: buffer_gl0_inv
3570 ; GFX11-NEXT: s_endpgm
3572 ; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
3574 ; GFX12-NEXT: v_mov_b32_e32 v6, v2
3575 ; GFX12-NEXT: v_mov_b32_e32 v5, v1
3576 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3577 ; GFX12-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] offset:-128 scope:SCOPE_SYS
3578 ; GFX12-NEXT: s_wait_storecnt 0x0
3579 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3580 ; GFX12-NEXT: s_endpgm
3581 %zext.offset = zext i32 %voffset to i64
3582 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3583 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3584 %unused = cmpxchg ptr addrspace(1) %gep1, i64 %cmp, i64 %data seq_cst seq_cst
3588 ; --------------------------------------------------------------------------------
3590 ; --------------------------------------------------------------------------------
; i32 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset); the old value is bitcast to float. The checks expect a
; returning global_atomic_inc (glc / th:TH_ATOMIC_RETURN) followed only by a wait
; on the load counter. GCN is the prefix shared by the gfx900 and gfx1010 RUN lines.
3592 define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3593 ; GCN-LABEL: global_inc_saddr_i32_rtn:
3595 ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] glc
3596 ; GCN-NEXT: s_waitcnt vmcnt(0)
3597 ; GCN-NEXT: ; return to shader part epilog
3599 ; GFX11-LABEL: global_inc_saddr_i32_rtn:
3601 ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] glc
3602 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3603 ; GFX11-NEXT: ; return to shader part epilog
3605 ; GFX12-LABEL: global_inc_saddr_i32_rtn:
3607 ; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3608 ; GFX12-NEXT: s_wait_loadcnt 0x0
3609 ; GFX12-NEXT: ; return to shader part epilog
3610 %zext.offset = zext i32 %voffset to i64
3611 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3612 %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
3613 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128; the old value is bitcast to float. The checks
; expect the returning inc with the -128 in the instruction's offset field.
3617 define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3618 ; GCN-LABEL: global_inc_saddr_i32_rtn_neg128:
3620 ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc
3621 ; GCN-NEXT: s_waitcnt vmcnt(0)
3622 ; GCN-NEXT: ; return to shader part epilog
3624 ; GFX11-LABEL: global_inc_saddr_i32_rtn_neg128:
3626 ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 glc
3627 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3628 ; GFX11-NEXT: ; return to shader part epilog
3630 ; GFX12-LABEL: global_inc_saddr_i32_rtn_neg128:
3632 ; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3633 ; GFX12-NEXT: s_wait_loadcnt 0x0
3634 ; GFX12-NEXT: ; return to shader part epilog
3635 %zext.offset = zext i32 %voffset to i64
3636 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3637 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3638 %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
3639 %cast.rtn = bitcast i32 %rtn to float
; Result-unused i32 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset). The checks expect the non-returning inc immediately
; followed by s_endpgm, with no wait or cache-invalidate instruction in between.
3643 define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3644 ; GCN-LABEL: global_inc_saddr_i32_nortn:
3646 ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3]
3647 ; GCN-NEXT: s_endpgm
3649 ; GFX11-LABEL: global_inc_saddr_i32_nortn:
3651 ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3]
3652 ; GFX11-NEXT: s_endpgm
3654 ; GFX12-LABEL: global_inc_saddr_i32_nortn:
3656 ; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV
3657 ; GFX12-NEXT: s_endpgm
3658 %zext.offset = zext i32 %voffset to i64
3659 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3660 %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
; Result-unused i32 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128. The checks expect the non-returning inc with
; offset:-128, immediately followed by s_endpgm.
3664 define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3665 ; GCN-LABEL: global_inc_saddr_i32_nortn_neg128:
3667 ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-128
3668 ; GCN-NEXT: s_endpgm
3670 ; GFX11-LABEL: global_inc_saddr_i32_nortn_neg128:
3672 ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128
3673 ; GFX11-NEXT: s_endpgm
3675 ; GFX12-LABEL: global_inc_saddr_i32_nortn_neg128:
3677 ; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
3678 ; GFX12-NEXT: s_endpgm
3679 %zext.offset = zext i32 %voffset to i64
3680 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3681 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3682 %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
; i64 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset), returning the old value as <2 x float>. The checks
; expect the returning 64-bit inc taking v[1:2] as data and writing v[0:1].
3686 define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3687 ; GCN-LABEL: global_inc_saddr_i64_rtn:
3689 ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc
3690 ; GCN-NEXT: s_waitcnt vmcnt(0)
3691 ; GCN-NEXT: ; return to shader part epilog
3693 ; GFX11-LABEL: global_inc_saddr_i64_rtn:
3695 ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] glc
3696 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3697 ; GFX11-NEXT: ; return to shader part epilog
3699 ; GFX12-LABEL: global_inc_saddr_i64_rtn:
3701 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3702 ; GFX12-NEXT: s_wait_loadcnt 0x0
3703 ; GFX12-NEXT: ; return to shader part epilog
3704 %zext.offset = zext i32 %voffset to i64
3705 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3706 %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
3707 %cast.rtn = bitcast i64 %rtn to <2 x float>
3708 ret <2 x float> %cast.rtn
; i64 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128, returning the old value as <2 x float>. The
; checks expect the returning 64-bit inc with offset:-128.
3711 define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3712 ; GCN-LABEL: global_inc_saddr_i64_rtn_neg128:
3714 ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3715 ; GCN-NEXT: s_waitcnt vmcnt(0)
3716 ; GCN-NEXT: ; return to shader part epilog
3718 ; GFX11-LABEL: global_inc_saddr_i64_rtn_neg128:
3720 ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3721 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3722 ; GFX11-NEXT: ; return to shader part epilog
3724 ; GFX12-LABEL: global_inc_saddr_i64_rtn_neg128:
3726 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3727 ; GFX12-NEXT: s_wait_loadcnt 0x0
3728 ; GFX12-NEXT: ; return to shader part epilog
3729 %zext.offset = zext i32 %voffset to i64
3730 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3731 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3732 %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
3733 %cast.rtn = bitcast i64 %rtn to <2 x float>
3734 ret <2 x float> %cast.rtn
; Result-unused i64 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset). The checks expect the non-returning 64-bit inc
; immediately followed by s_endpgm.
3737 define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3738 ; GCN-LABEL: global_inc_saddr_i64_nortn:
3740 ; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3]
3741 ; GCN-NEXT: s_endpgm
3743 ; GFX11-LABEL: global_inc_saddr_i64_nortn:
3745 ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3]
3746 ; GFX11-NEXT: s_endpgm
3748 ; GFX12-LABEL: global_inc_saddr_i64_nortn:
3750 ; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
3751 ; GFX12-NEXT: s_endpgm
3752 %zext.offset = zext i32 %voffset to i64
3753 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3754 %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
; Result-unused i64 atomicrmw uinc_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128. The checks expect the non-returning 64-bit inc
; with offset:-128, immediately followed by s_endpgm.
3758 define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3759 ; GCN-LABEL: global_inc_saddr_i64_nortn_neg128:
3761 ; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3] offset:-128
3762 ; GCN-NEXT: s_endpgm
3764 ; GFX11-LABEL: global_inc_saddr_i64_nortn_neg128:
3766 ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128
3767 ; GFX11-NEXT: s_endpgm
3769 ; GFX12-LABEL: global_inc_saddr_i64_nortn_neg128:
3771 ; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
3772 ; GFX12-NEXT: s_endpgm
3773 %zext.offset = zext i32 %voffset to i64
3774 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3775 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3776 %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
3780 ; --------------------------------------------------------------------------------
3782 ; --------------------------------------------------------------------------------
; i32 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset); the old value is bitcast to float. The checks expect a
; returning global_atomic_dec (glc / th:TH_ATOMIC_RETURN) followed only by a wait
; on the load counter.
3785 define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3786 ; GCN-LABEL: global_dec_saddr_i32_rtn:
3788 ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] glc
3789 ; GCN-NEXT: s_waitcnt vmcnt(0)
3790 ; GCN-NEXT: ; return to shader part epilog
3792 ; GFX11-LABEL: global_dec_saddr_i32_rtn:
3794 ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] glc
3795 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3796 ; GFX11-NEXT: ; return to shader part epilog
3798 ; GFX12-LABEL: global_dec_saddr_i32_rtn:
3800 ; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3801 ; GFX12-NEXT: s_wait_loadcnt 0x0
3802 ; GFX12-NEXT: ; return to shader part epilog
3803 %zext.offset = zext i32 %voffset to i64
3804 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3805 %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
3806 %cast.rtn = bitcast i32 %rtn to float
; i32 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128; the old value is bitcast to float. The checks
; expect the returning dec with the -128 in the instruction's offset field.
3810 define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3811 ; GCN-LABEL: global_dec_saddr_i32_rtn_neg128:
3813 ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc
3814 ; GCN-NEXT: s_waitcnt vmcnt(0)
3815 ; GCN-NEXT: ; return to shader part epilog
3817 ; GFX11-LABEL: global_dec_saddr_i32_rtn_neg128:
3819 ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 glc
3820 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3821 ; GFX11-NEXT: ; return to shader part epilog
3823 ; GFX12-LABEL: global_dec_saddr_i32_rtn_neg128:
3825 ; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3826 ; GFX12-NEXT: s_wait_loadcnt 0x0
3827 ; GFX12-NEXT: ; return to shader part epilog
3828 %zext.offset = zext i32 %voffset to i64
3829 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3830 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3831 %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
3832 %cast.rtn = bitcast i32 %rtn to float
; Result-unused i32 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset). The checks expect the non-returning dec immediately
; followed by s_endpgm, with no wait or cache-invalidate instruction in between.
3836 define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3837 ; GCN-LABEL: global_dec_saddr_i32_nortn:
3839 ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3]
3840 ; GCN-NEXT: s_endpgm
3842 ; GFX11-LABEL: global_dec_saddr_i32_nortn:
3844 ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3]
3845 ; GFX11-NEXT: s_endpgm
3847 ; GFX12-LABEL: global_dec_saddr_i32_nortn:
3849 ; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV
3850 ; GFX12-NEXT: s_endpgm
3851 %zext.offset = zext i32 %voffset to i64
3852 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3853 %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
; Result-unused i32 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128. The checks expect the non-returning dec with
; offset:-128, immediately followed by s_endpgm.
3857 define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3858 ; GCN-LABEL: global_dec_saddr_i32_nortn_neg128:
3860 ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-128
3861 ; GCN-NEXT: s_endpgm
3863 ; GFX11-LABEL: global_dec_saddr_i32_nortn_neg128:
3865 ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128
3866 ; GFX11-NEXT: s_endpgm
3868 ; GFX12-LABEL: global_dec_saddr_i32_nortn_neg128:
3870 ; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
3871 ; GFX12-NEXT: s_endpgm
3872 %zext.offset = zext i32 %voffset to i64
3873 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3874 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3875 %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
; i64 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset), returning the old value as <2 x float>. The checks
; expect the returning 64-bit dec taking v[1:2] as data and writing v[0:1].
3879 define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3880 ; GCN-LABEL: global_dec_saddr_i64_rtn:
3882 ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc
3883 ; GCN-NEXT: s_waitcnt vmcnt(0)
3884 ; GCN-NEXT: ; return to shader part epilog
3886 ; GFX11-LABEL: global_dec_saddr_i64_rtn:
3888 ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] glc
3889 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3890 ; GFX11-NEXT: ; return to shader part epilog
3892 ; GFX12-LABEL: global_dec_saddr_i64_rtn:
3894 ; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3895 ; GFX12-NEXT: s_wait_loadcnt 0x0
3896 ; GFX12-NEXT: ; return to shader part epilog
3897 %zext.offset = zext i32 %voffset to i64
3898 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3899 %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
3900 %cast.rtn = bitcast i64 %rtn to <2 x float>
3901 ret <2 x float> %cast.rtn
; i64 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128, returning the old value as <2 x float>. The
; checks expect the returning 64-bit dec with offset:-128.
3904 define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3905 ; GCN-LABEL: global_dec_saddr_i64_rtn_neg128:
3907 ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3908 ; GCN-NEXT: s_waitcnt vmcnt(0)
3909 ; GCN-NEXT: ; return to shader part epilog
3911 ; GFX11-LABEL: global_dec_saddr_i64_rtn_neg128:
3913 ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
3914 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3915 ; GFX11-NEXT: ; return to shader part epilog
3917 ; GFX12-LABEL: global_dec_saddr_i64_rtn_neg128:
3919 ; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
3920 ; GFX12-NEXT: s_wait_loadcnt 0x0
3921 ; GFX12-NEXT: ; return to shader part epilog
3922 %zext.offset = zext i32 %voffset to i64
3923 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3924 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3925 %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
3926 %cast.rtn = bitcast i64 %rtn to <2 x float>
3927 ret <2 x float> %cast.rtn
; Result-unused i64 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset). The checks expect the non-returning 64-bit dec
; immediately followed by s_endpgm.
3930 define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3931 ; GCN-LABEL: global_dec_saddr_i64_nortn:
3933 ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3]
3934 ; GCN-NEXT: s_endpgm
3936 ; GFX11-LABEL: global_dec_saddr_i64_nortn:
3938 ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3]
3939 ; GFX11-NEXT: s_endpgm
3941 ; GFX12-LABEL: global_dec_saddr_i64_nortn:
3943 ; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV
3944 ; GFX12-NEXT: s_endpgm
3945 %zext.offset = zext i32 %voffset to i64
3946 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3947 %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
; Result-unused i64 atomicrmw udec_wrap (syncscope("agent"), monotonic) at
; %sbase + zext(%voffset) - 128. The checks expect the non-returning 64-bit dec
; with offset:-128, immediately followed by s_endpgm.
3951 define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3952 ; GCN-LABEL: global_dec_saddr_i64_nortn_neg128:
3954 ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] offset:-128
3955 ; GCN-NEXT: s_endpgm
3957 ; GFX11-LABEL: global_dec_saddr_i64_nortn_neg128:
3959 ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128
3960 ; GFX11-NEXT: s_endpgm
3962 ; GFX12-LABEL: global_dec_saddr_i64_nortn_neg128:
3964 ; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV
3965 ; GFX12-NEXT: s_endpgm
3966 %zext.offset = zext i32 %voffset to i64
3967 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3968 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3969 %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
; Attribute group #0.
; NOTE(review): none of the function definitions in this part of the file
; reference #0 - confirm it is still used elsewhere before keeping it.
; NOTE(review): `argmemonly` looks like the legacy spelling of what newer LLVM
; writes as memory(argmem: readwrite) - confirm against the LangRef in use.
3973 attributes #0 = { argmemonly nounwind willreturn }