1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
; Void function: seq_cst atomicrmw xchg of an i32 through a global
; (addrspace(1)) pointer with no offset. The expected code places the pointer
; in v[0:1] and the value in v2, and uses the swap form without glc in all
; three RUN configurations.
10 define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
11 ; SI-LABEL: global_atomic_xchg_i32_noret:
13 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; SI-NEXT: s_mov_b32 s6, 0
15 ; SI-NEXT: s_mov_b32 s7, 0xf000
16 ; SI-NEXT: s_mov_b32 s4, s6
17 ; SI-NEXT: s_mov_b32 s5, s6
18 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
19 ; SI-NEXT: s_waitcnt vmcnt(0)
20 ; SI-NEXT: buffer_wbinvl1
21 ; SI-NEXT: s_waitcnt expcnt(0)
22 ; SI-NEXT: s_setpc_b64 s[30:31]
24 ; VI-LABEL: global_atomic_xchg_i32_noret:
26 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27 ; VI-NEXT: flat_atomic_swap v[0:1], v2
28 ; VI-NEXT: s_waitcnt vmcnt(0)
29 ; VI-NEXT: buffer_wbinvl1_vol
30 ; VI-NEXT: s_setpc_b64 s[30:31]
32 ; GFX9-LABEL: global_atomic_xchg_i32_noret:
34 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off
36 ; GFX9-NEXT: s_waitcnt vmcnt(0)
37 ; GFX9-NEXT: buffer_wbinvl1_vol
38 ; GFX9-NEXT: s_setpc_b64 s[30:31]
39 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
; Void function: seq_cst atomicrmw xchg of an i32 at %out + 4 elements
; (16 bytes). The SI and GFX9 runs expect the 16-byte offset folded into the
; atomic instruction (offset:16); the VI run expects an explicit 64-bit add
; (v_add_u32 / v_addc_u32) before flat_atomic_swap.
43 define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
44 ; SI-LABEL: global_atomic_xchg_i32_noret_offset:
46 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47 ; SI-NEXT: s_mov_b32 s6, 0
48 ; SI-NEXT: s_mov_b32 s7, 0xf000
49 ; SI-NEXT: s_mov_b32 s4, s6
50 ; SI-NEXT: s_mov_b32 s5, s6
51 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
52 ; SI-NEXT: s_waitcnt vmcnt(0)
53 ; SI-NEXT: buffer_wbinvl1
54 ; SI-NEXT: s_waitcnt expcnt(0)
55 ; SI-NEXT: s_setpc_b64 s[30:31]
57 ; VI-LABEL: global_atomic_xchg_i32_noret_offset:
59 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
61 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
62 ; VI-NEXT: flat_atomic_swap v[0:1], v2
63 ; VI-NEXT: s_waitcnt vmcnt(0)
64 ; VI-NEXT: buffer_wbinvl1_vol
65 ; VI-NEXT: s_setpc_b64 s[30:31]
67 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset:
69 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
71 ; GFX9-NEXT: s_waitcnt vmcnt(0)
72 ; GFX9-NEXT: buffer_wbinvl1_vol
73 ; GFX9-NEXT: s_setpc_b64 s[30:31]
74 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
75 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
; i32-returning function: seq_cst atomicrmw xchg of an i32 through a global
; pointer with no offset. All three runs expect the glc form of the swap; the
; SI run receives the result in v2 and copies it to v0, while the VI and GFX9
; runs write the result directly to v0.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
79 define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
80 ; SI-LABEL: global_atomic_xchg_i32_ret:
82 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; SI-NEXT: s_mov_b32 s6, 0
84 ; SI-NEXT: s_mov_b32 s7, 0xf000
85 ; SI-NEXT: s_mov_b32 s4, s6
86 ; SI-NEXT: s_mov_b32 s5, s6
87 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
88 ; SI-NEXT: s_waitcnt vmcnt(0)
89 ; SI-NEXT: buffer_wbinvl1
90 ; SI-NEXT: v_mov_b32_e32 v0, v2
91 ; SI-NEXT: s_waitcnt expcnt(0)
92 ; SI-NEXT: s_setpc_b64 s[30:31]
94 ; VI-LABEL: global_atomic_xchg_i32_ret:
96 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
98 ; VI-NEXT: s_waitcnt vmcnt(0)
99 ; VI-NEXT: buffer_wbinvl1_vol
100 ; VI-NEXT: s_setpc_b64 s[30:31]
102 ; GFX9-LABEL: global_atomic_xchg_i32_ret:
104 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
107 ; GFX9-NEXT: buffer_wbinvl1_vol
108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
109 %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
; i32-returning function: seq_cst atomicrmw xchg of an i32 at %out + 4
; elements (16 bytes). The SI and GFX9 runs expect offset:16 folded into the
; glc swap; the VI run expects the address to be advanced by 16 with a 64-bit
; add before flat_atomic_swap ... glc.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
113 define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
114 ; SI-LABEL: global_atomic_xchg_i32_ret_offset:
116 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; SI-NEXT: s_mov_b32 s6, 0
118 ; SI-NEXT: s_mov_b32 s7, 0xf000
119 ; SI-NEXT: s_mov_b32 s4, s6
120 ; SI-NEXT: s_mov_b32 s5, s6
121 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
122 ; SI-NEXT: s_waitcnt vmcnt(0)
123 ; SI-NEXT: buffer_wbinvl1
124 ; SI-NEXT: v_mov_b32_e32 v0, v2
125 ; SI-NEXT: s_waitcnt expcnt(0)
126 ; SI-NEXT: s_setpc_b64 s[30:31]
128 ; VI-LABEL: global_atomic_xchg_i32_ret_offset:
130 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
132 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
133 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
134 ; VI-NEXT: s_waitcnt vmcnt(0)
135 ; VI-NEXT: buffer_wbinvl1_vol
136 ; VI-NEXT: s_setpc_b64 s[30:31]
138 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset:
140 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
143 ; GFX9-NEXT: buffer_wbinvl1_vol
144 ; GFX9-NEXT: s_setpc_b64 s[30:31]
145 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
146 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx void function with inreg arguments: seq_cst atomicrmw xchg of an
; i32 through a global pointer with no offset. The expected code reads the
; pointer from s[4:5] and the value from s6.
; - The SI run saves s6/s7 into lanes of v1 (v1 itself is spilled to and
;   reloaded from the stack), overwrites s6/s7 to complete the s[4:7] operand
;   of buffer_atomic_swap, then restores s6/s7 with v_readlane.
; - The VI run copies s4, s5 and s6 into VGPRs for flat_atomic_swap.
; - The GFX9 run uses s[4:5] as the scalar base with a zero VGPR offset.
150 define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
151 ; SI-LABEL: global_atomic_xchg_i32_noret_scalar:
153 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
155 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
156 ; SI-NEXT: s_mov_b64 exec, s[34:35]
157 ; SI-NEXT: s_waitcnt expcnt(0)
158 ; SI-NEXT: v_writelane_b32 v1, s6, 0
159 ; SI-NEXT: v_writelane_b32 v1, s7, 1
160 ; SI-NEXT: s_mov_b32 s34, s6
161 ; SI-NEXT: s_mov_b32 s7, 0xf000
162 ; SI-NEXT: s_mov_b32 s6, -1
163 ; SI-NEXT: v_mov_b32_e32 v0, s34
164 ; SI-NEXT: s_waitcnt vmcnt(0)
165 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
166 ; SI-NEXT: s_waitcnt vmcnt(0)
167 ; SI-NEXT: buffer_wbinvl1
168 ; SI-NEXT: v_readlane_b32 s7, v1, 1
169 ; SI-NEXT: v_readlane_b32 s6, v1, 0
170 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
171 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
172 ; SI-NEXT: s_mov_b64 exec, s[34:35]
173 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
174 ; SI-NEXT: s_setpc_b64 s[30:31]
176 ; VI-LABEL: global_atomic_xchg_i32_noret_scalar:
178 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; VI-NEXT: v_mov_b32_e32 v0, s4
180 ; VI-NEXT: v_mov_b32_e32 v1, s5
181 ; VI-NEXT: v_mov_b32_e32 v2, s6
182 ; VI-NEXT: flat_atomic_swap v[0:1], v2
183 ; VI-NEXT: s_waitcnt vmcnt(0)
184 ; VI-NEXT: buffer_wbinvl1_vol
185 ; VI-NEXT: s_setpc_b64 s[30:31]
187 ; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar:
189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
191 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
192 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5]
193 ; GFX9-NEXT: s_waitcnt vmcnt(0)
194 ; GFX9-NEXT: buffer_wbinvl1_vol
195 ; GFX9-NEXT: s_setpc_b64 s[30:31]
196 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx void function with inreg arguments: seq_cst atomicrmw xchg of an
; i32 at %out + 4 elements (16 bytes). The expected code reads the pointer
; from s[4:5] and the value from s6.
; - The SI and GFX9 runs fold the displacement into the atomic as offset:16.
; - The VI run computes the address with s_add_u32 / s_addc_u32 into s[34:35]
;   and copies it to VGPRs for flat_atomic_swap.
; - The SI run additionally saves and restores s6/s7 through lanes of v1
;   around the s[4:7] operand setup.
200 define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
201 ; SI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
203 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
205 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
206 ; SI-NEXT: s_mov_b64 exec, s[34:35]
207 ; SI-NEXT: s_waitcnt expcnt(0)
208 ; SI-NEXT: v_writelane_b32 v1, s6, 0
209 ; SI-NEXT: v_writelane_b32 v1, s7, 1
210 ; SI-NEXT: s_mov_b32 s34, s6
211 ; SI-NEXT: s_mov_b32 s7, 0xf000
212 ; SI-NEXT: s_mov_b32 s6, -1
213 ; SI-NEXT: v_mov_b32_e32 v0, s34
214 ; SI-NEXT: s_waitcnt vmcnt(0)
215 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
216 ; SI-NEXT: s_waitcnt vmcnt(0)
217 ; SI-NEXT: buffer_wbinvl1
218 ; SI-NEXT: v_readlane_b32 s7, v1, 1
219 ; SI-NEXT: v_readlane_b32 s6, v1, 0
220 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
221 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
222 ; SI-NEXT: s_mov_b64 exec, s[34:35]
223 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
224 ; SI-NEXT: s_setpc_b64 s[30:31]
226 ; VI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
228 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229 ; VI-NEXT: s_add_u32 s34, s4, 16
230 ; VI-NEXT: s_addc_u32 s35, s5, 0
231 ; VI-NEXT: v_mov_b32_e32 v0, s34
232 ; VI-NEXT: v_mov_b32_e32 v1, s35
233 ; VI-NEXT: v_mov_b32_e32 v2, s6
234 ; VI-NEXT: flat_atomic_swap v[0:1], v2
235 ; VI-NEXT: s_waitcnt vmcnt(0)
236 ; VI-NEXT: buffer_wbinvl1_vol
237 ; VI-NEXT: s_setpc_b64 s[30:31]
239 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
243 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
244 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16
245 ; GFX9-NEXT: s_waitcnt vmcnt(0)
246 ; GFX9-NEXT: buffer_wbinvl1_vol
247 ; GFX9-NEXT: s_setpc_b64 s[30:31]
248 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
249 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx i32-returning function with inreg arguments: seq_cst atomicrmw
; xchg of an i32 through a global pointer with no offset. The expected code
; reads the pointer from s[4:5] and the value from s6, and every run uses the
; glc form of the swap with v0 as the data/result register.
; The SI run also saves and restores s6/s7 through lanes of v1 around the
; s[4:7] operand setup.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
253 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
254 ; SI-LABEL: global_atomic_xchg_i32_ret_scalar:
256 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
258 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
259 ; SI-NEXT: s_mov_b64 exec, s[34:35]
260 ; SI-NEXT: s_waitcnt expcnt(0)
261 ; SI-NEXT: v_writelane_b32 v1, s6, 0
262 ; SI-NEXT: v_writelane_b32 v1, s7, 1
263 ; SI-NEXT: s_mov_b32 s34, s6
264 ; SI-NEXT: s_mov_b32 s7, 0xf000
265 ; SI-NEXT: s_mov_b32 s6, -1
266 ; SI-NEXT: v_mov_b32_e32 v0, s34
267 ; SI-NEXT: s_waitcnt vmcnt(0)
268 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
269 ; SI-NEXT: s_waitcnt vmcnt(0)
270 ; SI-NEXT: buffer_wbinvl1
271 ; SI-NEXT: v_readlane_b32 s7, v1, 1
272 ; SI-NEXT: v_readlane_b32 s6, v1, 0
273 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
274 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
275 ; SI-NEXT: s_mov_b64 exec, s[34:35]
276 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
277 ; SI-NEXT: s_setpc_b64 s[30:31]
279 ; VI-LABEL: global_atomic_xchg_i32_ret_scalar:
281 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282 ; VI-NEXT: v_mov_b32_e32 v0, s4
283 ; VI-NEXT: v_mov_b32_e32 v1, s5
284 ; VI-NEXT: v_mov_b32_e32 v2, s6
285 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
286 ; VI-NEXT: s_waitcnt vmcnt(0)
287 ; VI-NEXT: buffer_wbinvl1_vol
288 ; VI-NEXT: s_setpc_b64 s[30:31]
290 ; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar:
292 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
293 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
294 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
295 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc
296 ; GFX9-NEXT: s_waitcnt vmcnt(0)
297 ; GFX9-NEXT: buffer_wbinvl1_vol
298 ; GFX9-NEXT: s_setpc_b64 s[30:31]
299 %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx i32-returning function with inreg arguments: seq_cst atomicrmw
; xchg of an i32 at %out + 4 elements (16 bytes). The expected code reads the
; pointer from s[4:5] and the value from s6, and uses the glc swap with v0 as
; the result register.
; - The SI and GFX9 runs fold the displacement into the atomic as offset:16.
; - The VI run computes the address with s_add_u32 / s_addc_u32 first.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
303 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
304 ; SI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
306 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
307 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
308 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
309 ; SI-NEXT: s_mov_b64 exec, s[34:35]
310 ; SI-NEXT: s_waitcnt expcnt(0)
311 ; SI-NEXT: v_writelane_b32 v1, s6, 0
312 ; SI-NEXT: v_writelane_b32 v1, s7, 1
313 ; SI-NEXT: s_mov_b32 s34, s6
314 ; SI-NEXT: s_mov_b32 s7, 0xf000
315 ; SI-NEXT: s_mov_b32 s6, -1
316 ; SI-NEXT: v_mov_b32_e32 v0, s34
317 ; SI-NEXT: s_waitcnt vmcnt(0)
318 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
319 ; SI-NEXT: s_waitcnt vmcnt(0)
320 ; SI-NEXT: buffer_wbinvl1
321 ; SI-NEXT: v_readlane_b32 s7, v1, 1
322 ; SI-NEXT: v_readlane_b32 s6, v1, 0
323 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
324 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
325 ; SI-NEXT: s_mov_b64 exec, s[34:35]
326 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
327 ; SI-NEXT: s_setpc_b64 s[30:31]
329 ; VI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
331 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332 ; VI-NEXT: s_add_u32 s34, s4, 16
333 ; VI-NEXT: s_addc_u32 s35, s5, 0
334 ; VI-NEXT: v_mov_b32_e32 v0, s34
335 ; VI-NEXT: v_mov_b32_e32 v1, s35
336 ; VI-NEXT: v_mov_b32_e32 v2, s6
337 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
338 ; VI-NEXT: s_waitcnt vmcnt(0)
339 ; VI-NEXT: buffer_wbinvl1_vol
340 ; VI-NEXT: s_setpc_b64 s[30:31]
342 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
344 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
346 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
347 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
348 ; GFX9-NEXT: s_waitcnt vmcnt(0)
349 ; GFX9-NEXT: buffer_wbinvl1_vol
350 ; GFX9-NEXT: s_setpc_b64 s[30:31]
351 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
352 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
; Void function: seq_cst atomicrmw xchg of an i32 at %out + 4 elements
; (16 bytes), with !amdgpu.no.remote.memory metadata attached to the atomic.
; The expected code is still a single hardware swap without glc: offset:16 is
; folded in the SI and GFX9 runs, and the VI run adds 16 to the address first.
; The metadata node !0 is defined outside this view.
356 define void @global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
357 ; SI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
359 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360 ; SI-NEXT: s_mov_b32 s6, 0
361 ; SI-NEXT: s_mov_b32 s7, 0xf000
362 ; SI-NEXT: s_mov_b32 s4, s6
363 ; SI-NEXT: s_mov_b32 s5, s6
364 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
365 ; SI-NEXT: s_waitcnt vmcnt(0)
366 ; SI-NEXT: buffer_wbinvl1
367 ; SI-NEXT: s_waitcnt expcnt(0)
368 ; SI-NEXT: s_setpc_b64 s[30:31]
370 ; VI-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
372 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
374 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
375 ; VI-NEXT: flat_atomic_swap v[0:1], v2
376 ; VI-NEXT: s_waitcnt vmcnt(0)
377 ; VI-NEXT: buffer_wbinvl1_vol
378 ; VI-NEXT: s_setpc_b64 s[30:31]
380 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
382 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
384 ; GFX9-NEXT: s_waitcnt vmcnt(0)
385 ; GFX9-NEXT: buffer_wbinvl1_vol
386 ; GFX9-NEXT: s_setpc_b64 s[30:31]
387 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
388 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; i32-returning function: seq_cst atomicrmw xchg of an i32 at %out + 4
; elements (16 bytes), with !amdgpu.no.remote.memory metadata attached to the
; atomic. The expected code is a single glc swap: offset:16 is folded in the
; SI and GFX9 runs, and the VI run adds 16 to the address first.
; The metadata node !0 is defined outside this view.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
392 define i32 @global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
393 ; SI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
395 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
396 ; SI-NEXT: s_mov_b32 s6, 0
397 ; SI-NEXT: s_mov_b32 s7, 0xf000
398 ; SI-NEXT: s_mov_b32 s4, s6
399 ; SI-NEXT: s_mov_b32 s5, s6
400 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
401 ; SI-NEXT: s_waitcnt vmcnt(0)
402 ; SI-NEXT: buffer_wbinvl1
403 ; SI-NEXT: v_mov_b32_e32 v0, v2
404 ; SI-NEXT: s_waitcnt expcnt(0)
405 ; SI-NEXT: s_setpc_b64 s[30:31]
407 ; VI-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
409 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
411 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
412 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
413 ; VI-NEXT: s_waitcnt vmcnt(0)
414 ; VI-NEXT: buffer_wbinvl1_vol
415 ; VI-NEXT: s_setpc_b64 s[30:31]
417 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
419 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
422 ; GFX9-NEXT: buffer_wbinvl1_vol
423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
424 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
425 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
429 ; ---------------------------------------------------------------------
431 ; ---------------------------------------------------------------------
; Void function: seq_cst atomicrmw xchg of a float through a global
; (addrspace(1)) pointer with no offset. The expected code uses the same
; 32-bit swap instructions (buffer_atomic_swap / flat_atomic_swap /
; global_atomic_swap, no glc) that an integer exchange would use.
433 define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
434 ; SI-LABEL: global_atomic_xchg_f32_noret:
436 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437 ; SI-NEXT: s_mov_b32 s6, 0
438 ; SI-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NEXT: s_mov_b32 s4, s6
440 ; SI-NEXT: s_mov_b32 s5, s6
441 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
442 ; SI-NEXT: s_waitcnt vmcnt(0)
443 ; SI-NEXT: buffer_wbinvl1
444 ; SI-NEXT: s_waitcnt expcnt(0)
445 ; SI-NEXT: s_setpc_b64 s[30:31]
447 ; VI-LABEL: global_atomic_xchg_f32_noret:
449 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450 ; VI-NEXT: flat_atomic_swap v[0:1], v2
451 ; VI-NEXT: s_waitcnt vmcnt(0)
452 ; VI-NEXT: buffer_wbinvl1_vol
453 ; VI-NEXT: s_setpc_b64 s[30:31]
455 ; GFX9-LABEL: global_atomic_xchg_f32_noret:
457 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off
459 ; GFX9-NEXT: s_waitcnt vmcnt(0)
460 ; GFX9-NEXT: buffer_wbinvl1_vol
461 ; GFX9-NEXT: s_setpc_b64 s[30:31]
462 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
; Void function: seq_cst atomicrmw xchg of a float at %out + 4 elements
; (16 bytes). The SI and GFX9 runs expect offset:16 folded into the swap; the
; VI run expects a 64-bit add of 16 to the address before flat_atomic_swap.
466 define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
467 ; SI-LABEL: global_atomic_xchg_f32_noret_offset:
469 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470 ; SI-NEXT: s_mov_b32 s6, 0
471 ; SI-NEXT: s_mov_b32 s7, 0xf000
472 ; SI-NEXT: s_mov_b32 s4, s6
473 ; SI-NEXT: s_mov_b32 s5, s6
474 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
475 ; SI-NEXT: s_waitcnt vmcnt(0)
476 ; SI-NEXT: buffer_wbinvl1
477 ; SI-NEXT: s_waitcnt expcnt(0)
478 ; SI-NEXT: s_setpc_b64 s[30:31]
480 ; VI-LABEL: global_atomic_xchg_f32_noret_offset:
482 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
484 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
485 ; VI-NEXT: flat_atomic_swap v[0:1], v2
486 ; VI-NEXT: s_waitcnt vmcnt(0)
487 ; VI-NEXT: buffer_wbinvl1_vol
488 ; VI-NEXT: s_setpc_b64 s[30:31]
490 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset:
492 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
494 ; GFX9-NEXT: s_waitcnt vmcnt(0)
495 ; GFX9-NEXT: buffer_wbinvl1_vol
496 ; GFX9-NEXT: s_setpc_b64 s[30:31]
497 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
498 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
; float-returning function: seq_cst atomicrmw xchg of a float through a
; global pointer with no offset. All three runs expect the glc form of the
; swap; the SI run receives the result in v2 and copies it to v0, while the
; VI and GFX9 runs write the result directly to v0.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
502 define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
503 ; SI-LABEL: global_atomic_xchg_f32_ret:
505 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506 ; SI-NEXT: s_mov_b32 s6, 0
507 ; SI-NEXT: s_mov_b32 s7, 0xf000
508 ; SI-NEXT: s_mov_b32 s4, s6
509 ; SI-NEXT: s_mov_b32 s5, s6
510 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
511 ; SI-NEXT: s_waitcnt vmcnt(0)
512 ; SI-NEXT: buffer_wbinvl1
513 ; SI-NEXT: v_mov_b32_e32 v0, v2
514 ; SI-NEXT: s_waitcnt expcnt(0)
515 ; SI-NEXT: s_setpc_b64 s[30:31]
517 ; VI-LABEL: global_atomic_xchg_f32_ret:
519 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
521 ; VI-NEXT: s_waitcnt vmcnt(0)
522 ; VI-NEXT: buffer_wbinvl1_vol
523 ; VI-NEXT: s_setpc_b64 s[30:31]
525 ; GFX9-LABEL: global_atomic_xchg_f32_ret:
527 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
529 ; GFX9-NEXT: s_waitcnt vmcnt(0)
530 ; GFX9-NEXT: buffer_wbinvl1_vol
531 ; GFX9-NEXT: s_setpc_b64 s[30:31]
532 %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
; float-returning function: seq_cst atomicrmw xchg of a float at %out + 4
; elements (16 bytes). The SI and GFX9 runs expect offset:16 folded into the
; glc swap; the VI run expects a 64-bit add of 16 to the address before
; flat_atomic_swap ... glc.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
536 define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
537 ; SI-LABEL: global_atomic_xchg_f32_ret_offset:
539 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540 ; SI-NEXT: s_mov_b32 s6, 0
541 ; SI-NEXT: s_mov_b32 s7, 0xf000
542 ; SI-NEXT: s_mov_b32 s4, s6
543 ; SI-NEXT: s_mov_b32 s5, s6
544 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
545 ; SI-NEXT: s_waitcnt vmcnt(0)
546 ; SI-NEXT: buffer_wbinvl1
547 ; SI-NEXT: v_mov_b32_e32 v0, v2
548 ; SI-NEXT: s_waitcnt expcnt(0)
549 ; SI-NEXT: s_setpc_b64 s[30:31]
551 ; VI-LABEL: global_atomic_xchg_f32_ret_offset:
553 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
555 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
556 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
557 ; VI-NEXT: s_waitcnt vmcnt(0)
558 ; VI-NEXT: buffer_wbinvl1_vol
559 ; VI-NEXT: s_setpc_b64 s[30:31]
561 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset:
563 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
564 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
565 ; GFX9-NEXT: s_waitcnt vmcnt(0)
566 ; GFX9-NEXT: buffer_wbinvl1_vol
567 ; GFX9-NEXT: s_setpc_b64 s[30:31]
568 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
569 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
; amdgpu_gfx void function with inreg arguments: seq_cst atomicrmw xchg of a
; float through a global pointer with no offset. The expected code reads the
; pointer from s[4:5] and the value from s6.
; - The SI run saves s6/s7 into lanes of v1 (v1 itself is spilled to and
;   reloaded from the stack), overwrites s6/s7 to complete the s[4:7] operand
;   of buffer_atomic_swap, then restores s6/s7 with v_readlane.
; - The VI run copies s4, s5 and s6 into VGPRs for flat_atomic_swap.
; - The GFX9 run uses s[4:5] as the scalar base with a zero VGPR offset.
573 define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
574 ; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
576 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
578 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
579 ; SI-NEXT: s_mov_b64 exec, s[34:35]
580 ; SI-NEXT: s_waitcnt expcnt(0)
581 ; SI-NEXT: v_writelane_b32 v1, s6, 0
582 ; SI-NEXT: v_writelane_b32 v1, s7, 1
583 ; SI-NEXT: s_mov_b32 s34, s6
584 ; SI-NEXT: s_mov_b32 s7, 0xf000
585 ; SI-NEXT: s_mov_b32 s6, -1
586 ; SI-NEXT: v_mov_b32_e32 v0, s34
587 ; SI-NEXT: s_waitcnt vmcnt(0)
588 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
589 ; SI-NEXT: s_waitcnt vmcnt(0)
590 ; SI-NEXT: buffer_wbinvl1
591 ; SI-NEXT: v_readlane_b32 s7, v1, 1
592 ; SI-NEXT: v_readlane_b32 s6, v1, 0
593 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
594 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
595 ; SI-NEXT: s_mov_b64 exec, s[34:35]
596 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
597 ; SI-NEXT: s_setpc_b64 s[30:31]
599 ; VI-LABEL: global_atomic_xchg_f32_noret_scalar:
601 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602 ; VI-NEXT: v_mov_b32_e32 v0, s4
603 ; VI-NEXT: v_mov_b32_e32 v1, s5
604 ; VI-NEXT: v_mov_b32_e32 v2, s6
605 ; VI-NEXT: flat_atomic_swap v[0:1], v2
606 ; VI-NEXT: s_waitcnt vmcnt(0)
607 ; VI-NEXT: buffer_wbinvl1_vol
608 ; VI-NEXT: s_setpc_b64 s[30:31]
610 ; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar:
612 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
614 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
615 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5]
616 ; GFX9-NEXT: s_waitcnt vmcnt(0)
617 ; GFX9-NEXT: buffer_wbinvl1_vol
618 ; GFX9-NEXT: s_setpc_b64 s[30:31]
619 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
; amdgpu_gfx void function with inreg arguments: seq_cst atomicrmw xchg of a
; float at %out + 4 elements (16 bytes). The expected code reads the pointer
; from s[4:5] and the value from s6.
; - The SI and GFX9 runs fold the displacement into the atomic as offset:16.
; - The VI run computes the address with s_add_u32 / s_addc_u32 into s[34:35]
;   and copies it to VGPRs for flat_atomic_swap.
; - The SI run additionally saves and restores s6/s7 through lanes of v1
;   around the s[4:7] operand setup.
623 define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
624 ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
626 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
627 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
628 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
629 ; SI-NEXT: s_mov_b64 exec, s[34:35]
630 ; SI-NEXT: s_waitcnt expcnt(0)
631 ; SI-NEXT: v_writelane_b32 v1, s6, 0
632 ; SI-NEXT: v_writelane_b32 v1, s7, 1
633 ; SI-NEXT: s_mov_b32 s34, s6
634 ; SI-NEXT: s_mov_b32 s7, 0xf000
635 ; SI-NEXT: s_mov_b32 s6, -1
636 ; SI-NEXT: v_mov_b32_e32 v0, s34
637 ; SI-NEXT: s_waitcnt vmcnt(0)
638 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
639 ; SI-NEXT: s_waitcnt vmcnt(0)
640 ; SI-NEXT: buffer_wbinvl1
641 ; SI-NEXT: v_readlane_b32 s7, v1, 1
642 ; SI-NEXT: v_readlane_b32 s6, v1, 0
643 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
644 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
645 ; SI-NEXT: s_mov_b64 exec, s[34:35]
646 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
647 ; SI-NEXT: s_setpc_b64 s[30:31]
649 ; VI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
651 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652 ; VI-NEXT: s_add_u32 s34, s4, 16
653 ; VI-NEXT: s_addc_u32 s35, s5, 0
654 ; VI-NEXT: v_mov_b32_e32 v0, s34
655 ; VI-NEXT: v_mov_b32_e32 v1, s35
656 ; VI-NEXT: v_mov_b32_e32 v2, s6
657 ; VI-NEXT: flat_atomic_swap v[0:1], v2
658 ; VI-NEXT: s_waitcnt vmcnt(0)
659 ; VI-NEXT: buffer_wbinvl1_vol
660 ; VI-NEXT: s_setpc_b64 s[30:31]
662 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
664 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
666 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
667 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16
668 ; GFX9-NEXT: s_waitcnt vmcnt(0)
669 ; GFX9-NEXT: buffer_wbinvl1_vol
670 ; GFX9-NEXT: s_setpc_b64 s[30:31]
671 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
672 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
; amdgpu_gfx float-returning function with inreg arguments: seq_cst atomicrmw
; xchg of a float through a global pointer with no offset. The expected code
; reads the pointer from s[4:5] and the value from s6, and every run uses the
; glc form of the swap with v0 as the data/result register.
; The SI run also saves and restores s6/s7 through lanes of v1 around the
; s[4:7] operand setup.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
676 define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
677 ; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
679 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
681 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
682 ; SI-NEXT: s_mov_b64 exec, s[34:35]
683 ; SI-NEXT: s_waitcnt expcnt(0)
684 ; SI-NEXT: v_writelane_b32 v1, s6, 0
685 ; SI-NEXT: v_writelane_b32 v1, s7, 1
686 ; SI-NEXT: s_mov_b32 s34, s6
687 ; SI-NEXT: s_mov_b32 s7, 0xf000
688 ; SI-NEXT: s_mov_b32 s6, -1
689 ; SI-NEXT: v_mov_b32_e32 v0, s34
690 ; SI-NEXT: s_waitcnt vmcnt(0)
691 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
692 ; SI-NEXT: s_waitcnt vmcnt(0)
693 ; SI-NEXT: buffer_wbinvl1
694 ; SI-NEXT: v_readlane_b32 s7, v1, 1
695 ; SI-NEXT: v_readlane_b32 s6, v1, 0
696 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
697 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
698 ; SI-NEXT: s_mov_b64 exec, s[34:35]
699 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
700 ; SI-NEXT: s_setpc_b64 s[30:31]
702 ; VI-LABEL: global_atomic_xchg_f32_ret_scalar:
704 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705 ; VI-NEXT: v_mov_b32_e32 v0, s4
706 ; VI-NEXT: v_mov_b32_e32 v1, s5
707 ; VI-NEXT: v_mov_b32_e32 v2, s6
708 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
709 ; VI-NEXT: s_waitcnt vmcnt(0)
710 ; VI-NEXT: buffer_wbinvl1_vol
711 ; VI-NEXT: s_setpc_b64 s[30:31]
713 ; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar:
715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
717 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
718 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc
719 ; GFX9-NEXT: s_waitcnt vmcnt(0)
720 ; GFX9-NEXT: buffer_wbinvl1_vol
721 ; GFX9-NEXT: s_setpc_b64 s[30:31]
722 %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
; amdgpu_gfx float-returning function with inreg arguments: seq_cst atomicrmw
; xchg of a float at %out + 4 elements (16 bytes). The expected code reads the
; pointer from s[4:5] and the value from s6, and uses the glc swap with v0 as
; the result register.
; - The SI and GFX9 runs fold the displacement into the atomic as offset:16.
; - The VI run computes the address with s_add_u32 / s_addc_u32 first.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
726 define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
727 ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
729 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
731 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
732 ; SI-NEXT: s_mov_b64 exec, s[34:35]
733 ; SI-NEXT: s_waitcnt expcnt(0)
734 ; SI-NEXT: v_writelane_b32 v1, s6, 0
735 ; SI-NEXT: v_writelane_b32 v1, s7, 1
736 ; SI-NEXT: s_mov_b32 s34, s6
737 ; SI-NEXT: s_mov_b32 s7, 0xf000
738 ; SI-NEXT: s_mov_b32 s6, -1
739 ; SI-NEXT: v_mov_b32_e32 v0, s34
740 ; SI-NEXT: s_waitcnt vmcnt(0)
741 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
742 ; SI-NEXT: s_waitcnt vmcnt(0)
743 ; SI-NEXT: buffer_wbinvl1
744 ; SI-NEXT: v_readlane_b32 s7, v1, 1
745 ; SI-NEXT: v_readlane_b32 s6, v1, 0
746 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
747 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
748 ; SI-NEXT: s_mov_b64 exec, s[34:35]
749 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
750 ; SI-NEXT: s_setpc_b64 s[30:31]
752 ; VI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
754 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755 ; VI-NEXT: s_add_u32 s34, s4, 16
756 ; VI-NEXT: s_addc_u32 s35, s5, 0
757 ; VI-NEXT: v_mov_b32_e32 v0, s34
758 ; VI-NEXT: v_mov_b32_e32 v1, s35
759 ; VI-NEXT: v_mov_b32_e32 v2, s6
760 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
761 ; VI-NEXT: s_waitcnt vmcnt(0)
762 ; VI-NEXT: buffer_wbinvl1_vol
763 ; VI-NEXT: s_setpc_b64 s[30:31]
765 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
767 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
768 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
769 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
770 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
771 ; GFX9-NEXT: s_waitcnt vmcnt(0)
772 ; GFX9-NEXT: buffer_wbinvl1_vol
773 ; GFX9-NEXT: s_setpc_b64 s[30:31]
774 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
775 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
; Void function: seq_cst atomicrmw xchg of a float at %out + 4 elements
; (16 bytes), with !amdgpu.no.remote.memory metadata attached to the atomic.
; The expected code is still a single hardware swap without glc: offset:16 is
; folded in the SI and GFX9 runs, and the VI run adds 16 to the address first.
; The GEP indexes by float so the addressed element type matches the
; exchanged type; float is 4 bytes, so the byte offset is 16.
; The metadata node !0 is defined outside this view.
779 define void @global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
780 ; SI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
782 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783 ; SI-NEXT: s_mov_b32 s6, 0
784 ; SI-NEXT: s_mov_b32 s7, 0xf000
785 ; SI-NEXT: s_mov_b32 s4, s6
786 ; SI-NEXT: s_mov_b32 s5, s6
787 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
788 ; SI-NEXT: s_waitcnt vmcnt(0)
789 ; SI-NEXT: buffer_wbinvl1
790 ; SI-NEXT: s_waitcnt expcnt(0)
791 ; SI-NEXT: s_setpc_b64 s[30:31]
793 ; VI-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
795 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
797 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
798 ; VI-NEXT: flat_atomic_swap v[0:1], v2
799 ; VI-NEXT: s_waitcnt vmcnt(0)
800 ; VI-NEXT: buffer_wbinvl1_vol
801 ; VI-NEXT: s_setpc_b64 s[30:31]
803 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
805 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
806 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
807 ; GFX9-NEXT: s_waitcnt vmcnt(0)
808 ; GFX9-NEXT: buffer_wbinvl1_vol
809 ; GFX9-NEXT: s_setpc_b64 s[30:31]
810 %gep = getelementptr float, ptr addrspace(1) %out, i64 4
811 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
; float-returning function: seq_cst atomicrmw xchg of a float at %out + 4
; elements (16 bytes), with !amdgpu.no.remote.memory metadata attached to the
; atomic. The expected code is a single glc swap: offset:16 is folded in the
; SI and GFX9 runs, and the VI run adds 16 to the address first.
; The GEP indexes by float so the addressed element type matches the
; exchanged type; float is 4 bytes, so the byte offset is 16.
; The metadata node !0 is defined outside this view.
; NOTE(review): the ret instruction is not visible in this view; presumably
; %result is the returned value.
815 define float @global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, float %in) {
816 ; SI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
818 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
819 ; SI-NEXT: s_mov_b32 s6, 0
820 ; SI-NEXT: s_mov_b32 s7, 0xf000
821 ; SI-NEXT: s_mov_b32 s4, s6
822 ; SI-NEXT: s_mov_b32 s5, s6
823 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
824 ; SI-NEXT: s_waitcnt vmcnt(0)
825 ; SI-NEXT: buffer_wbinvl1
826 ; SI-NEXT: v_mov_b32_e32 v0, v2
827 ; SI-NEXT: s_waitcnt expcnt(0)
828 ; SI-NEXT: s_setpc_b64 s[30:31]
830 ; VI-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
832 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
833 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
834 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
835 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
836 ; VI-NEXT: s_waitcnt vmcnt(0)
837 ; VI-NEXT: buffer_wbinvl1_vol
838 ; VI-NEXT: s_setpc_b64 s[30:31]
840 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
842 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
844 ; GFX9-NEXT: s_waitcnt vmcnt(0)
845 ; GFX9-NEXT: buffer_wbinvl1_vol
846 ; GFX9-NEXT: s_setpc_b64 s[30:31]
847 %gep = getelementptr float, ptr addrspace(1) %out, i64 4
848 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
852 ; ---------------------------------------------------------------------
854 ; ---------------------------------------------------------------------
; Void-returning seq_cst atomicrmw add on a global (addrspace(1)) pointer.
; In the checks the address is v[0:1] and the operand is v2, and the expected
; atomic carries no glc modifier. Expected opcode per run line:
; buffer_atomic_add (addr64) on SI, flat_atomic_add on VI, global_atomic_add
; on GFX9.
856 define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
857 ; SI-LABEL: global_atomic_add_i32_noret:
859 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; SI-NEXT: s_mov_b32 s6, 0
861 ; SI-NEXT: s_mov_b32 s7, 0xf000
862 ; SI-NEXT: s_mov_b32 s4, s6
863 ; SI-NEXT: s_mov_b32 s5, s6
864 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
865 ; SI-NEXT: s_waitcnt vmcnt(0)
866 ; SI-NEXT: buffer_wbinvl1
867 ; SI-NEXT: s_waitcnt expcnt(0)
868 ; SI-NEXT: s_setpc_b64 s[30:31]
870 ; VI-LABEL: global_atomic_add_i32_noret:
872 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873 ; VI-NEXT: flat_atomic_add v[0:1], v2
874 ; VI-NEXT: s_waitcnt vmcnt(0)
875 ; VI-NEXT: buffer_wbinvl1_vol
876 ; VI-NEXT: s_setpc_b64 s[30:31]
878 ; GFX9-LABEL: global_atomic_add_i32_noret:
880 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881 ; GFX9-NEXT: global_atomic_add v[0:1], v2, off
882 ; GFX9-NEXT: s_waitcnt vmcnt(0)
883 ; GFX9-NEXT: buffer_wbinvl1_vol
884 ; GFX9-NEXT: s_setpc_b64 s[30:31]
885 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
; Void-returning seq_cst add at %out advanced by 4 i32 elements. The checks
; expect the resulting offset of 16 to be folded into the instruction on SI
; and GFX9 (offset:16), while VI adds 16 to the 64-bit address with
; v_add_u32/v_addc_u32 before the flat atomic. No glc modifier is expected.
889 define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
890 ; SI-LABEL: global_atomic_add_i32_noret_offset:
892 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893 ; SI-NEXT: s_mov_b32 s6, 0
894 ; SI-NEXT: s_mov_b32 s7, 0xf000
895 ; SI-NEXT: s_mov_b32 s4, s6
896 ; SI-NEXT: s_mov_b32 s5, s6
897 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
898 ; SI-NEXT: s_waitcnt vmcnt(0)
899 ; SI-NEXT: buffer_wbinvl1
900 ; SI-NEXT: s_waitcnt expcnt(0)
901 ; SI-NEXT: s_setpc_b64 s[30:31]
903 ; VI-LABEL: global_atomic_add_i32_noret_offset:
905 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
907 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
908 ; VI-NEXT: flat_atomic_add v[0:1], v2
909 ; VI-NEXT: s_waitcnt vmcnt(0)
910 ; VI-NEXT: buffer_wbinvl1_vol
911 ; VI-NEXT: s_setpc_b64 s[30:31]
913 ; GFX9-LABEL: global_atomic_add_i32_noret_offset:
915 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
916 ; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16
917 ; GFX9-NEXT: s_waitcnt vmcnt(0)
918 ; GFX9-NEXT: buffer_wbinvl1_vol
919 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
920 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
921 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
; i32-returning seq_cst atomicrmw add on a global (addrspace(1)) pointer.
; The expected atomic carries glc and writes a result register: v2 on SI
; (then copied to v0 with v_mov_b32), v0 directly on VI and GFX9.
925 define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
926 ; SI-LABEL: global_atomic_add_i32_ret:
928 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
929 ; SI-NEXT: s_mov_b32 s6, 0
930 ; SI-NEXT: s_mov_b32 s7, 0xf000
931 ; SI-NEXT: s_mov_b32 s4, s6
932 ; SI-NEXT: s_mov_b32 s5, s6
933 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
934 ; SI-NEXT: s_waitcnt vmcnt(0)
935 ; SI-NEXT: buffer_wbinvl1
936 ; SI-NEXT: v_mov_b32_e32 v0, v2
937 ; SI-NEXT: s_waitcnt expcnt(0)
938 ; SI-NEXT: s_setpc_b64 s[30:31]
940 ; VI-LABEL: global_atomic_add_i32_ret:
942 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
944 ; VI-NEXT: s_waitcnt vmcnt(0)
945 ; VI-NEXT: buffer_wbinvl1_vol
946 ; VI-NEXT: s_setpc_b64 s[30:31]
948 ; GFX9-LABEL: global_atomic_add_i32_ret:
950 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
952 ; GFX9-NEXT: s_waitcnt vmcnt(0)
953 ; GFX9-NEXT: buffer_wbinvl1_vol
954 ; GFX9-NEXT: s_setpc_b64 s[30:31]
955 %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
; i32-returning seq_cst add at %out advanced by 4 i32 elements. The checks
; expect glc plus an offset of 16: offset:16 on the SI and GFX9 instruction,
; an explicit v_add_u32/v_addc_u32 of 16 on VI. The SI result register v2 is
; copied to v0; VI and GFX9 write v0 directly.
959 define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
960 ; SI-LABEL: global_atomic_add_i32_ret_offset:
962 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
963 ; SI-NEXT: s_mov_b32 s6, 0
964 ; SI-NEXT: s_mov_b32 s7, 0xf000
965 ; SI-NEXT: s_mov_b32 s4, s6
966 ; SI-NEXT: s_mov_b32 s5, s6
967 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
968 ; SI-NEXT: s_waitcnt vmcnt(0)
969 ; SI-NEXT: buffer_wbinvl1
970 ; SI-NEXT: v_mov_b32_e32 v0, v2
971 ; SI-NEXT: s_waitcnt expcnt(0)
972 ; SI-NEXT: s_setpc_b64 s[30:31]
974 ; VI-LABEL: global_atomic_add_i32_ret_offset:
976 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
977 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
978 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
979 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
980 ; VI-NEXT: s_waitcnt vmcnt(0)
981 ; VI-NEXT: buffer_wbinvl1_vol
982 ; VI-NEXT: s_setpc_b64 s[30:31]
984 ; GFX9-LABEL: global_atomic_add_i32_ret_offset:
986 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
987 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc
988 ; GFX9-NEXT: s_waitcnt vmcnt(0)
989 ; GFX9-NEXT: buffer_wbinvl1_vol
990 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
991 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
992 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx variant of the void-returning seq_cst add with both arguments
; marked inreg. In the checks the pointer is used from s[4:5] and the value
; from s6.
; On SI the value is copied to s34 so that s6/s7 can be overwritten to
; complete the s[4:7] operand of the buffer atomic; the incoming s6/s7 are
; parked in lanes 0/1 of v1 (v_writelane/v_readlane), and v1 itself is
; spilled and reloaded around that.
; On VI the pointer and value are copied to VGPRs for flat_atomic_add.
; On GFX9 global_atomic_add uses s[4:5] with a zero VGPR offset (v0 = 0).
996 define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
997 ; SI-LABEL: global_atomic_add_i32_noret_scalar:
999 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1001 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1002 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1003 ; SI-NEXT: s_waitcnt expcnt(0)
1004 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1005 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1006 ; SI-NEXT: s_mov_b32 s34, s6
1007 ; SI-NEXT: s_mov_b32 s7, 0xf000
1008 ; SI-NEXT: s_mov_b32 s6, -1
1009 ; SI-NEXT: v_mov_b32_e32 v0, s34
1010 ; SI-NEXT: s_waitcnt vmcnt(0)
1011 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0
1012 ; SI-NEXT: s_waitcnt vmcnt(0)
1013 ; SI-NEXT: buffer_wbinvl1
1014 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1015 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1016 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1017 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1018 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1019 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1020 ; SI-NEXT: s_setpc_b64 s[30:31]
1022 ; VI-LABEL: global_atomic_add_i32_noret_scalar:
1024 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1025 ; VI-NEXT: v_mov_b32_e32 v0, s4
1026 ; VI-NEXT: v_mov_b32_e32 v1, s5
1027 ; VI-NEXT: v_mov_b32_e32 v2, s6
1028 ; VI-NEXT: flat_atomic_add v[0:1], v2
1029 ; VI-NEXT: s_waitcnt vmcnt(0)
1030 ; VI-NEXT: buffer_wbinvl1_vol
1031 ; VI-NEXT: s_setpc_b64 s[30:31]
1033 ; GFX9-LABEL: global_atomic_add_i32_noret_scalar:
1035 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1037 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1038 ; GFX9-NEXT: global_atomic_add v0, v1, s[4:5]
1039 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1040 ; GFX9-NEXT: buffer_wbinvl1_vol
1041 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1042 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the void-returning seq_cst add at
; %out advanced by 4 i32 elements. The checks expect offset:16 on the SI and
; GFX9 atomics, while VI computes s[34:35] = s[4:5] + 16 with
; s_add_u32/s_addc_u32 before moving the address to VGPRs.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1046 define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1047 ; SI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1049 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1050 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1051 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1052 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1053 ; SI-NEXT: s_waitcnt expcnt(0)
1054 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1055 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1056 ; SI-NEXT: s_mov_b32 s34, s6
1057 ; SI-NEXT: s_mov_b32 s7, 0xf000
1058 ; SI-NEXT: s_mov_b32 s6, -1
1059 ; SI-NEXT: v_mov_b32_e32 v0, s34
1060 ; SI-NEXT: s_waitcnt vmcnt(0)
1061 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16
1062 ; SI-NEXT: s_waitcnt vmcnt(0)
1063 ; SI-NEXT: buffer_wbinvl1
1064 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1065 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1066 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1067 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1068 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1069 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1070 ; SI-NEXT: s_setpc_b64 s[30:31]
1072 ; VI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1074 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1075 ; VI-NEXT: s_add_u32 s34, s4, 16
1076 ; VI-NEXT: s_addc_u32 s35, s5, 0
1077 ; VI-NEXT: v_mov_b32_e32 v0, s34
1078 ; VI-NEXT: v_mov_b32_e32 v1, s35
1079 ; VI-NEXT: v_mov_b32_e32 v2, s6
1080 ; VI-NEXT: flat_atomic_add v[0:1], v2
1081 ; VI-NEXT: s_waitcnt vmcnt(0)
1082 ; VI-NEXT: buffer_wbinvl1_vol
1083 ; VI-NEXT: s_setpc_b64 s[30:31]
1085 ; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar:
1087 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1088 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1089 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1090 ; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] offset:16
1091 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1092 ; GFX9-NEXT: buffer_wbinvl1_vol
1093 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1094 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1095 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the i32-returning seq_cst add. The
; checks use the pointer from s[4:5] and the value from s6, expect glc on the
; atomic, and have the result written to v0 on all three targets.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1099 define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1100 ; SI-LABEL: global_atomic_add_i32_ret_scalar:
1102 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1103 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1104 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1105 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1106 ; SI-NEXT: s_waitcnt expcnt(0)
1107 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1108 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1109 ; SI-NEXT: s_mov_b32 s34, s6
1110 ; SI-NEXT: s_mov_b32 s7, 0xf000
1111 ; SI-NEXT: s_mov_b32 s6, -1
1112 ; SI-NEXT: v_mov_b32_e32 v0, s34
1113 ; SI-NEXT: s_waitcnt vmcnt(0)
1114 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
1115 ; SI-NEXT: s_waitcnt vmcnt(0)
1116 ; SI-NEXT: buffer_wbinvl1
1117 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1118 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1119 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1120 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1121 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1122 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1123 ; SI-NEXT: s_setpc_b64 s[30:31]
1125 ; VI-LABEL: global_atomic_add_i32_ret_scalar:
1127 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128 ; VI-NEXT: v_mov_b32_e32 v0, s4
1129 ; VI-NEXT: v_mov_b32_e32 v1, s5
1130 ; VI-NEXT: v_mov_b32_e32 v2, s6
1131 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1132 ; VI-NEXT: s_waitcnt vmcnt(0)
1133 ; VI-NEXT: buffer_wbinvl1_vol
1134 ; VI-NEXT: s_setpc_b64 s[30:31]
1136 ; GFX9-LABEL: global_atomic_add_i32_ret_scalar:
1138 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1140 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1141 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] glc
1142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1143 ; GFX9-NEXT: buffer_wbinvl1_vol
1144 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1145 %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the i32-returning seq_cst add at %out
; advanced by 4 i32 elements. The checks expect glc and a result in v0 on all
; targets, with the offset of 16 as offset:16 on SI and GFX9 and as an
; s_add_u32/s_addc_u32 into s[34:35] on VI.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1149 define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1150 ; SI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1152 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1154 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1155 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1156 ; SI-NEXT: s_waitcnt expcnt(0)
1157 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1158 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1159 ; SI-NEXT: s_mov_b32 s34, s6
1160 ; SI-NEXT: s_mov_b32 s7, 0xf000
1161 ; SI-NEXT: s_mov_b32 s6, -1
1162 ; SI-NEXT: v_mov_b32_e32 v0, s34
1163 ; SI-NEXT: s_waitcnt vmcnt(0)
1164 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc
1165 ; SI-NEXT: s_waitcnt vmcnt(0)
1166 ; SI-NEXT: buffer_wbinvl1
1167 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1168 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1169 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1170 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1171 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1172 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1173 ; SI-NEXT: s_setpc_b64 s[30:31]
1175 ; VI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1177 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1178 ; VI-NEXT: s_add_u32 s34, s4, 16
1179 ; VI-NEXT: s_addc_u32 s35, s5, 0
1180 ; VI-NEXT: v_mov_b32_e32 v0, s34
1181 ; VI-NEXT: v_mov_b32_e32 v1, s35
1182 ; VI-NEXT: v_mov_b32_e32 v2, s6
1183 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1184 ; VI-NEXT: s_waitcnt vmcnt(0)
1185 ; VI-NEXT: buffer_wbinvl1_vol
1186 ; VI-NEXT: s_setpc_b64 s[30:31]
1188 ; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar:
1190 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1191 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1192 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1193 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] offset:16 glc
1194 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1195 ; GFX9-NEXT: buffer_wbinvl1_vol
1196 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1197 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1198 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
; Void-returning seq_cst add at %out advanced by 4 i32 elements (i64 index),
; with !amdgpu.no.remote.memory attached to the atomicrmw. The checks expect
; a native add atomic on every target: buffer_atomic_add offset:16 on SI,
; flat_atomic_add after a v_add_u32/v_addc_u32 of 16 on VI, and
; global_atomic_add offset:16 on GFX9, none with glc.
1202 define void @global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1203 ; SI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1205 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1206 ; SI-NEXT: s_mov_b32 s6, 0
1207 ; SI-NEXT: s_mov_b32 s7, 0xf000
1208 ; SI-NEXT: s_mov_b32 s4, s6
1209 ; SI-NEXT: s_mov_b32 s5, s6
1210 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
1211 ; SI-NEXT: s_waitcnt vmcnt(0)
1212 ; SI-NEXT: buffer_wbinvl1
1213 ; SI-NEXT: s_waitcnt expcnt(0)
1214 ; SI-NEXT: s_setpc_b64 s[30:31]
1216 ; VI-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1218 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1220 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1221 ; VI-NEXT: flat_atomic_add v[0:1], v2
1222 ; VI-NEXT: s_waitcnt vmcnt(0)
1223 ; VI-NEXT: buffer_wbinvl1_vol
1224 ; VI-NEXT: s_setpc_b64 s[30:31]
1226 ; GFX9-LABEL: global_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1228 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229 ; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16
1230 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1231 ; GFX9-NEXT: buffer_wbinvl1_vol
1232 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1233 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1234 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; i32-returning seq_cst add at %out advanced by 4 i32 elements (i64 index),
; with !amdgpu.no.remote.memory attached to the atomicrmw. The checks expect
; a native add atomic with glc on every target; the SI result register v2 is
; copied to v0, while VI and GFX9 write v0 directly.
1238 define i32 @global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1239 ; SI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1241 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1242 ; SI-NEXT: s_mov_b32 s6, 0
1243 ; SI-NEXT: s_mov_b32 s7, 0xf000
1244 ; SI-NEXT: s_mov_b32 s4, s6
1245 ; SI-NEXT: s_mov_b32 s5, s6
1246 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1247 ; SI-NEXT: s_waitcnt vmcnt(0)
1248 ; SI-NEXT: buffer_wbinvl1
1249 ; SI-NEXT: v_mov_b32_e32 v0, v2
1250 ; SI-NEXT: s_waitcnt expcnt(0)
1251 ; SI-NEXT: s_setpc_b64 s[30:31]
1253 ; VI-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1255 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1257 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1258 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1259 ; VI-NEXT: s_waitcnt vmcnt(0)
1260 ; VI-NEXT: buffer_wbinvl1_vol
1261 ; VI-NEXT: s_setpc_b64 s[30:31]
1263 ; GFX9-LABEL: global_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1265 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc
1267 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1268 ; GFX9-NEXT: buffer_wbinvl1_vol
1269 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1270 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1271 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1275 ; ---------------------------------------------------------------------
1277 ; ---------------------------------------------------------------------
; Void-returning seq_cst atomicrmw sub on a global (addrspace(1)) pointer.
; In the checks the address is v[0:1] and the operand is v2, and the expected
; atomic carries no glc modifier. Expected opcode per run line:
; buffer_atomic_sub (addr64) on SI, flat_atomic_sub on VI, global_atomic_sub
; on GFX9.
1279 define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1280 ; SI-LABEL: global_atomic_sub_i32_noret:
1282 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283 ; SI-NEXT: s_mov_b32 s6, 0
1284 ; SI-NEXT: s_mov_b32 s7, 0xf000
1285 ; SI-NEXT: s_mov_b32 s4, s6
1286 ; SI-NEXT: s_mov_b32 s5, s6
1287 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
1288 ; SI-NEXT: s_waitcnt vmcnt(0)
1289 ; SI-NEXT: buffer_wbinvl1
1290 ; SI-NEXT: s_waitcnt expcnt(0)
1291 ; SI-NEXT: s_setpc_b64 s[30:31]
1293 ; VI-LABEL: global_atomic_sub_i32_noret:
1295 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1297 ; VI-NEXT: s_waitcnt vmcnt(0)
1298 ; VI-NEXT: buffer_wbinvl1_vol
1299 ; VI-NEXT: s_setpc_b64 s[30:31]
1301 ; GFX9-LABEL: global_atomic_sub_i32_noret:
1303 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1304 ; GFX9-NEXT: global_atomic_sub v[0:1], v2, off
1305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX9-NEXT: buffer_wbinvl1_vol
1307 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1308 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
; Void-returning seq_cst sub at %out advanced by 4 i32 elements. The checks
; expect the resulting offset of 16 to be folded into the instruction on SI
; and GFX9 (offset:16), while VI adds 16 to the 64-bit address with
; v_add_u32/v_addc_u32 before the flat atomic. No glc modifier is expected.
1312 define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1313 ; SI-LABEL: global_atomic_sub_i32_noret_offset:
1315 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1316 ; SI-NEXT: s_mov_b32 s6, 0
1317 ; SI-NEXT: s_mov_b32 s7, 0xf000
1318 ; SI-NEXT: s_mov_b32 s4, s6
1319 ; SI-NEXT: s_mov_b32 s5, s6
1320 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
1321 ; SI-NEXT: s_waitcnt vmcnt(0)
1322 ; SI-NEXT: buffer_wbinvl1
1323 ; SI-NEXT: s_waitcnt expcnt(0)
1324 ; SI-NEXT: s_setpc_b64 s[30:31]
1326 ; VI-LABEL: global_atomic_sub_i32_noret_offset:
1328 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1329 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1330 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1331 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1332 ; VI-NEXT: s_waitcnt vmcnt(0)
1333 ; VI-NEXT: buffer_wbinvl1_vol
1334 ; VI-NEXT: s_setpc_b64 s[30:31]
1336 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
1338 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339 ; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16
1340 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1341 ; GFX9-NEXT: buffer_wbinvl1_vol
1342 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1343 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1344 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
; i32-returning seq_cst atomicrmw sub on a global (addrspace(1)) pointer.
; The expected atomic carries glc and writes a result register: v2 on SI
; (then copied to v0 with v_mov_b32), v0 directly on VI and GFX9.
1348 define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1349 ; SI-LABEL: global_atomic_sub_i32_ret:
1351 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1352 ; SI-NEXT: s_mov_b32 s6, 0
1353 ; SI-NEXT: s_mov_b32 s7, 0xf000
1354 ; SI-NEXT: s_mov_b32 s4, s6
1355 ; SI-NEXT: s_mov_b32 s5, s6
1356 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
1357 ; SI-NEXT: s_waitcnt vmcnt(0)
1358 ; SI-NEXT: buffer_wbinvl1
1359 ; SI-NEXT: v_mov_b32_e32 v0, v2
1360 ; SI-NEXT: s_waitcnt expcnt(0)
1361 ; SI-NEXT: s_setpc_b64 s[30:31]
1363 ; VI-LABEL: global_atomic_sub_i32_ret:
1365 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1366 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1367 ; VI-NEXT: s_waitcnt vmcnt(0)
1368 ; VI-NEXT: buffer_wbinvl1_vol
1369 ; VI-NEXT: s_setpc_b64 s[30:31]
1371 ; GFX9-LABEL: global_atomic_sub_i32_ret:
1373 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1374 ; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc
1375 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1376 ; GFX9-NEXT: buffer_wbinvl1_vol
1377 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1378 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
; i32-returning seq_cst sub at %out advanced by 4 i32 elements. The checks
; expect glc plus an offset of 16: offset:16 on the SI and GFX9 instruction,
; an explicit v_add_u32/v_addc_u32 of 16 on VI. The SI result register v2 is
; copied to v0; VI and GFX9 write v0 directly.
1382 define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1383 ; SI-LABEL: global_atomic_sub_i32_ret_offset:
1385 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1386 ; SI-NEXT: s_mov_b32 s6, 0
1387 ; SI-NEXT: s_mov_b32 s7, 0xf000
1388 ; SI-NEXT: s_mov_b32 s4, s6
1389 ; SI-NEXT: s_mov_b32 s5, s6
1390 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1391 ; SI-NEXT: s_waitcnt vmcnt(0)
1392 ; SI-NEXT: buffer_wbinvl1
1393 ; SI-NEXT: v_mov_b32_e32 v0, v2
1394 ; SI-NEXT: s_waitcnt expcnt(0)
1395 ; SI-NEXT: s_setpc_b64 s[30:31]
1397 ; VI-LABEL: global_atomic_sub_i32_ret_offset:
1399 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1400 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1401 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1402 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1403 ; VI-NEXT: s_waitcnt vmcnt(0)
1404 ; VI-NEXT: buffer_wbinvl1_vol
1405 ; VI-NEXT: s_setpc_b64 s[30:31]
1407 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset:
1409 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1410 ; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc
1411 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1412 ; GFX9-NEXT: buffer_wbinvl1_vol
1413 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1414 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1415 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx variant of the void-returning seq_cst sub with both arguments
; marked inreg. In the checks the pointer is used from s[4:5] and the value
; from s6.
; On SI the value is copied to s34 so that s6/s7 can be overwritten to
; complete the s[4:7] operand of the buffer atomic; the incoming s6/s7 are
; parked in lanes 0/1 of v1 (v_writelane/v_readlane), and v1 itself is
; spilled and reloaded around that.
; On VI the pointer and value are copied to VGPRs for flat_atomic_sub.
; On GFX9 global_atomic_sub uses s[4:5] with a zero VGPR offset (v0 = 0).
1419 define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1420 ; SI-LABEL: global_atomic_sub_i32_noret_scalar:
1422 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1423 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1424 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1425 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1426 ; SI-NEXT: s_waitcnt expcnt(0)
1427 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1428 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1429 ; SI-NEXT: s_mov_b32 s34, s6
1430 ; SI-NEXT: s_mov_b32 s7, 0xf000
1431 ; SI-NEXT: s_mov_b32 s6, -1
1432 ; SI-NEXT: v_mov_b32_e32 v0, s34
1433 ; SI-NEXT: s_waitcnt vmcnt(0)
1434 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0
1435 ; SI-NEXT: s_waitcnt vmcnt(0)
1436 ; SI-NEXT: buffer_wbinvl1
1437 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1438 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1439 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1440 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1441 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1442 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1443 ; SI-NEXT: s_setpc_b64 s[30:31]
1445 ; VI-LABEL: global_atomic_sub_i32_noret_scalar:
1447 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1448 ; VI-NEXT: v_mov_b32_e32 v0, s4
1449 ; VI-NEXT: v_mov_b32_e32 v1, s5
1450 ; VI-NEXT: v_mov_b32_e32 v2, s6
1451 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1452 ; VI-NEXT: s_waitcnt vmcnt(0)
1453 ; VI-NEXT: buffer_wbinvl1_vol
1454 ; VI-NEXT: s_setpc_b64 s[30:31]
1456 ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar:
1458 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1459 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1460 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1461 ; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5]
1462 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1463 ; GFX9-NEXT: buffer_wbinvl1_vol
1464 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1465 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the void-returning seq_cst sub at
; %out advanced by 4 i32 elements. The checks expect offset:16 on the SI and
; GFX9 atomics, while VI computes s[34:35] = s[4:5] + 16 with
; s_add_u32/s_addc_u32 before moving the address to VGPRs.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1469 define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1470 ; SI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1472 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1473 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1474 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1475 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1476 ; SI-NEXT: s_waitcnt expcnt(0)
1477 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1478 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1479 ; SI-NEXT: s_mov_b32 s34, s6
1480 ; SI-NEXT: s_mov_b32 s7, 0xf000
1481 ; SI-NEXT: s_mov_b32 s6, -1
1482 ; SI-NEXT: v_mov_b32_e32 v0, s34
1483 ; SI-NEXT: s_waitcnt vmcnt(0)
1484 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16
1485 ; SI-NEXT: s_waitcnt vmcnt(0)
1486 ; SI-NEXT: buffer_wbinvl1
1487 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1488 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1489 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1490 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1491 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1492 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1493 ; SI-NEXT: s_setpc_b64 s[30:31]
1495 ; VI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1497 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1498 ; VI-NEXT: s_add_u32 s34, s4, 16
1499 ; VI-NEXT: s_addc_u32 s35, s5, 0
1500 ; VI-NEXT: v_mov_b32_e32 v0, s34
1501 ; VI-NEXT: v_mov_b32_e32 v1, s35
1502 ; VI-NEXT: v_mov_b32_e32 v2, s6
1503 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1504 ; VI-NEXT: s_waitcnt vmcnt(0)
1505 ; VI-NEXT: buffer_wbinvl1_vol
1506 ; VI-NEXT: s_setpc_b64 s[30:31]
1508 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1510 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1511 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1512 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1513 ; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16
1514 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1515 ; GFX9-NEXT: buffer_wbinvl1_vol
1516 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1517 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1518 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the i32-returning seq_cst sub. The
; checks use the pointer from s[4:5] and the value from s6, expect glc on the
; atomic, and have the result written to v0 on all three targets.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1522 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1523 ; SI-LABEL: global_atomic_sub_i32_ret_scalar:
1525 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1526 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1527 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1528 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1529 ; SI-NEXT: s_waitcnt expcnt(0)
1530 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1531 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1532 ; SI-NEXT: s_mov_b32 s34, s6
1533 ; SI-NEXT: s_mov_b32 s7, 0xf000
1534 ; SI-NEXT: s_mov_b32 s6, -1
1535 ; SI-NEXT: v_mov_b32_e32 v0, s34
1536 ; SI-NEXT: s_waitcnt vmcnt(0)
1537 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
1538 ; SI-NEXT: s_waitcnt vmcnt(0)
1539 ; SI-NEXT: buffer_wbinvl1
1540 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1541 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1542 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1543 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1544 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1545 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1546 ; SI-NEXT: s_setpc_b64 s[30:31]
1548 ; VI-LABEL: global_atomic_sub_i32_ret_scalar:
1550 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1551 ; VI-NEXT: v_mov_b32_e32 v0, s4
1552 ; VI-NEXT: v_mov_b32_e32 v1, s5
1553 ; VI-NEXT: v_mov_b32_e32 v2, s6
1554 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1555 ; VI-NEXT: s_waitcnt vmcnt(0)
1556 ; VI-NEXT: buffer_wbinvl1_vol
1557 ; VI-NEXT: s_setpc_b64 s[30:31]
1559 ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar:
1561 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1562 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1563 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1564 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc
1565 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1566 ; GFX9-NEXT: buffer_wbinvl1_vol
1567 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1568 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx, inreg-argument variant of the i32-returning seq_cst sub at %out
; advanced by 4 i32 elements. The checks expect glc and a result in v0 on all
; targets, with the offset of 16 as offset:16 on SI and GFX9 and as an
; s_add_u32/s_addc_u32 into s[34:35] on VI.
; On SI the incoming s6/s7 are parked in lanes 0/1 of v1 while s6/s7 are
; overwritten to complete the s[4:7] operand; v1 is spilled and reloaded
; around that.
1572 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1573 ; SI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1575 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1576 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1577 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1578 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1579 ; SI-NEXT: s_waitcnt expcnt(0)
1580 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1581 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1582 ; SI-NEXT: s_mov_b32 s34, s6
1583 ; SI-NEXT: s_mov_b32 s7, 0xf000
1584 ; SI-NEXT: s_mov_b32 s6, -1
1585 ; SI-NEXT: v_mov_b32_e32 v0, s34
1586 ; SI-NEXT: s_waitcnt vmcnt(0)
1587 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
1588 ; SI-NEXT: s_waitcnt vmcnt(0)
1589 ; SI-NEXT: buffer_wbinvl1
1590 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1591 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1592 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1593 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1594 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1595 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1596 ; SI-NEXT: s_setpc_b64 s[30:31]
1598 ; VI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1600 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1601 ; VI-NEXT: s_add_u32 s34, s4, 16
1602 ; VI-NEXT: s_addc_u32 s35, s5, 0
1603 ; VI-NEXT: v_mov_b32_e32 v0, s34
1604 ; VI-NEXT: v_mov_b32_e32 v1, s35
1605 ; VI-NEXT: v_mov_b32_e32 v2, s6
1606 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1607 ; VI-NEXT: s_waitcnt vmcnt(0)
1608 ; VI-NEXT: buffer_wbinvl1_vol
1609 ; VI-NEXT: s_setpc_b64 s[30:31]
1611 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1613 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1614 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1615 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1616 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc
1617 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1618 ; GFX9-NEXT: buffer_wbinvl1_vol
1619 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1620 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1621 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
; i32-returning seq_cst atomicrmw sub with a constant operand of 0. The checks
; expect this to be emitted as an atomic ADD of 0 rather than a sub
; (buffer_atomic_add / flat_atomic_add / global_atomic_add, all with glc),
; with the zero materialized into v2 by v_mov_b32 beforehand.
1625 define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) {
1626 ; SI-LABEL: global_atomic_sub_0_i32_ret:
1628 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1629 ; SI-NEXT: s_mov_b32 s7, 0xf000
1630 ; SI-NEXT: s_mov_b32 s6, 0
1631 ; SI-NEXT: v_mov_b32_e32 v2, 0
1632 ; SI-NEXT: s_mov_b32 s4, s6
1633 ; SI-NEXT: s_mov_b32 s5, s6
1634 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
1635 ; SI-NEXT: s_waitcnt vmcnt(0)
1636 ; SI-NEXT: buffer_wbinvl1
1637 ; SI-NEXT: v_mov_b32_e32 v0, v2
1638 ; SI-NEXT: s_waitcnt expcnt(0)
1639 ; SI-NEXT: s_setpc_b64 s[30:31]
1641 ; VI-LABEL: global_atomic_sub_0_i32_ret:
1643 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1644 ; VI-NEXT: v_mov_b32_e32 v2, 0
1645 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1646 ; VI-NEXT: s_waitcnt vmcnt(0)
1647 ; VI-NEXT: buffer_wbinvl1_vol
1648 ; VI-NEXT: s_setpc_b64 s[30:31]
1650 ; GFX9-LABEL: global_atomic_sub_0_i32_ret:
1652 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1653 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1654 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
1655 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1656 ; GFX9-NEXT: buffer_wbinvl1_vol
1657 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1658 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst
; Void-returning seq_cst sub at %out advanced by 4 i32 elements (i64 index),
; with !amdgpu.no.remote.memory attached to the atomicrmw. The checks expect
; a native sub atomic on every target: buffer_atomic_sub offset:16 on SI,
; flat_atomic_sub after a v_add_u32/v_addc_u32 of 16 on VI, and
; global_atomic_sub offset:16 on GFX9, none with glc.
1662 define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1663 ; SI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1665 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1666 ; SI-NEXT: s_mov_b32 s6, 0
1667 ; SI-NEXT: s_mov_b32 s7, 0xf000
1668 ; SI-NEXT: s_mov_b32 s4, s6
1669 ; SI-NEXT: s_mov_b32 s5, s6
1670 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
1671 ; SI-NEXT: s_waitcnt vmcnt(0)
1672 ; SI-NEXT: buffer_wbinvl1
1673 ; SI-NEXT: s_waitcnt expcnt(0)
1674 ; SI-NEXT: s_setpc_b64 s[30:31]
1676 ; VI-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1678 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1679 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1680 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1681 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1682 ; VI-NEXT: s_waitcnt vmcnt(0)
1683 ; VI-NEXT: buffer_wbinvl1_vol
1684 ; VI-NEXT: s_setpc_b64 s[30:31]
1686 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1688 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1689 ; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16
1690 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1691 ; GFX9-NEXT: buffer_wbinvl1_vol
1692 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 gives the 16 seen in the checks above.
1693 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1694 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: value-producing `atomicrmw sub` (seq_cst) of %in into the i32 at
; element index 4 of global pointer %out, tagged with
; !amdgpu.no.remote.memory; the function type is i32 and the atomic's result
; is named %result.
; Expected code per run: every target emits the atomic with the glc modifier.
; The SI run uses buffer_atomic_sub (addr64, offset:16) and copies v2 into v0
; afterwards; the VI run adds 16 to the address and uses flat_atomic_sub
; writing v0; the GFX9 run uses global_atomic_sub with offset:16 writing v0.
; NOTE(review): metadata node !0 is defined outside the lines shown here.
1698 define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
1699 ; SI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1701 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1702 ; SI-NEXT: s_mov_b32 s6, 0
1703 ; SI-NEXT: s_mov_b32 s7, 0xf000
1704 ; SI-NEXT: s_mov_b32 s4, s6
1705 ; SI-NEXT: s_mov_b32 s5, s6
1706 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1707 ; SI-NEXT: s_waitcnt vmcnt(0)
1708 ; SI-NEXT: buffer_wbinvl1
1709 ; SI-NEXT: v_mov_b32_e32 v0, v2
1710 ; SI-NEXT: s_waitcnt expcnt(0)
1711 ; SI-NEXT: s_setpc_b64 s[30:31]
1713 ; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1715 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1716 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1717 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1718 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1719 ; VI-NEXT: s_waitcnt vmcnt(0)
1720 ; VI-NEXT: buffer_wbinvl1_vol
1721 ; VI-NEXT: s_setpc_b64 s[30:31]
1723 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1725 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726 ; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc
1727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1728 ; GFX9-NEXT: buffer_wbinvl1_vol
1729 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1730 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1731 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1735 ; ---------------------------------------------------------------------
1737 ; ---------------------------------------------------------------------
; Test: no-return `atomicrmw and` (seq_cst) of %in into the i32 at global
; pointer %ptr, with no address offset.
; Expected code per run: the SI run builds a buffer descriptor in s[4:7] and
; uses buffer_atomic_and with addr64; the VI run uses flat_atomic_and; the
; GFX9 run uses global_atomic_and. Each is followed by a wait on vmcnt(0) and
; a buffer_wbinvl1 / buffer_wbinvl1_vol.
1739 define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1740 ; SI-LABEL: global_atomic_and_i32_noret:
1742 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1743 ; SI-NEXT: s_mov_b32 s6, 0
1744 ; SI-NEXT: s_mov_b32 s7, 0xf000
1745 ; SI-NEXT: s_mov_b32 s4, s6
1746 ; SI-NEXT: s_mov_b32 s5, s6
1747 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
1748 ; SI-NEXT: s_waitcnt vmcnt(0)
1749 ; SI-NEXT: buffer_wbinvl1
1750 ; SI-NEXT: s_waitcnt expcnt(0)
1751 ; SI-NEXT: s_setpc_b64 s[30:31]
1753 ; VI-LABEL: global_atomic_and_i32_noret:
1755 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1756 ; VI-NEXT: flat_atomic_and v[0:1], v2
1757 ; VI-NEXT: s_waitcnt vmcnt(0)
1758 ; VI-NEXT: buffer_wbinvl1_vol
1759 ; VI-NEXT: s_setpc_b64 s[30:31]
1761 ; GFX9-LABEL: global_atomic_and_i32_noret:
1763 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1764 ; GFX9-NEXT: global_atomic_and v[0:1], v2, off
1765 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1766 ; GFX9-NEXT: buffer_wbinvl1_vol
1767 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1768 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: no-return `atomicrmw and` (seq_cst) of %in into the i32 at element
; index 4 of global pointer %out.
; Expected code per run: the SI run and the GFX9 run encode the displacement
; as offset:16 on buffer_atomic_and / global_atomic_and; the VI run instead
; adds 16 to the 64-bit address in v[0:1] before flat_atomic_and.
1772 define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1773 ; SI-LABEL: global_atomic_and_i32_noret_offset:
1775 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1776 ; SI-NEXT: s_mov_b32 s6, 0
1777 ; SI-NEXT: s_mov_b32 s7, 0xf000
1778 ; SI-NEXT: s_mov_b32 s4, s6
1779 ; SI-NEXT: s_mov_b32 s5, s6
1780 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
1781 ; SI-NEXT: s_waitcnt vmcnt(0)
1782 ; SI-NEXT: buffer_wbinvl1
1783 ; SI-NEXT: s_waitcnt expcnt(0)
1784 ; SI-NEXT: s_setpc_b64 s[30:31]
1786 ; VI-LABEL: global_atomic_and_i32_noret_offset:
1788 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1789 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1790 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1791 ; VI-NEXT: flat_atomic_and v[0:1], v2
1792 ; VI-NEXT: s_waitcnt vmcnt(0)
1793 ; VI-NEXT: buffer_wbinvl1_vol
1794 ; VI-NEXT: s_setpc_b64 s[30:31]
1796 ; GFX9-LABEL: global_atomic_and_i32_noret_offset:
1798 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799 ; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16
1800 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1801 ; GFX9-NEXT: buffer_wbinvl1_vol
1802 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1803 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1804 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
; Test: value-producing `atomicrmw and` (seq_cst) of %in into the i32 at
; global pointer %ptr; the function type is i32 and the atomic's result is
; named %result.
; Expected code per run: every target emits the atomic with the glc modifier.
; The SI run writes the result to v2 and then copies it into v0; the VI and
; GFX9 runs write the result directly to v0.
1808 define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1809 ; SI-LABEL: global_atomic_and_i32_ret:
1811 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1812 ; SI-NEXT: s_mov_b32 s6, 0
1813 ; SI-NEXT: s_mov_b32 s7, 0xf000
1814 ; SI-NEXT: s_mov_b32 s4, s6
1815 ; SI-NEXT: s_mov_b32 s5, s6
1816 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
1817 ; SI-NEXT: s_waitcnt vmcnt(0)
1818 ; SI-NEXT: buffer_wbinvl1
1819 ; SI-NEXT: v_mov_b32_e32 v0, v2
1820 ; SI-NEXT: s_waitcnt expcnt(0)
1821 ; SI-NEXT: s_setpc_b64 s[30:31]
1823 ; VI-LABEL: global_atomic_and_i32_ret:
1825 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1827 ; VI-NEXT: s_waitcnt vmcnt(0)
1828 ; VI-NEXT: buffer_wbinvl1_vol
1829 ; VI-NEXT: s_setpc_b64 s[30:31]
1831 ; GFX9-LABEL: global_atomic_and_i32_ret:
1833 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1834 ; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc
1835 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1836 ; GFX9-NEXT: buffer_wbinvl1_vol
1837 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1838 %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: value-producing `atomicrmw and` (seq_cst) of %in into the i32 at
; element index 4 of global pointer %out; the function type is i32 and the
; atomic's result is named %result.
; Expected code per run: every target emits the atomic with the glc modifier.
; The SI and GFX9 runs encode the displacement as offset:16; the VI run adds
; 16 to the address in v[0:1] first. The SI run copies the result from v2
; into v0; the other two runs write v0 directly.
1842 define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1843 ; SI-LABEL: global_atomic_and_i32_ret_offset:
1845 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1846 ; SI-NEXT: s_mov_b32 s6, 0
1847 ; SI-NEXT: s_mov_b32 s7, 0xf000
1848 ; SI-NEXT: s_mov_b32 s4, s6
1849 ; SI-NEXT: s_mov_b32 s5, s6
1850 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1851 ; SI-NEXT: s_waitcnt vmcnt(0)
1852 ; SI-NEXT: buffer_wbinvl1
1853 ; SI-NEXT: v_mov_b32_e32 v0, v2
1854 ; SI-NEXT: s_waitcnt expcnt(0)
1855 ; SI-NEXT: s_setpc_b64 s[30:31]
1857 ; VI-LABEL: global_atomic_and_i32_ret_offset:
1859 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1860 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1861 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1862 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1863 ; VI-NEXT: s_waitcnt vmcnt(0)
1864 ; VI-NEXT: buffer_wbinvl1_vol
1865 ; VI-NEXT: s_setpc_b64 s[30:31]
1867 ; GFX9-LABEL: global_atomic_and_i32_ret_offset:
1869 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870 ; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc
1871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1872 ; GFX9-NEXT: buffer_wbinvl1_vol
1873 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1874 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1875 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
; Test: no-return `atomicrmw and` (seq_cst) where both the global pointer
; %ptr and the operand %in are `inreg` arguments of an amdgpu_gfx function.
; Expected code per run:
;  - the SI run copies the operand from s6 to s34, overwrites s6/s7 to
;    complete an s[4:7] buffer descriptor, and issues buffer_atomic_and with
;    no VGPR address (`off`). The incoming s6/s7 are saved in lanes 0/1 of
;    v1 and read back afterwards; v1 itself is spilled to and reloaded from
;    the stack under s_xor_saveexec.
;  - the VI run moves s4, s5 and s6 into VGPRs and uses flat_atomic_and.
;  - the GFX9 run uses global_atomic_and with the SGPR pair s[4:5] as base
;    and a zero VGPR offset.
1879 define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1880 ; SI-LABEL: global_atomic_and_i32_noret_scalar:
1882 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1883 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1884 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1885 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1886 ; SI-NEXT: s_waitcnt expcnt(0)
1887 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1888 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1889 ; SI-NEXT: s_mov_b32 s34, s6
1890 ; SI-NEXT: s_mov_b32 s7, 0xf000
1891 ; SI-NEXT: s_mov_b32 s6, -1
1892 ; SI-NEXT: v_mov_b32_e32 v0, s34
1893 ; SI-NEXT: s_waitcnt vmcnt(0)
1894 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0
1895 ; SI-NEXT: s_waitcnt vmcnt(0)
1896 ; SI-NEXT: buffer_wbinvl1
1897 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1898 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1899 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1900 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1901 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1902 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1903 ; SI-NEXT: s_setpc_b64 s[30:31]
1905 ; VI-LABEL: global_atomic_and_i32_noret_scalar:
1907 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908 ; VI-NEXT: v_mov_b32_e32 v0, s4
1909 ; VI-NEXT: v_mov_b32_e32 v1, s5
1910 ; VI-NEXT: v_mov_b32_e32 v2, s6
1911 ; VI-NEXT: flat_atomic_and v[0:1], v2
1912 ; VI-NEXT: s_waitcnt vmcnt(0)
1913 ; VI-NEXT: buffer_wbinvl1_vol
1914 ; VI-NEXT: s_setpc_b64 s[30:31]
1916 ; GFX9-LABEL: global_atomic_and_i32_noret_scalar:
1918 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1919 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1920 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1921 ; GFX9-NEXT: global_atomic_and v0, v1, s[4:5]
1922 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1923 ; GFX9-NEXT: buffer_wbinvl1_vol
1924 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1925 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: no-return `atomicrmw and` (seq_cst) into the i32 at element index 4
; of %out, where %out and %in are `inreg` arguments of an amdgpu_gfx
; function.
; Expected code per run:
;  - the SI run saves s6/s7 in lanes of v1 (v1 spilled around the body),
;    rebuilds s[4:7] as a buffer descriptor, and issues buffer_atomic_and
;    with `off` addressing and offset:16.
;  - the VI run computes the address with s_add_u32/s_addc_u32 (+16) into
;    s[34:35], copies it and the operand into VGPRs, and uses
;    flat_atomic_and.
;  - the GFX9 run uses global_atomic_and with SGPR base s[4:5], a zero VGPR
;    offset and offset:16.
1929 define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1930 ; SI-LABEL: global_atomic_and_i32_noret_offset_scalar:
1932 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1933 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1934 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1935 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1936 ; SI-NEXT: s_waitcnt expcnt(0)
1937 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1938 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1939 ; SI-NEXT: s_mov_b32 s34, s6
1940 ; SI-NEXT: s_mov_b32 s7, 0xf000
1941 ; SI-NEXT: s_mov_b32 s6, -1
1942 ; SI-NEXT: v_mov_b32_e32 v0, s34
1943 ; SI-NEXT: s_waitcnt vmcnt(0)
1944 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16
1945 ; SI-NEXT: s_waitcnt vmcnt(0)
1946 ; SI-NEXT: buffer_wbinvl1
1947 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1948 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1949 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1950 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1951 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1952 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1953 ; SI-NEXT: s_setpc_b64 s[30:31]
1955 ; VI-LABEL: global_atomic_and_i32_noret_offset_scalar:
1957 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958 ; VI-NEXT: s_add_u32 s34, s4, 16
1959 ; VI-NEXT: s_addc_u32 s35, s5, 0
1960 ; VI-NEXT: v_mov_b32_e32 v0, s34
1961 ; VI-NEXT: v_mov_b32_e32 v1, s35
1962 ; VI-NEXT: v_mov_b32_e32 v2, s6
1963 ; VI-NEXT: flat_atomic_and v[0:1], v2
1964 ; VI-NEXT: s_waitcnt vmcnt(0)
1965 ; VI-NEXT: buffer_wbinvl1_vol
1966 ; VI-NEXT: s_setpc_b64 s[30:31]
1968 ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar:
1970 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1971 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1972 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1973 ; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16
1974 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1975 ; GFX9-NEXT: buffer_wbinvl1_vol
1976 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1977 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1978 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
; Test: value-producing `atomicrmw and` (seq_cst) where the global pointer
; %ptr and the operand %in are `inreg` arguments of an amdgpu_gfx function;
; the function type is i32 and the atomic's result is named %result.
; Expected code per run: every target emits the atomic with the glc modifier
; and the result lands in v0.
;  - the SI run saves s6/s7 in lanes of v1 (v1 spilled around the body),
;    rebuilds s[4:7] as a buffer descriptor, and issues buffer_atomic_and
;    with `off` addressing, using v0 as the data register.
;  - the VI run moves s4, s5 and s6 into VGPRs and uses flat_atomic_and.
;  - the GFX9 run uses global_atomic_and with SGPR base s[4:5] and a zero
;    VGPR offset.
1982 define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1983 ; SI-LABEL: global_atomic_and_i32_ret_scalar:
1985 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1986 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1987 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1988 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1989 ; SI-NEXT: s_waitcnt expcnt(0)
1990 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1991 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1992 ; SI-NEXT: s_mov_b32 s34, s6
1993 ; SI-NEXT: s_mov_b32 s7, 0xf000
1994 ; SI-NEXT: s_mov_b32 s6, -1
1995 ; SI-NEXT: v_mov_b32_e32 v0, s34
1996 ; SI-NEXT: s_waitcnt vmcnt(0)
1997 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc
1998 ; SI-NEXT: s_waitcnt vmcnt(0)
1999 ; SI-NEXT: buffer_wbinvl1
2000 ; SI-NEXT: v_readlane_b32 s7, v1, 1
2001 ; SI-NEXT: v_readlane_b32 s6, v1, 0
2002 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2003 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2004 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2005 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2006 ; SI-NEXT: s_setpc_b64 s[30:31]
2008 ; VI-LABEL: global_atomic_and_i32_ret_scalar:
2010 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2011 ; VI-NEXT: v_mov_b32_e32 v0, s4
2012 ; VI-NEXT: v_mov_b32_e32 v1, s5
2013 ; VI-NEXT: v_mov_b32_e32 v2, s6
2014 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2015 ; VI-NEXT: s_waitcnt vmcnt(0)
2016 ; VI-NEXT: buffer_wbinvl1_vol
2017 ; VI-NEXT: s_setpc_b64 s[30:31]
2019 ; GFX9-LABEL: global_atomic_and_i32_ret_scalar:
2021 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2022 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2023 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2024 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc
2025 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2026 ; GFX9-NEXT: buffer_wbinvl1_vol
2027 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2028 %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: value-producing `atomicrmw and` (seq_cst) into the i32 at element
; index 4 of %out, where %out and %in are `inreg` arguments of an amdgpu_gfx
; function; the function type is i32 and the atomic's result is named
; %result.
; Expected code per run: every target emits the atomic with the glc modifier
; and the result lands in v0.
;  - the SI run saves s6/s7 in lanes of v1 (v1 spilled around the body),
;    rebuilds s[4:7] as a buffer descriptor, and issues buffer_atomic_and
;    with `off` addressing and offset:16.
;  - the VI run computes the address (+16) into s[34:35] with scalar adds,
;    copies it and the operand into VGPRs, and uses flat_atomic_and.
;  - the GFX9 run uses global_atomic_and with SGPR base s[4:5], a zero VGPR
;    offset and offset:16.
2032 define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2033 ; SI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2035 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2036 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2037 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
2038 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2039 ; SI-NEXT: s_waitcnt expcnt(0)
2040 ; SI-NEXT: v_writelane_b32 v1, s6, 0
2041 ; SI-NEXT: v_writelane_b32 v1, s7, 1
2042 ; SI-NEXT: s_mov_b32 s34, s6
2043 ; SI-NEXT: s_mov_b32 s7, 0xf000
2044 ; SI-NEXT: s_mov_b32 s6, -1
2045 ; SI-NEXT: v_mov_b32_e32 v0, s34
2046 ; SI-NEXT: s_waitcnt vmcnt(0)
2047 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
2048 ; SI-NEXT: s_waitcnt vmcnt(0)
2049 ; SI-NEXT: buffer_wbinvl1
2050 ; SI-NEXT: v_readlane_b32 s7, v1, 1
2051 ; SI-NEXT: v_readlane_b32 s6, v1, 0
2052 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2053 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2054 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2055 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2056 ; SI-NEXT: s_setpc_b64 s[30:31]
2058 ; VI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2060 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2061 ; VI-NEXT: s_add_u32 s34, s4, 16
2062 ; VI-NEXT: s_addc_u32 s35, s5, 0
2063 ; VI-NEXT: v_mov_b32_e32 v0, s34
2064 ; VI-NEXT: v_mov_b32_e32 v1, s35
2065 ; VI-NEXT: v_mov_b32_e32 v2, s6
2066 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2067 ; VI-NEXT: s_waitcnt vmcnt(0)
2068 ; VI-NEXT: buffer_wbinvl1_vol
2069 ; VI-NEXT: s_setpc_b64 s[30:31]
2071 ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar:
2073 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2074 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2075 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2076 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc
2077 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2078 ; GFX9-NEXT: buffer_wbinvl1_vol
2079 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2080 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2081 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
; Test: no-return `atomicrmw and` (seq_cst) of %in into the i32 at element
; index 4 of global pointer %out, tagged with !amdgpu.no.remote.memory.
; Expected code per run: the SI run uses buffer_atomic_and with addr64 and
; offset:16; the VI run adds 16 to the 64-bit address in v[0:1] and then uses
; flat_atomic_and; the GFX9 run uses global_atomic_and with offset:16.
; NOTE(review): metadata node !0 is defined outside the lines shown here.
2085 define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2086 ; SI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2088 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2089 ; SI-NEXT: s_mov_b32 s6, 0
2090 ; SI-NEXT: s_mov_b32 s7, 0xf000
2091 ; SI-NEXT: s_mov_b32 s4, s6
2092 ; SI-NEXT: s_mov_b32 s5, s6
2093 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
2094 ; SI-NEXT: s_waitcnt vmcnt(0)
2095 ; SI-NEXT: buffer_wbinvl1
2096 ; SI-NEXT: s_waitcnt expcnt(0)
2097 ; SI-NEXT: s_setpc_b64 s[30:31]
2099 ; VI-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2101 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2102 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2103 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2104 ; VI-NEXT: flat_atomic_and v[0:1], v2
2105 ; VI-NEXT: s_waitcnt vmcnt(0)
2106 ; VI-NEXT: buffer_wbinvl1_vol
2107 ; VI-NEXT: s_setpc_b64 s[30:31]
2109 ; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
2111 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112 ; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16
2113 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2114 ; GFX9-NEXT: buffer_wbinvl1_vol
2115 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2116 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2117 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: value-producing `atomicrmw and` (seq_cst) of %in into the i32 at
; element index 4 of global pointer %out, tagged with
; !amdgpu.no.remote.memory; the function type is i32 and the atomic's result
; is named %result.
; Expected code per run: every target emits the atomic with the glc modifier.
; The SI run uses buffer_atomic_and (addr64, offset:16) and copies v2 into v0
; afterwards; the VI run adds 16 to the address and uses flat_atomic_and
; writing v0; the GFX9 run uses global_atomic_and with offset:16 writing v0.
; NOTE(review): metadata node !0 is defined outside the lines shown here.
2121 define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2122 ; SI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2124 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2125 ; SI-NEXT: s_mov_b32 s6, 0
2126 ; SI-NEXT: s_mov_b32 s7, 0xf000
2127 ; SI-NEXT: s_mov_b32 s4, s6
2128 ; SI-NEXT: s_mov_b32 s5, s6
2129 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2130 ; SI-NEXT: s_waitcnt vmcnt(0)
2131 ; SI-NEXT: buffer_wbinvl1
2132 ; SI-NEXT: v_mov_b32_e32 v0, v2
2133 ; SI-NEXT: s_waitcnt expcnt(0)
2134 ; SI-NEXT: s_setpc_b64 s[30:31]
2136 ; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2138 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2140 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2141 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2142 ; VI-NEXT: s_waitcnt vmcnt(0)
2143 ; VI-NEXT: buffer_wbinvl1_vol
2144 ; VI-NEXT: s_setpc_b64 s[30:31]
2146 ; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
2148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149 ; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc
2150 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX9-NEXT: buffer_wbinvl1_vol
2152 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2153 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2154 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2158 ; ---------------------------------------------------------------------
2160 ; ---------------------------------------------------------------------
; Test: no-return `atomicrmw nand` (seq_cst) of %in into the i32 at global
; pointer %ptr.
; Expected code per run: no single nand atomic instruction appears. Each
; target first loads the current value, then loops in the block labelled
; %atomicrmw.start: v_and + v_not compute the new value, an atomic cmpswap
; (glc) attempts the update, and v_cmp_eq against the previously seen value
; feeds an accumulated mask that is removed from exec. The loop repeats while
; exec is non-zero (s_cbranch_execnz), and exec is restored in
; %atomicrmw.end. The SI run copies the new/expected pair into v[5:6] for
; buffer_atomic_cmpswap; the VI and GFX9 runs use v[3:4] directly.
2162 define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
2163 ; SI-LABEL: global_atomic_nand_i32_noret:
2165 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2166 ; SI-NEXT: s_mov_b32 s6, 0
2167 ; SI-NEXT: s_mov_b32 s7, 0xf000
2168 ; SI-NEXT: s_mov_b32 s4, s6
2169 ; SI-NEXT: s_mov_b32 s5, s6
2170 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
2171 ; SI-NEXT: s_mov_b64 s[8:9], 0
2172 ; SI-NEXT: .LBB51_1: ; %atomicrmw.start
2173 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2174 ; SI-NEXT: s_waitcnt vmcnt(0)
2175 ; SI-NEXT: v_and_b32_e32 v3, v4, v2
2176 ; SI-NEXT: v_not_b32_e32 v3, v3
2177 ; SI-NEXT: s_waitcnt expcnt(0)
2178 ; SI-NEXT: v_mov_b32_e32 v6, v4
2179 ; SI-NEXT: v_mov_b32_e32 v5, v3
2180 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
2181 ; SI-NEXT: s_waitcnt vmcnt(0)
2182 ; SI-NEXT: buffer_wbinvl1
2183 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
2184 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2185 ; SI-NEXT: v_mov_b32_e32 v4, v5
2186 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2187 ; SI-NEXT: s_cbranch_execnz .LBB51_1
2188 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2189 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2190 ; SI-NEXT: s_waitcnt expcnt(0)
2191 ; SI-NEXT: s_setpc_b64 s[30:31]
2193 ; VI-LABEL: global_atomic_nand_i32_noret:
2195 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2196 ; VI-NEXT: flat_load_dword v4, v[0:1]
2197 ; VI-NEXT: s_mov_b64 s[4:5], 0
2198 ; VI-NEXT: .LBB51_1: ; %atomicrmw.start
2199 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2200 ; VI-NEXT: s_waitcnt vmcnt(0)
2201 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2202 ; VI-NEXT: v_not_b32_e32 v3, v3
2203 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2204 ; VI-NEXT: s_waitcnt vmcnt(0)
2205 ; VI-NEXT: buffer_wbinvl1_vol
2206 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2207 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2208 ; VI-NEXT: v_mov_b32_e32 v4, v3
2209 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2210 ; VI-NEXT: s_cbranch_execnz .LBB51_1
2211 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2212 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2213 ; VI-NEXT: s_setpc_b64 s[30:31]
2215 ; GFX9-LABEL: global_atomic_nand_i32_noret:
2217 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2218 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2219 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2220 ; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start
2221 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2222 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2223 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2224 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2225 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2226 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2227 ; GFX9-NEXT: buffer_wbinvl1_vol
2228 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2229 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2230 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2231 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2232 ; GFX9-NEXT: s_cbranch_execnz .LBB51_1
2233 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2234 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2235 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2236 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: no-return `atomicrmw nand` (seq_cst) of %in into the i32 at element
; index 4 of global pointer %out.
; Expected code per run: an initial load followed by a cmpswap loop in the
; block labelled %atomicrmw.start (v_and + v_not compute the new value; the
; loop exits once exec has been cleared by the accumulated v_cmp_eq mask).
; The SI and GFX9 runs carry the displacement as offset:16 on both the load
; and the cmpswap; the VI run adds 16 to the address in v[0:1] once, before
; the loop.
2240 define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
2241 ; SI-LABEL: global_atomic_nand_i32_noret_offset:
2243 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2244 ; SI-NEXT: s_mov_b32 s6, 0
2245 ; SI-NEXT: s_mov_b32 s7, 0xf000
2246 ; SI-NEXT: s_mov_b32 s4, s6
2247 ; SI-NEXT: s_mov_b32 s5, s6
2248 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
2249 ; SI-NEXT: s_mov_b64 s[8:9], 0
2250 ; SI-NEXT: .LBB52_1: ; %atomicrmw.start
2251 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2252 ; SI-NEXT: s_waitcnt vmcnt(0)
2253 ; SI-NEXT: v_and_b32_e32 v3, v4, v2
2254 ; SI-NEXT: v_not_b32_e32 v3, v3
2255 ; SI-NEXT: s_waitcnt expcnt(0)
2256 ; SI-NEXT: v_mov_b32_e32 v6, v4
2257 ; SI-NEXT: v_mov_b32_e32 v5, v3
2258 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
2259 ; SI-NEXT: s_waitcnt vmcnt(0)
2260 ; SI-NEXT: buffer_wbinvl1
2261 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
2262 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2263 ; SI-NEXT: v_mov_b32_e32 v4, v5
2264 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2265 ; SI-NEXT: s_cbranch_execnz .LBB52_1
2266 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2267 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2268 ; SI-NEXT: s_waitcnt expcnt(0)
2269 ; SI-NEXT: s_setpc_b64 s[30:31]
2271 ; VI-LABEL: global_atomic_nand_i32_noret_offset:
2273 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2274 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2275 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2276 ; VI-NEXT: flat_load_dword v4, v[0:1]
2277 ; VI-NEXT: s_mov_b64 s[4:5], 0
2278 ; VI-NEXT: .LBB52_1: ; %atomicrmw.start
2279 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2280 ; VI-NEXT: s_waitcnt vmcnt(0)
2281 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2282 ; VI-NEXT: v_not_b32_e32 v3, v3
2283 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2284 ; VI-NEXT: s_waitcnt vmcnt(0)
2285 ; VI-NEXT: buffer_wbinvl1_vol
2286 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2287 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2288 ; VI-NEXT: v_mov_b32_e32 v4, v3
2289 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2290 ; VI-NEXT: s_cbranch_execnz .LBB52_1
2291 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2292 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2293 ; VI-NEXT: s_setpc_b64 s[30:31]
2295 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset:
2297 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
2299 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2300 ; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start
2301 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2303 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2304 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2305 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX9-NEXT: buffer_wbinvl1_vol
2308 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2309 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2310 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2311 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2312 ; GFX9-NEXT: s_cbranch_execnz .LBB52_1
2313 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2314 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2315 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2316 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2317 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
; Test: value-producing `atomicrmw nand` (seq_cst) of %in into the i32 at
; global pointer %ptr; the function type is i32 and the atomic's result is
; named %result.
; Expected code per run: an initial load followed by a cmpswap loop in the
; block labelled %atomicrmw.start. Each iteration copies the last seen value
; aside (v5 on the SI run, v4 on the VI and GFX9 runs), computes
; not(value & v2) as the new value, issues the cmpswap with glc, and compares
; the returned value in v3 with the saved copy. After the loop, in
; %atomicrmw.end, v3 is copied into v0.
2321 define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
2322 ; SI-LABEL: global_atomic_nand_i32_ret:
2324 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325 ; SI-NEXT: s_mov_b32 s6, 0
2326 ; SI-NEXT: s_mov_b32 s7, 0xf000
2327 ; SI-NEXT: s_mov_b32 s4, s6
2328 ; SI-NEXT: s_mov_b32 s5, s6
2329 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
2330 ; SI-NEXT: s_mov_b64 s[8:9], 0
2331 ; SI-NEXT: .LBB53_1: ; %atomicrmw.start
2332 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2333 ; SI-NEXT: s_waitcnt vmcnt(0)
2334 ; SI-NEXT: v_mov_b32_e32 v5, v3
2335 ; SI-NEXT: s_waitcnt expcnt(0)
2336 ; SI-NEXT: v_and_b32_e32 v3, v5, v2
2337 ; SI-NEXT: v_not_b32_e32 v4, v3
2338 ; SI-NEXT: v_mov_b32_e32 v3, v4
2339 ; SI-NEXT: v_mov_b32_e32 v4, v5
2340 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
2341 ; SI-NEXT: s_waitcnt vmcnt(0)
2342 ; SI-NEXT: buffer_wbinvl1
2343 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
2344 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2345 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2346 ; SI-NEXT: s_cbranch_execnz .LBB53_1
2347 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2348 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2349 ; SI-NEXT: v_mov_b32_e32 v0, v3
2350 ; SI-NEXT: s_waitcnt expcnt(0)
2351 ; SI-NEXT: s_setpc_b64 s[30:31]
2353 ; VI-LABEL: global_atomic_nand_i32_ret:
2355 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2356 ; VI-NEXT: flat_load_dword v3, v[0:1]
2357 ; VI-NEXT: s_mov_b64 s[4:5], 0
2358 ; VI-NEXT: .LBB53_1: ; %atomicrmw.start
2359 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2360 ; VI-NEXT: s_waitcnt vmcnt(0)
2361 ; VI-NEXT: v_mov_b32_e32 v4, v3
2362 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2363 ; VI-NEXT: v_not_b32_e32 v3, v3
2364 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2365 ; VI-NEXT: s_waitcnt vmcnt(0)
2366 ; VI-NEXT: buffer_wbinvl1_vol
2367 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2368 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2369 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2370 ; VI-NEXT: s_cbranch_execnz .LBB53_1
2371 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2372 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2373 ; VI-NEXT: v_mov_b32_e32 v0, v3
2374 ; VI-NEXT: s_setpc_b64 s[30:31]
2376 ; GFX9-LABEL: global_atomic_nand_i32_ret:
2378 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2379 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
2380 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2381 ; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start
2382 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2383 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2384 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2385 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2386 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2387 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2388 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2389 ; GFX9-NEXT: buffer_wbinvl1_vol
2390 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2391 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2392 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2393 ; GFX9-NEXT: s_cbranch_execnz .LBB53_1
2394 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2395 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2396 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2397 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2398 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: value-producing `atomicrmw nand` (seq_cst) of %in into the i32 at
; element index 4 of global pointer %out; the function type is i32 and the
; atomic's result is named %result.
; Expected code per run: an initial load followed by a cmpswap loop in the
; block labelled %atomicrmw.start, as in the non-offset variant.
;  - the SI and GFX9 runs keep the address in v[0:1], use offset:16 on the
;    load and the cmpswap, and copy v3 into v0 after the loop.
;  - the VI run computes the address (+16) into v[3:4] and keeps the loaded
;    and swapped value in v0, so no trailing copy into v0 is expected there.
2402 define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
2403 ; SI-LABEL: global_atomic_nand_i32_ret_offset:
2405 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2406 ; SI-NEXT: s_mov_b32 s6, 0
2407 ; SI-NEXT: s_mov_b32 s7, 0xf000
2408 ; SI-NEXT: s_mov_b32 s4, s6
2409 ; SI-NEXT: s_mov_b32 s5, s6
2410 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
2411 ; SI-NEXT: s_mov_b64 s[8:9], 0
2412 ; SI-NEXT: .LBB54_1: ; %atomicrmw.start
2413 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2414 ; SI-NEXT: s_waitcnt vmcnt(0)
2415 ; SI-NEXT: v_mov_b32_e32 v5, v3
2416 ; SI-NEXT: s_waitcnt expcnt(0)
2417 ; SI-NEXT: v_and_b32_e32 v3, v5, v2
2418 ; SI-NEXT: v_not_b32_e32 v4, v3
2419 ; SI-NEXT: v_mov_b32_e32 v3, v4
2420 ; SI-NEXT: v_mov_b32_e32 v4, v5
2421 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
2422 ; SI-NEXT: s_waitcnt vmcnt(0)
2423 ; SI-NEXT: buffer_wbinvl1
2424 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
2425 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2426 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2427 ; SI-NEXT: s_cbranch_execnz .LBB54_1
2428 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2429 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2430 ; SI-NEXT: v_mov_b32_e32 v0, v3
2431 ; SI-NEXT: s_waitcnt expcnt(0)
2432 ; SI-NEXT: s_setpc_b64 s[30:31]
2434 ; VI-LABEL: global_atomic_nand_i32_ret_offset:
2436 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
2438 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2439 ; VI-NEXT: flat_load_dword v0, v[3:4]
2440 ; VI-NEXT: s_mov_b64 s[4:5], 0
2441 ; VI-NEXT: .LBB54_1: ; %atomicrmw.start
2442 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2443 ; VI-NEXT: s_waitcnt vmcnt(0)
2444 ; VI-NEXT: v_mov_b32_e32 v1, v0
2445 ; VI-NEXT: v_and_b32_e32 v0, v1, v2
2446 ; VI-NEXT: v_not_b32_e32 v0, v0
2447 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2448 ; VI-NEXT: s_waitcnt vmcnt(0)
2449 ; VI-NEXT: buffer_wbinvl1_vol
2450 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2451 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2452 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2453 ; VI-NEXT: s_cbranch_execnz .LBB54_1
2454 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2455 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2456 ; VI-NEXT: s_setpc_b64 s[30:31]
2458 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset:
2460 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2461 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
2462 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2463 ; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start
2464 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2465 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2466 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2467 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2468 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2469 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2470 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2471 ; GFX9-NEXT: buffer_wbinvl1_vol
2472 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2473 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2474 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2475 ; GFX9-NEXT: s_cbranch_execnz .LBB54_1
2476 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2477 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2478 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2479 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2480 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2481 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
; seq_cst atomicrmw nand on a global (addrspace(1)) pointer with the result
; unused. Uses the amdgpu_gfx calling convention with both arguments inreg,
; so the checks read the pointer from s[4:5] and the operand from s6.
; Every checked target expands the operation into a compare-and-swap retry
; loop (and + not, then a cmpswap, repeated until the value returned by the
; cmpswap equals the value the new word was computed from).
; In the SI run s6/s7 are first saved into lanes of v4 (itself spilled to the
; stack) because they are overwritten before s[4:7] is used by the buffer ops.
2485 define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2486 ; SI-LABEL: global_atomic_nand_i32_noret_scalar:
2488 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2489 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2490 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
2491 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2492 ; SI-NEXT: s_waitcnt expcnt(0)
2493 ; SI-NEXT: v_writelane_b32 v4, s6, 0
2494 ; SI-NEXT: v_writelane_b32 v4, s7, 1
2495 ; SI-NEXT: s_mov_b32 s34, s6
2496 ; SI-NEXT: s_mov_b32 s7, 0xf000
2497 ; SI-NEXT: s_mov_b32 s6, -1
2498 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
2499 ; SI-NEXT: s_mov_b64 s[36:37], 0
2500 ; SI-NEXT: .LBB55_1: ; %atomicrmw.start
2501 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2502 ; SI-NEXT: s_waitcnt vmcnt(0)
2503 ; SI-NEXT: v_and_b32_e32 v0, s34, v1
2504 ; SI-NEXT: v_not_b32_e32 v0, v0
2505 ; SI-NEXT: s_waitcnt expcnt(0)
2506 ; SI-NEXT: v_mov_b32_e32 v3, v1
2507 ; SI-NEXT: v_mov_b32_e32 v2, v0
2508 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
2509 ; SI-NEXT: s_waitcnt vmcnt(0)
2510 ; SI-NEXT: buffer_wbinvl1
2511 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
2512 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2513 ; SI-NEXT: v_mov_b32_e32 v1, v2
2514 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2515 ; SI-NEXT: s_cbranch_execnz .LBB55_1
2516 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2517 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2518 ; SI-NEXT: v_readlane_b32 s7, v4, 1
2519 ; SI-NEXT: v_readlane_b32 s6, v4, 0
2520 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2521 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
2522 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2523 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2524 ; SI-NEXT: s_setpc_b64 s[30:31]
2526 ; VI-LABEL: global_atomic_nand_i32_noret_scalar:
2528 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2529 ; VI-NEXT: v_mov_b32_e32 v0, s4
2530 ; VI-NEXT: v_mov_b32_e32 v1, s5
2531 ; VI-NEXT: flat_load_dword v3, v[0:1]
2532 ; VI-NEXT: s_mov_b64 s[34:35], 0
2533 ; VI-NEXT: .LBB55_1: ; %atomicrmw.start
2534 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2535 ; VI-NEXT: s_waitcnt vmcnt(0)
2536 ; VI-NEXT: v_and_b32_e32 v2, s6, v3
2537 ; VI-NEXT: v_not_b32_e32 v2, v2
2538 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2539 ; VI-NEXT: s_waitcnt vmcnt(0)
2540 ; VI-NEXT: buffer_wbinvl1_vol
2541 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2542 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2543 ; VI-NEXT: v_mov_b32_e32 v3, v2
2544 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2545 ; VI-NEXT: s_cbranch_execnz .LBB55_1
2546 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2547 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2548 ; VI-NEXT: s_setpc_b64 s[30:31]
2550 ; GFX9-LABEL: global_atomic_nand_i32_noret_scalar:
2552 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2553 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2554 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
2555 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2556 ; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start
2557 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2558 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2559 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
2560 ; GFX9-NEXT: v_not_b32_e32 v0, v0
2561 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
2562 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2563 ; GFX9-NEXT: buffer_wbinvl1_vol
2564 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2565 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2566 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2567 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2568 ; GFX9-NEXT: s_cbranch_execnz .LBB55_1
2569 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2570 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2571 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2572 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
; Same as the no-offset noret scalar nand test, but the atomic address is
; %out advanced by four i32 elements (16 bytes) through a getelementptr.
; The SI and GFX9 checks carry the displacement as "offset:16" on the load and
; on the cmpswap; the VI checks instead add 16 to s[4:5] with
; s_add_u32/s_addc_u32 and issue the flat ops with no offset field.
; The operation is still expanded into a cmpswap retry loop on every target.
2576 define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2577 ; SI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2579 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2581 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
2582 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2583 ; SI-NEXT: s_waitcnt expcnt(0)
2584 ; SI-NEXT: v_writelane_b32 v4, s6, 0
2585 ; SI-NEXT: v_writelane_b32 v4, s7, 1
2586 ; SI-NEXT: s_mov_b32 s34, s6
2587 ; SI-NEXT: s_mov_b32 s7, 0xf000
2588 ; SI-NEXT: s_mov_b32 s6, -1
2589 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
2590 ; SI-NEXT: s_mov_b64 s[36:37], 0
2591 ; SI-NEXT: .LBB56_1: ; %atomicrmw.start
2592 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2593 ; SI-NEXT: s_waitcnt vmcnt(0)
2594 ; SI-NEXT: v_and_b32_e32 v0, s34, v1
2595 ; SI-NEXT: v_not_b32_e32 v0, v0
2596 ; SI-NEXT: s_waitcnt expcnt(0)
2597 ; SI-NEXT: v_mov_b32_e32 v3, v1
2598 ; SI-NEXT: v_mov_b32_e32 v2, v0
2599 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
2600 ; SI-NEXT: s_waitcnt vmcnt(0)
2601 ; SI-NEXT: buffer_wbinvl1
2602 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
2603 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2604 ; SI-NEXT: v_mov_b32_e32 v1, v2
2605 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2606 ; SI-NEXT: s_cbranch_execnz .LBB56_1
2607 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2608 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2609 ; SI-NEXT: v_readlane_b32 s7, v4, 1
2610 ; SI-NEXT: v_readlane_b32 s6, v4, 0
2611 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2612 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
2613 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2614 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2615 ; SI-NEXT: s_setpc_b64 s[30:31]
2617 ; VI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2619 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620 ; VI-NEXT: s_add_u32 s34, s4, 16
2621 ; VI-NEXT: s_addc_u32 s35, s5, 0
2622 ; VI-NEXT: v_mov_b32_e32 v0, s34
2623 ; VI-NEXT: v_mov_b32_e32 v1, s35
2624 ; VI-NEXT: flat_load_dword v3, v[0:1]
2625 ; VI-NEXT: s_mov_b64 s[34:35], 0
2626 ; VI-NEXT: .LBB56_1: ; %atomicrmw.start
2627 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2628 ; VI-NEXT: s_waitcnt vmcnt(0)
2629 ; VI-NEXT: v_and_b32_e32 v2, s6, v3
2630 ; VI-NEXT: v_not_b32_e32 v2, v2
2631 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2632 ; VI-NEXT: s_waitcnt vmcnt(0)
2633 ; VI-NEXT: buffer_wbinvl1_vol
2634 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2635 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2636 ; VI-NEXT: v_mov_b32_e32 v3, v2
2637 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2638 ; VI-NEXT: s_cbranch_execnz .LBB56_1
2639 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2640 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2641 ; VI-NEXT: s_setpc_b64 s[30:31]
2643 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2645 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2646 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2647 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
2648 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2649 ; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start
2650 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2651 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2652 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
2653 ; GFX9-NEXT: v_not_b32_e32 v0, v0
2654 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
2655 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2656 ; GFX9-NEXT: buffer_wbinvl1_vol
2657 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2658 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2659 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2660 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2661 ; GFX9-NEXT: s_cbranch_execnz .LBB56_1
2662 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2663 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2664 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2665 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2666 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
; Returning variant of the scalar nand test: the function is declared i32 and
; the atomicrmw result is named %result. amdgpu_gfx calling convention with
; both arguments inreg (pointer in s[4:5], operand in s6 in the checks).
; Every target expands to a cmpswap retry loop; the value returned by the
; last cmpswap is left in v0 at function exit, with no extra copy after the
; loop.
; In the SI run s6/s7 are saved to lanes of v3 (itself spilled to the stack)
; before being overwritten.
2670 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2671 ; SI-LABEL: global_atomic_nand_i32_ret_scalar:
2673 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2674 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2675 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
2676 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2677 ; SI-NEXT: s_waitcnt expcnt(0)
2678 ; SI-NEXT: v_writelane_b32 v3, s6, 0
2679 ; SI-NEXT: v_writelane_b32 v3, s7, 1
2680 ; SI-NEXT: s_mov_b32 s34, s6
2681 ; SI-NEXT: s_mov_b32 s7, 0xf000
2682 ; SI-NEXT: s_mov_b32 s6, -1
2683 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
2684 ; SI-NEXT: s_mov_b64 s[36:37], 0
2685 ; SI-NEXT: .LBB57_1: ; %atomicrmw.start
2686 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2687 ; SI-NEXT: s_waitcnt vmcnt(0)
2688 ; SI-NEXT: v_mov_b32_e32 v2, v0
2689 ; SI-NEXT: s_waitcnt expcnt(0)
2690 ; SI-NEXT: v_and_b32_e32 v0, s34, v2
2691 ; SI-NEXT: v_not_b32_e32 v1, v0
2692 ; SI-NEXT: v_mov_b32_e32 v0, v1
2693 ; SI-NEXT: v_mov_b32_e32 v1, v2
2694 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
2695 ; SI-NEXT: s_waitcnt vmcnt(0)
2696 ; SI-NEXT: buffer_wbinvl1
2697 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
2698 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2699 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2700 ; SI-NEXT: s_cbranch_execnz .LBB57_1
2701 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2702 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2703 ; SI-NEXT: v_readlane_b32 s7, v3, 1
2704 ; SI-NEXT: v_readlane_b32 s6, v3, 0
2705 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2706 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
2707 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2708 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2709 ; SI-NEXT: s_setpc_b64 s[30:31]
2711 ; VI-LABEL: global_atomic_nand_i32_ret_scalar:
2713 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2714 ; VI-NEXT: v_mov_b32_e32 v0, s4
2715 ; VI-NEXT: v_mov_b32_e32 v1, s5
2716 ; VI-NEXT: flat_load_dword v0, v[0:1]
2717 ; VI-NEXT: v_mov_b32_e32 v1, s4
2718 ; VI-NEXT: s_mov_b64 s[34:35], 0
2719 ; VI-NEXT: v_mov_b32_e32 v2, s5
2720 ; VI-NEXT: .LBB57_1: ; %atomicrmw.start
2721 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2722 ; VI-NEXT: s_waitcnt vmcnt(0)
2723 ; VI-NEXT: v_mov_b32_e32 v4, v0
2724 ; VI-NEXT: v_and_b32_e32 v0, s6, v4
2725 ; VI-NEXT: v_not_b32_e32 v3, v0
2726 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2727 ; VI-NEXT: s_waitcnt vmcnt(0)
2728 ; VI-NEXT: buffer_wbinvl1_vol
2729 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2730 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2731 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2732 ; VI-NEXT: s_cbranch_execnz .LBB57_1
2733 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2734 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2735 ; VI-NEXT: s_setpc_b64 s[30:31]
2737 ; GFX9-LABEL: global_atomic_nand_i32_ret_scalar:
2739 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2740 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2741 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
2742 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2743 ; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start
2744 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2745 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2746 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
2747 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v3
2748 ; GFX9-NEXT: v_not_b32_e32 v2, v0
2749 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
2750 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2751 ; GFX9-NEXT: buffer_wbinvl1_vol
2752 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
2753 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2754 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2755 ; GFX9-NEXT: s_cbranch_execnz .LBB57_1
2756 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2757 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2758 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2759 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning scalar nand test with a 16-byte displacement: the atomic address
; is %out advanced by four i32 elements through a getelementptr.
; The SI and GFX9 checks encode the displacement as "offset:16" on the memory
; instructions; the VI checks add 16 to s[4:5] with s_add_u32/s_addc_u32
; first.
; Every target expands to a cmpswap retry loop whose last returned value is
; left in v0 at function exit.
2763 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2764 ; SI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2766 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2767 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2768 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
2769 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2770 ; SI-NEXT: s_waitcnt expcnt(0)
2771 ; SI-NEXT: v_writelane_b32 v3, s6, 0
2772 ; SI-NEXT: v_writelane_b32 v3, s7, 1
2773 ; SI-NEXT: s_mov_b32 s34, s6
2774 ; SI-NEXT: s_mov_b32 s7, 0xf000
2775 ; SI-NEXT: s_mov_b32 s6, -1
2776 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
2777 ; SI-NEXT: s_mov_b64 s[36:37], 0
2778 ; SI-NEXT: .LBB58_1: ; %atomicrmw.start
2779 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2780 ; SI-NEXT: s_waitcnt vmcnt(0)
2781 ; SI-NEXT: v_mov_b32_e32 v2, v0
2782 ; SI-NEXT: s_waitcnt expcnt(0)
2783 ; SI-NEXT: v_and_b32_e32 v0, s34, v2
2784 ; SI-NEXT: v_not_b32_e32 v1, v0
2785 ; SI-NEXT: v_mov_b32_e32 v0, v1
2786 ; SI-NEXT: v_mov_b32_e32 v1, v2
2787 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2788 ; SI-NEXT: s_waitcnt vmcnt(0)
2789 ; SI-NEXT: buffer_wbinvl1
2790 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
2791 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2792 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2793 ; SI-NEXT: s_cbranch_execnz .LBB58_1
2794 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2795 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2796 ; SI-NEXT: v_readlane_b32 s7, v3, 1
2797 ; SI-NEXT: v_readlane_b32 s6, v3, 0
2798 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2799 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
2800 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2801 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2802 ; SI-NEXT: s_setpc_b64 s[30:31]
2804 ; VI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2806 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2807 ; VI-NEXT: s_add_u32 s34, s4, 16
2808 ; VI-NEXT: s_addc_u32 s35, s5, 0
2809 ; VI-NEXT: v_mov_b32_e32 v1, s34
2810 ; VI-NEXT: v_mov_b32_e32 v2, s35
2811 ; VI-NEXT: flat_load_dword v0, v[1:2]
2812 ; VI-NEXT: s_mov_b64 s[34:35], 0
2813 ; VI-NEXT: .LBB58_1: ; %atomicrmw.start
2814 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2815 ; VI-NEXT: s_waitcnt vmcnt(0)
2816 ; VI-NEXT: v_mov_b32_e32 v4, v0
2817 ; VI-NEXT: v_and_b32_e32 v0, s6, v4
2818 ; VI-NEXT: v_not_b32_e32 v3, v0
2819 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2820 ; VI-NEXT: s_waitcnt vmcnt(0)
2821 ; VI-NEXT: buffer_wbinvl1_vol
2822 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2823 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2824 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2825 ; VI-NEXT: s_cbranch_execnz .LBB58_1
2826 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2827 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2828 ; VI-NEXT: s_setpc_b64 s[30:31]
2830 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar:
2832 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2833 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2834 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
2835 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2836 ; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start
2837 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2838 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2839 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
2840 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v3
2841 ; GFX9-NEXT: v_not_b32_e32 v2, v0
2842 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
2843 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2844 ; GFX9-NEXT: buffer_wbinvl1_vol
2845 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
2846 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2847 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2848 ; GFX9-NEXT: s_cbranch_execnz .LBB58_1
2849 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2850 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2851 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2852 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2853 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
; Non-returning nand at %out + 16 bytes (getelementptr by i64 4 on i32) with
; the atomicrmw tagged !amdgpu.no.remote.memory !0. Default calling
; convention, so the checks take the pointer from v[0:1] and the operand
; from v2.
; The expected code for all three targets is still the cmpswap retry loop;
; none of the checked outputs uses a single native nand instruction despite
; the metadata.
2857 define void @global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2858 ; SI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2860 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2861 ; SI-NEXT: s_mov_b32 s6, 0
2862 ; SI-NEXT: s_mov_b32 s7, 0xf000
2863 ; SI-NEXT: s_mov_b32 s4, s6
2864 ; SI-NEXT: s_mov_b32 s5, s6
2865 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
2866 ; SI-NEXT: s_mov_b64 s[8:9], 0
2867 ; SI-NEXT: .LBB59_1: ; %atomicrmw.start
2868 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2869 ; SI-NEXT: s_waitcnt vmcnt(0)
2870 ; SI-NEXT: v_and_b32_e32 v3, v4, v2
2871 ; SI-NEXT: v_not_b32_e32 v3, v3
2872 ; SI-NEXT: s_waitcnt expcnt(0)
2873 ; SI-NEXT: v_mov_b32_e32 v6, v4
2874 ; SI-NEXT: v_mov_b32_e32 v5, v3
2875 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
2876 ; SI-NEXT: s_waitcnt vmcnt(0)
2877 ; SI-NEXT: buffer_wbinvl1
2878 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
2879 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2880 ; SI-NEXT: v_mov_b32_e32 v4, v5
2881 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2882 ; SI-NEXT: s_cbranch_execnz .LBB59_1
2883 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2884 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2885 ; SI-NEXT: s_waitcnt expcnt(0)
2886 ; SI-NEXT: s_setpc_b64 s[30:31]
2888 ; VI-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2890 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2892 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2893 ; VI-NEXT: flat_load_dword v4, v[0:1]
2894 ; VI-NEXT: s_mov_b64 s[4:5], 0
2895 ; VI-NEXT: .LBB59_1: ; %atomicrmw.start
2896 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2897 ; VI-NEXT: s_waitcnt vmcnt(0)
2898 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2899 ; VI-NEXT: v_not_b32_e32 v3, v3
2900 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2901 ; VI-NEXT: s_waitcnt vmcnt(0)
2902 ; VI-NEXT: buffer_wbinvl1_vol
2903 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2904 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2905 ; VI-NEXT: v_mov_b32_e32 v4, v3
2906 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2907 ; VI-NEXT: s_cbranch_execnz .LBB59_1
2908 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2909 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2910 ; VI-NEXT: s_setpc_b64 s[30:31]
2912 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2914 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2915 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
2916 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2917 ; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start
2918 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2920 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2921 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2922 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2924 ; GFX9-NEXT: buffer_wbinvl1_vol
2925 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2926 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2927 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2928 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2929 ; GFX9-NEXT: s_cbranch_execnz .LBB59_1
2930 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2931 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2932 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2933 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2934 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning nand at %out + 16 bytes (getelementptr by i64 4 on i32) with the
; atomicrmw tagged !amdgpu.no.remote.memory !0. Default calling convention,
; so the checks take the pointer from v[0:1] and the operand from v2.
; All three targets still emit the cmpswap retry loop. The SI and GFX9 checks
; copy the final loaded value from v3 into v0 after the loop; the VI checks
; already keep it in v0 and need no copy.
2938 define i32 @global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
2939 ; SI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2941 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2942 ; SI-NEXT: s_mov_b32 s6, 0
2943 ; SI-NEXT: s_mov_b32 s7, 0xf000
2944 ; SI-NEXT: s_mov_b32 s4, s6
2945 ; SI-NEXT: s_mov_b32 s5, s6
2946 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
2947 ; SI-NEXT: s_mov_b64 s[8:9], 0
2948 ; SI-NEXT: .LBB60_1: ; %atomicrmw.start
2949 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2950 ; SI-NEXT: s_waitcnt vmcnt(0)
2951 ; SI-NEXT: v_mov_b32_e32 v5, v3
2952 ; SI-NEXT: s_waitcnt expcnt(0)
2953 ; SI-NEXT: v_and_b32_e32 v3, v5, v2
2954 ; SI-NEXT: v_not_b32_e32 v4, v3
2955 ; SI-NEXT: v_mov_b32_e32 v3, v4
2956 ; SI-NEXT: v_mov_b32_e32 v4, v5
2957 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
2958 ; SI-NEXT: s_waitcnt vmcnt(0)
2959 ; SI-NEXT: buffer_wbinvl1
2960 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
2961 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2962 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2963 ; SI-NEXT: s_cbranch_execnz .LBB60_1
2964 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2965 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2966 ; SI-NEXT: v_mov_b32_e32 v0, v3
2967 ; SI-NEXT: s_waitcnt expcnt(0)
2968 ; SI-NEXT: s_setpc_b64 s[30:31]
2970 ; VI-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2972 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2973 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
2974 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2975 ; VI-NEXT: flat_load_dword v0, v[3:4]
2976 ; VI-NEXT: s_mov_b64 s[4:5], 0
2977 ; VI-NEXT: .LBB60_1: ; %atomicrmw.start
2978 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2979 ; VI-NEXT: s_waitcnt vmcnt(0)
2980 ; VI-NEXT: v_mov_b32_e32 v1, v0
2981 ; VI-NEXT: v_and_b32_e32 v0, v1, v2
2982 ; VI-NEXT: v_not_b32_e32 v0, v0
2983 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2984 ; VI-NEXT: s_waitcnt vmcnt(0)
2985 ; VI-NEXT: buffer_wbinvl1_vol
2986 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2987 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2988 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2989 ; VI-NEXT: s_cbranch_execnz .LBB60_1
2990 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2991 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2992 ; VI-NEXT: s_setpc_b64 s[30:31]
2994 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2996 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
2998 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2999 ; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start
3000 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3001 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3002 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
3003 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
3004 ; GFX9-NEXT: v_not_b32_e32 v3, v3
3005 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
3006 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3007 ; GFX9-NEXT: buffer_wbinvl1_vol
3008 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3009 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3010 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
3011 ; GFX9-NEXT: s_cbranch_execnz .LBB60_1
3012 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3013 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
3014 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
3015 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3016 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3017 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3021 ; ---------------------------------------------------------------------
3023 ; ---------------------------------------------------------------------
; seq_cst atomicrmw or on a global (addrspace(1)) pointer with the result
; unused; default calling convention (pointer in v[0:1], operand in v2 in the
; checks).
; Unlike the nand tests, each target emits a single atomic-or instruction
; (buffer_/flat_/global_atomic_or) without the glc modifier, followed by a
; wait on vmcnt and a cache invalidate; there is no retry loop.
3025 define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3026 ; SI-LABEL: global_atomic_or_i32_noret:
3028 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3029 ; SI-NEXT: s_mov_b32 s6, 0
3030 ; SI-NEXT: s_mov_b32 s7, 0xf000
3031 ; SI-NEXT: s_mov_b32 s4, s6
3032 ; SI-NEXT: s_mov_b32 s5, s6
3033 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
3034 ; SI-NEXT: s_waitcnt vmcnt(0)
3035 ; SI-NEXT: buffer_wbinvl1
3036 ; SI-NEXT: s_waitcnt expcnt(0)
3037 ; SI-NEXT: s_setpc_b64 s[30:31]
3039 ; VI-LABEL: global_atomic_or_i32_noret:
3041 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3042 ; VI-NEXT: flat_atomic_or v[0:1], v2
3043 ; VI-NEXT: s_waitcnt vmcnt(0)
3044 ; VI-NEXT: buffer_wbinvl1_vol
3045 ; VI-NEXT: s_setpc_b64 s[30:31]
3047 ; GFX9-LABEL: global_atomic_or_i32_noret:
3049 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3050 ; GFX9-NEXT: global_atomic_or v[0:1], v2, off
3051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3052 ; GFX9-NEXT: buffer_wbinvl1_vol
3053 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3054 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
; Non-returning atomicrmw or at %out advanced by four i32 elements (16 bytes).
; The SI and GFX9 checks fold the displacement into the atomic instruction as
; "offset:16"; the VI checks add 16 to the 64-bit address in v[0:1] with
; v_add_u32/v_addc_u32 and issue flat_atomic_or with no offset field.
; A single atomic-or instruction is expected on every target (no retry loop).
3058 define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3059 ; SI-LABEL: global_atomic_or_i32_noret_offset:
3061 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3062 ; SI-NEXT: s_mov_b32 s6, 0
3063 ; SI-NEXT: s_mov_b32 s7, 0xf000
3064 ; SI-NEXT: s_mov_b32 s4, s6
3065 ; SI-NEXT: s_mov_b32 s5, s6
3066 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
3067 ; SI-NEXT: s_waitcnt vmcnt(0)
3068 ; SI-NEXT: buffer_wbinvl1
3069 ; SI-NEXT: s_waitcnt expcnt(0)
3070 ; SI-NEXT: s_setpc_b64 s[30:31]
3072 ; VI-LABEL: global_atomic_or_i32_noret_offset:
3074 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3075 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3076 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3077 ; VI-NEXT: flat_atomic_or v[0:1], v2
3078 ; VI-NEXT: s_waitcnt vmcnt(0)
3079 ; VI-NEXT: buffer_wbinvl1_vol
3080 ; VI-NEXT: s_setpc_b64 s[30:31]
3082 ; GFX9-LABEL: global_atomic_or_i32_noret_offset:
3084 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3085 ; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16
3086 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3087 ; GFX9-NEXT: buffer_wbinvl1_vol
3088 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3089 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3090 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
; Returning atomicrmw or on a global pointer: the function is declared i32
; and the atomicrmw result is named %result.
; Each target emits a single atomic-or instruction carrying the glc modifier
; (the non-returning tests in this section omit it). The SI checks get the
; result back in v2 and copy it to v0; the VI and GFX9 checks write it to v0
; directly.
3094 define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3095 ; SI-LABEL: global_atomic_or_i32_ret:
3097 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3098 ; SI-NEXT: s_mov_b32 s6, 0
3099 ; SI-NEXT: s_mov_b32 s7, 0xf000
3100 ; SI-NEXT: s_mov_b32 s4, s6
3101 ; SI-NEXT: s_mov_b32 s5, s6
3102 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
3103 ; SI-NEXT: s_waitcnt vmcnt(0)
3104 ; SI-NEXT: buffer_wbinvl1
3105 ; SI-NEXT: v_mov_b32_e32 v0, v2
3106 ; SI-NEXT: s_waitcnt expcnt(0)
3107 ; SI-NEXT: s_setpc_b64 s[30:31]
3109 ; VI-LABEL: global_atomic_or_i32_ret:
3111 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3113 ; VI-NEXT: s_waitcnt vmcnt(0)
3114 ; VI-NEXT: buffer_wbinvl1_vol
3115 ; VI-NEXT: s_setpc_b64 s[30:31]
3117 ; GFX9-LABEL: global_atomic_or_i32_ret:
3119 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3120 ; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc
3121 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3122 ; GFX9-NEXT: buffer_wbinvl1_vol
3123 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3124 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning atomicrmw or at %out advanced by four i32 elements (16 bytes).
; The SI and GFX9 checks encode the displacement as "offset:16" on the
; glc atomic; the VI checks add 16 to v[0:1] with v_add_u32/v_addc_u32 first.
; The SI checks copy the returned value from v2 to v0; the VI and GFX9 checks
; write it to v0 directly.
3128 define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3129 ; SI-LABEL: global_atomic_or_i32_ret_offset:
3131 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3132 ; SI-NEXT: s_mov_b32 s6, 0
3133 ; SI-NEXT: s_mov_b32 s7, 0xf000
3134 ; SI-NEXT: s_mov_b32 s4, s6
3135 ; SI-NEXT: s_mov_b32 s5, s6
3136 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3137 ; SI-NEXT: s_waitcnt vmcnt(0)
3138 ; SI-NEXT: buffer_wbinvl1
3139 ; SI-NEXT: v_mov_b32_e32 v0, v2
3140 ; SI-NEXT: s_waitcnt expcnt(0)
3141 ; SI-NEXT: s_setpc_b64 s[30:31]
3143 ; VI-LABEL: global_atomic_or_i32_ret_offset:
3145 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3146 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3147 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3148 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3149 ; VI-NEXT: s_waitcnt vmcnt(0)
3150 ; VI-NEXT: buffer_wbinvl1_vol
3151 ; VI-NEXT: s_setpc_b64 s[30:31]
3153 ; GFX9-LABEL: global_atomic_or_i32_ret_offset:
3155 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3156 ; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc
3157 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3158 ; GFX9-NEXT: buffer_wbinvl1_vol
3159 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3160 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3161 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
; Non-returning atomicrmw or with the amdgpu_gfx calling convention and both
; arguments inreg (pointer in s[4:5], operand in s6 in the checks).
; The operand is copied from the scalar register into a VGPR before the single
; atomic-or instruction on every target.
; In the SI run s6/s7 are saved to lanes of v1 (itself spilled to the stack)
; and restored afterwards, because they are overwritten before s[4:7] is used
; by the buffer atomic; the VI and GFX9 checks need no such save/restore.
3165 define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3166 ; SI-LABEL: global_atomic_or_i32_noret_scalar:
3168 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3169 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3170 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3171 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3172 ; SI-NEXT: s_waitcnt expcnt(0)
3173 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3174 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3175 ; SI-NEXT: s_mov_b32 s34, s6
3176 ; SI-NEXT: s_mov_b32 s7, 0xf000
3177 ; SI-NEXT: s_mov_b32 s6, -1
3178 ; SI-NEXT: v_mov_b32_e32 v0, s34
3179 ; SI-NEXT: s_waitcnt vmcnt(0)
3180 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
3181 ; SI-NEXT: s_waitcnt vmcnt(0)
3182 ; SI-NEXT: buffer_wbinvl1
3183 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3184 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3185 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3186 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3187 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3188 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3189 ; SI-NEXT: s_setpc_b64 s[30:31]
3191 ; VI-LABEL: global_atomic_or_i32_noret_scalar:
3193 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194 ; VI-NEXT: v_mov_b32_e32 v0, s4
3195 ; VI-NEXT: v_mov_b32_e32 v1, s5
3196 ; VI-NEXT: v_mov_b32_e32 v2, s6
3197 ; VI-NEXT: flat_atomic_or v[0:1], v2
3198 ; VI-NEXT: s_waitcnt vmcnt(0)
3199 ; VI-NEXT: buffer_wbinvl1_vol
3200 ; VI-NEXT: s_setpc_b64 s[30:31]
3202 ; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
3204 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3205 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3206 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3207 ; GFX9-NEXT: global_atomic_or v0, v1, s[4:5]
3208 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3209 ; GFX9-NEXT: buffer_wbinvl1_vol
3210 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3211 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
; Non-returning scalar-argument atomicrmw or at %out advanced by four i32
; elements (16 bytes); amdgpu_gfx calling convention with inreg arguments.
; The SI and GFX9 checks encode the displacement as "offset:16" on the
; atomic; the VI checks add 16 to s[4:5] with s_add_u32/s_addc_u32 and move
; the sum into v[0:1] before flat_atomic_or.
; The SI run additionally saves/restores s6/s7 through lanes of v1.
3215 define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3216 ; SI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3218 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3219 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3220 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3221 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3222 ; SI-NEXT: s_waitcnt expcnt(0)
3223 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3224 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3225 ; SI-NEXT: s_mov_b32 s34, s6
3226 ; SI-NEXT: s_mov_b32 s7, 0xf000
3227 ; SI-NEXT: s_mov_b32 s6, -1
3228 ; SI-NEXT: v_mov_b32_e32 v0, s34
3229 ; SI-NEXT: s_waitcnt vmcnt(0)
3230 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
3231 ; SI-NEXT: s_waitcnt vmcnt(0)
3232 ; SI-NEXT: buffer_wbinvl1
3233 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3234 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3235 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3236 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3237 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3238 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3239 ; SI-NEXT: s_setpc_b64 s[30:31]
3241 ; VI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3243 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3244 ; VI-NEXT: s_add_u32 s34, s4, 16
3245 ; VI-NEXT: s_addc_u32 s35, s5, 0
3246 ; VI-NEXT: v_mov_b32_e32 v0, s34
3247 ; VI-NEXT: v_mov_b32_e32 v1, s35
3248 ; VI-NEXT: v_mov_b32_e32 v2, s6
3249 ; VI-NEXT: flat_atomic_or v[0:1], v2
3250 ; VI-NEXT: s_waitcnt vmcnt(0)
3251 ; VI-NEXT: buffer_wbinvl1_vol
3252 ; VI-NEXT: s_setpc_b64 s[30:31]
3254 ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
3256 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3257 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3258 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3259 ; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16
3260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3261 ; GFX9-NEXT: buffer_wbinvl1_vol
3262 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3263 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3264 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
; Returning atomicrmw or with the amdgpu_gfx calling convention and both
; arguments inreg (pointer in s[4:5], operand in s6 in the checks).
; Each target emits a single glc atomic-or whose result lands in v0; no copy
; of the result is needed afterwards on any target.
; The SI run saves/restores s6/s7 through lanes of v1 (itself spilled to the
; stack) around the buffer atomic.
3268 define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3269 ; SI-LABEL: global_atomic_or_i32_ret_scalar:
3271 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3272 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3273 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3274 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3275 ; SI-NEXT: s_waitcnt expcnt(0)
3276 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3277 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3278 ; SI-NEXT: s_mov_b32 s34, s6
3279 ; SI-NEXT: s_mov_b32 s7, 0xf000
3280 ; SI-NEXT: s_mov_b32 s6, -1
3281 ; SI-NEXT: v_mov_b32_e32 v0, s34
3282 ; SI-NEXT: s_waitcnt vmcnt(0)
3283 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc
3284 ; SI-NEXT: s_waitcnt vmcnt(0)
3285 ; SI-NEXT: buffer_wbinvl1
3286 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3287 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3288 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3289 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3290 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3291 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3292 ; SI-NEXT: s_setpc_b64 s[30:31]
3294 ; VI-LABEL: global_atomic_or_i32_ret_scalar:
3296 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3297 ; VI-NEXT: v_mov_b32_e32 v0, s4
3298 ; VI-NEXT: v_mov_b32_e32 v1, s5
3299 ; VI-NEXT: v_mov_b32_e32 v2, s6
3300 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3301 ; VI-NEXT: s_waitcnt vmcnt(0)
3302 ; VI-NEXT: buffer_wbinvl1_vol
3303 ; VI-NEXT: s_setpc_b64 s[30:31]
3305 ; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
3307 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3308 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3309 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3310 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc
3311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3312 ; GFX9-NEXT: buffer_wbinvl1_vol
3313 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3314 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning seq_cst `atomicrmw or` through a GEP of 4 x i32 (16 bytes) from an
; `inreg` global pointer under the amdgpu_gfx calling convention. The SI and
; GFX9 expectations carry the displacement as `offset:16` on the atomic
; instruction; the VI expectations add 16 to the pointer with
; s_add_u32/s_addc_u32 before issuing flat_atomic_or. As in the SI
; expectations of the non-offset scalar variant, s6/s7 are saved to lanes of
; v1 and restored around the buffer atomic.
3318 define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3319 ; SI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3321 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3323 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3324 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3325 ; SI-NEXT: s_waitcnt expcnt(0)
3326 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3327 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3328 ; SI-NEXT: s_mov_b32 s34, s6
3329 ; SI-NEXT: s_mov_b32 s7, 0xf000
3330 ; SI-NEXT: s_mov_b32 s6, -1
3331 ; SI-NEXT: v_mov_b32_e32 v0, s34
3332 ; SI-NEXT: s_waitcnt vmcnt(0)
3333 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
3334 ; SI-NEXT: s_waitcnt vmcnt(0)
3335 ; SI-NEXT: buffer_wbinvl1
3336 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3337 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3338 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3339 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3340 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3341 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3342 ; SI-NEXT: s_setpc_b64 s[30:31]
3344 ; VI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3346 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3347 ; VI-NEXT: s_add_u32 s34, s4, 16
3348 ; VI-NEXT: s_addc_u32 s35, s5, 0
3349 ; VI-NEXT: v_mov_b32_e32 v0, s34
3350 ; VI-NEXT: v_mov_b32_e32 v1, s35
3351 ; VI-NEXT: v_mov_b32_e32 v2, s6
3352 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3353 ; VI-NEXT: s_waitcnt vmcnt(0)
3354 ; VI-NEXT: buffer_wbinvl1_vol
3355 ; VI-NEXT: s_setpc_b64 s[30:31]
3357 ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
3359 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3360 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3361 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3362 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
3363 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3364 ; GFX9-NEXT: buffer_wbinvl1_vol
3365 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3366 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3367 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw or` with a constant 0 operand on a global
; (addrspace 1) pointer. The expectations for all three targets check an
; atomic *add* of a zero register (buffer_atomic_add / flat_atomic_add /
; global_atomic_add with v2 = 0) rather than an atomic or.
3371 define i32 @global_atomic_or_0_i32_ret(ptr addrspace(1) %ptr) {
3372 ; SI-LABEL: global_atomic_or_0_i32_ret:
3374 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3375 ; SI-NEXT: s_mov_b32 s7, 0xf000
3376 ; SI-NEXT: s_mov_b32 s6, 0
3377 ; SI-NEXT: v_mov_b32_e32 v2, 0
3378 ; SI-NEXT: s_mov_b32 s4, s6
3379 ; SI-NEXT: s_mov_b32 s5, s6
3380 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
3381 ; SI-NEXT: s_waitcnt vmcnt(0)
3382 ; SI-NEXT: buffer_wbinvl1
3383 ; SI-NEXT: v_mov_b32_e32 v0, v2
3384 ; SI-NEXT: s_waitcnt expcnt(0)
3385 ; SI-NEXT: s_setpc_b64 s[30:31]
3387 ; VI-LABEL: global_atomic_or_0_i32_ret:
3389 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390 ; VI-NEXT: v_mov_b32_e32 v2, 0
3391 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
3392 ; VI-NEXT: s_waitcnt vmcnt(0)
3393 ; VI-NEXT: buffer_wbinvl1_vol
3394 ; VI-NEXT: s_setpc_b64 s[30:31]
3396 ; GFX9-LABEL: global_atomic_or_0_i32_ret:
3398 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3399 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3400 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
3401 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3402 ; GFX9-NEXT: buffer_wbinvl1_vol
3403 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3404 %result = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst
; Non-returning seq_cst `atomicrmw or` through a GEP of 4 x i32 (16 bytes),
; with `!amdgpu.no.remote.memory !0` attached to the atomic (the definition of
; !0 is outside this excerpt). The expectations check a single native atomic
; or on every target, with no glc since the result is unused; the SI and GFX9
; forms carry `offset:16`, while the VI form adds 16 to the address first.
3408 define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3409 ; SI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3411 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3412 ; SI-NEXT: s_mov_b32 s6, 0
3413 ; SI-NEXT: s_mov_b32 s7, 0xf000
3414 ; SI-NEXT: s_mov_b32 s4, s6
3415 ; SI-NEXT: s_mov_b32 s5, s6
3416 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
3417 ; SI-NEXT: s_waitcnt vmcnt(0)
3418 ; SI-NEXT: buffer_wbinvl1
3419 ; SI-NEXT: s_waitcnt expcnt(0)
3420 ; SI-NEXT: s_setpc_b64 s[30:31]
3422 ; VI-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3424 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3425 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3426 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3427 ; VI-NEXT: flat_atomic_or v[0:1], v2
3428 ; VI-NEXT: s_waitcnt vmcnt(0)
3429 ; VI-NEXT: buffer_wbinvl1_vol
3430 ; VI-NEXT: s_setpc_b64 s[30:31]
3432 ; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
3434 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3435 ; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16
3436 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3437 ; GFX9-NEXT: buffer_wbinvl1_vol
3438 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3439 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3440 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning seq_cst `atomicrmw or` through a GEP of 4 x i32 (16 bytes), with
; `!amdgpu.no.remote.memory !0` attached to the atomic (the definition of !0
; is outside this excerpt). The expectations check a single native atomic or
; with glc on every target; the SI form returns the old value in v2 and
; copies it to v0, while the VI and GFX9 forms return directly in v0.
3444 define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3445 ; SI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3447 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3448 ; SI-NEXT: s_mov_b32 s6, 0
3449 ; SI-NEXT: s_mov_b32 s7, 0xf000
3450 ; SI-NEXT: s_mov_b32 s4, s6
3451 ; SI-NEXT: s_mov_b32 s5, s6
3452 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3453 ; SI-NEXT: s_waitcnt vmcnt(0)
3454 ; SI-NEXT: buffer_wbinvl1
3455 ; SI-NEXT: v_mov_b32_e32 v0, v2
3456 ; SI-NEXT: s_waitcnt expcnt(0)
3457 ; SI-NEXT: s_setpc_b64 s[30:31]
3459 ; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3461 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3462 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3463 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3464 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3465 ; VI-NEXT: s_waitcnt vmcnt(0)
3466 ; VI-NEXT: buffer_wbinvl1_vol
3467 ; VI-NEXT: s_setpc_b64 s[30:31]
3469 ; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
3471 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3472 ; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc
3473 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3474 ; GFX9-NEXT: buffer_wbinvl1_vol
3475 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3476 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3477 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3481 ; ---------------------------------------------------------------------
3483 ; ---------------------------------------------------------------------
; Non-returning seq_cst `atomicrmw xor` on a global (addrspace 1) pointer held
; in VGPRs. The expectations check one native atomic xor per target
; (buffer_atomic_xor with addr64, flat_atomic_xor, global_atomic_xor) without
; glc, followed by a vmcnt(0) wait and an L1 invalidate.
3485 define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3486 ; SI-LABEL: global_atomic_xor_i32_noret:
3488 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3489 ; SI-NEXT: s_mov_b32 s6, 0
3490 ; SI-NEXT: s_mov_b32 s7, 0xf000
3491 ; SI-NEXT: s_mov_b32 s4, s6
3492 ; SI-NEXT: s_mov_b32 s5, s6
3493 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
3494 ; SI-NEXT: s_waitcnt vmcnt(0)
3495 ; SI-NEXT: buffer_wbinvl1
3496 ; SI-NEXT: s_waitcnt expcnt(0)
3497 ; SI-NEXT: s_setpc_b64 s[30:31]
3499 ; VI-LABEL: global_atomic_xor_i32_noret:
3501 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3502 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3503 ; VI-NEXT: s_waitcnt vmcnt(0)
3504 ; VI-NEXT: buffer_wbinvl1_vol
3505 ; VI-NEXT: s_setpc_b64 s[30:31]
3507 ; GFX9-LABEL: global_atomic_xor_i32_noret:
3509 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3510 ; GFX9-NEXT: global_atomic_xor v[0:1], v2, off
3511 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3512 ; GFX9-NEXT: buffer_wbinvl1_vol
3513 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3514 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
; Non-returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes)
; from a global pointer held in VGPRs. The SI and GFX9 expectations carry the
; displacement as `offset:16` on the atomic; the VI expectations add 16 to
; v[0:1] with v_add_u32/v_addc_u32 before flat_atomic_xor.
3518 define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3519 ; SI-LABEL: global_atomic_xor_i32_noret_offset:
3521 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3522 ; SI-NEXT: s_mov_b32 s6, 0
3523 ; SI-NEXT: s_mov_b32 s7, 0xf000
3524 ; SI-NEXT: s_mov_b32 s4, s6
3525 ; SI-NEXT: s_mov_b32 s5, s6
3526 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
3527 ; SI-NEXT: s_waitcnt vmcnt(0)
3528 ; SI-NEXT: buffer_wbinvl1
3529 ; SI-NEXT: s_waitcnt expcnt(0)
3530 ; SI-NEXT: s_setpc_b64 s[30:31]
3532 ; VI-LABEL: global_atomic_xor_i32_noret_offset:
3534 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3536 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3537 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3538 ; VI-NEXT: s_waitcnt vmcnt(0)
3539 ; VI-NEXT: buffer_wbinvl1_vol
3540 ; VI-NEXT: s_setpc_b64 s[30:31]
3542 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
3544 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3545 ; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16
3546 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3547 ; GFX9-NEXT: buffer_wbinvl1_vol
3548 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3549 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3550 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw xor` on a global (addrspace 1) pointer held in
; VGPRs. The expectations check one native atomic xor with glc per target;
; the SI form receives the old value in v2 and copies it to v0, while the VI
; and GFX9 forms write the old value to v0 directly.
3554 define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3555 ; SI-LABEL: global_atomic_xor_i32_ret:
3557 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3558 ; SI-NEXT: s_mov_b32 s6, 0
3559 ; SI-NEXT: s_mov_b32 s7, 0xf000
3560 ; SI-NEXT: s_mov_b32 s4, s6
3561 ; SI-NEXT: s_mov_b32 s5, s6
3562 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
3563 ; SI-NEXT: s_waitcnt vmcnt(0)
3564 ; SI-NEXT: buffer_wbinvl1
3565 ; SI-NEXT: v_mov_b32_e32 v0, v2
3566 ; SI-NEXT: s_waitcnt expcnt(0)
3567 ; SI-NEXT: s_setpc_b64 s[30:31]
3569 ; VI-LABEL: global_atomic_xor_i32_ret:
3571 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3572 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3573 ; VI-NEXT: s_waitcnt vmcnt(0)
3574 ; VI-NEXT: buffer_wbinvl1_vol
3575 ; VI-NEXT: s_setpc_b64 s[30:31]
3577 ; GFX9-LABEL: global_atomic_xor_i32_ret:
3579 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580 ; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
3581 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3582 ; GFX9-NEXT: buffer_wbinvl1_vol
3583 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3584 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes) from a
; global pointer held in VGPRs. The SI and GFX9 expectations carry
; `offset:16 glc` on the atomic; the VI expectations add 16 to v[0:1] first
; and then issue flat_atomic_xor with glc.
3588 define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3589 ; SI-LABEL: global_atomic_xor_i32_ret_offset:
3591 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3592 ; SI-NEXT: s_mov_b32 s6, 0
3593 ; SI-NEXT: s_mov_b32 s7, 0xf000
3594 ; SI-NEXT: s_mov_b32 s4, s6
3595 ; SI-NEXT: s_mov_b32 s5, s6
3596 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3597 ; SI-NEXT: s_waitcnt vmcnt(0)
3598 ; SI-NEXT: buffer_wbinvl1
3599 ; SI-NEXT: v_mov_b32_e32 v0, v2
3600 ; SI-NEXT: s_waitcnt expcnt(0)
3601 ; SI-NEXT: s_setpc_b64 s[30:31]
3603 ; VI-LABEL: global_atomic_xor_i32_ret_offset:
3605 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3606 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3607 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3608 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3609 ; VI-NEXT: s_waitcnt vmcnt(0)
3610 ; VI-NEXT: buffer_wbinvl1_vol
3611 ; VI-NEXT: s_setpc_b64 s[30:31]
3613 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
3615 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3616 ; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc
3617 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3618 ; GFX9-NEXT: buffer_wbinvl1_vol
3619 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3620 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3621 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
; Non-returning seq_cst `atomicrmw xor` on a global (addrspace 1) pointer,
; with the pointer and the i32 operand both passed `inreg` under the
; amdgpu_gfx calling convention. In the SI expectations s6/s7 are saved to
; lanes of v1 and restored afterwards; in between they are overwritten to
; form the s[4:7] operand of buffer_atomic_xor. The VI expectations copy the
; pointer to v[0:1] for flat_atomic_xor, and the GFX9 expectations use
; global_atomic_xor with the s[4:5] base. None of the atomics carries glc.
3625 define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3626 ; SI-LABEL: global_atomic_xor_i32_noret_scalar:
3628 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3629 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3630 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3631 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3632 ; SI-NEXT: s_waitcnt expcnt(0)
3633 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3634 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3635 ; SI-NEXT: s_mov_b32 s34, s6
3636 ; SI-NEXT: s_mov_b32 s7, 0xf000
3637 ; SI-NEXT: s_mov_b32 s6, -1
3638 ; SI-NEXT: v_mov_b32_e32 v0, s34
3639 ; SI-NEXT: s_waitcnt vmcnt(0)
3640 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
3641 ; SI-NEXT: s_waitcnt vmcnt(0)
3642 ; SI-NEXT: buffer_wbinvl1
3643 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3644 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3645 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3646 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3647 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3648 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3649 ; SI-NEXT: s_setpc_b64 s[30:31]
3651 ; VI-LABEL: global_atomic_xor_i32_noret_scalar:
3653 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654 ; VI-NEXT: v_mov_b32_e32 v0, s4
3655 ; VI-NEXT: v_mov_b32_e32 v1, s5
3656 ; VI-NEXT: v_mov_b32_e32 v2, s6
3657 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3658 ; VI-NEXT: s_waitcnt vmcnt(0)
3659 ; VI-NEXT: buffer_wbinvl1_vol
3660 ; VI-NEXT: s_setpc_b64 s[30:31]
3662 ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
3664 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3665 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3666 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3667 ; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5]
3668 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3669 ; GFX9-NEXT: buffer_wbinvl1_vol
3670 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3671 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
; Non-returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes)
; from an `inreg` global pointer under the amdgpu_gfx calling convention. The
; SI and GFX9 expectations carry the displacement as `offset:16` on the
; atomic; the VI expectations add 16 to the pointer with
; s_add_u32/s_addc_u32 into s[34:35] before issuing flat_atomic_xor. In the
; SI expectations s6/s7 are saved to lanes of v1 and restored around the
; buffer atomic.
3675 define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3676 ; SI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3678 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3679 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3680 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3681 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3682 ; SI-NEXT: s_waitcnt expcnt(0)
3683 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3684 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3685 ; SI-NEXT: s_mov_b32 s34, s6
3686 ; SI-NEXT: s_mov_b32 s7, 0xf000
3687 ; SI-NEXT: s_mov_b32 s6, -1
3688 ; SI-NEXT: v_mov_b32_e32 v0, s34
3689 ; SI-NEXT: s_waitcnt vmcnt(0)
3690 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
3691 ; SI-NEXT: s_waitcnt vmcnt(0)
3692 ; SI-NEXT: buffer_wbinvl1
3693 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3694 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3695 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3696 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3697 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3698 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3699 ; SI-NEXT: s_setpc_b64 s[30:31]
3701 ; VI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3703 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3704 ; VI-NEXT: s_add_u32 s34, s4, 16
3705 ; VI-NEXT: s_addc_u32 s35, s5, 0
3706 ; VI-NEXT: v_mov_b32_e32 v0, s34
3707 ; VI-NEXT: v_mov_b32_e32 v1, s35
3708 ; VI-NEXT: v_mov_b32_e32 v2, s6
3709 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3710 ; VI-NEXT: s_waitcnt vmcnt(0)
3711 ; VI-NEXT: buffer_wbinvl1_vol
3712 ; VI-NEXT: s_setpc_b64 s[30:31]
3714 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3716 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3717 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3718 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3719 ; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16
3720 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3721 ; GFX9-NEXT: buffer_wbinvl1_vol
3722 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3723 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3724 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw xor` on a global (addrspace 1) pointer, with
; the pointer and the i32 operand both passed `inreg` under the amdgpu_gfx
; calling convention. In the SI expectations s6/s7 are saved to lanes of v1
; and restored afterwards; in between they are overwritten to form the
; s[4:7] operand of buffer_atomic_xor. The VI expectations use
; flat_atomic_xor on v[0:1], and the GFX9 expectations use global_atomic_xor
; with the s[4:5] base. All three atomics carry glc and return in v0.
3728 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3729 ; SI-LABEL: global_atomic_xor_i32_ret_scalar:
3731 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3732 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3733 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3734 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3735 ; SI-NEXT: s_waitcnt expcnt(0)
3736 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3737 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3738 ; SI-NEXT: s_mov_b32 s34, s6
3739 ; SI-NEXT: s_mov_b32 s7, 0xf000
3740 ; SI-NEXT: s_mov_b32 s6, -1
3741 ; SI-NEXT: v_mov_b32_e32 v0, s34
3742 ; SI-NEXT: s_waitcnt vmcnt(0)
3743 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc
3744 ; SI-NEXT: s_waitcnt vmcnt(0)
3745 ; SI-NEXT: buffer_wbinvl1
3746 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3747 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3748 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3749 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3750 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3751 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3752 ; SI-NEXT: s_setpc_b64 s[30:31]
3754 ; VI-LABEL: global_atomic_xor_i32_ret_scalar:
3756 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3757 ; VI-NEXT: v_mov_b32_e32 v0, s4
3758 ; VI-NEXT: v_mov_b32_e32 v1, s5
3759 ; VI-NEXT: v_mov_b32_e32 v2, s6
3760 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3761 ; VI-NEXT: s_waitcnt vmcnt(0)
3762 ; VI-NEXT: buffer_wbinvl1_vol
3763 ; VI-NEXT: s_setpc_b64 s[30:31]
3765 ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
3767 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3768 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3769 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3770 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc
3771 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3772 ; GFX9-NEXT: buffer_wbinvl1_vol
3773 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3774 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes) from
; an `inreg` global pointer under the amdgpu_gfx calling convention. The SI
; and GFX9 expectations carry `offset:16 glc` on the atomic; the VI
; expectations add 16 to the pointer with s_add_u32/s_addc_u32 into s[34:35]
; before issuing flat_atomic_xor with glc. In the SI expectations s6/s7 are
; saved to lanes of v1 and restored around the buffer atomic.
3778 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3779 ; SI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3781 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3782 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3783 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3784 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3785 ; SI-NEXT: s_waitcnt expcnt(0)
3786 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3787 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3788 ; SI-NEXT: s_mov_b32 s34, s6
3789 ; SI-NEXT: s_mov_b32 s7, 0xf000
3790 ; SI-NEXT: s_mov_b32 s6, -1
3791 ; SI-NEXT: v_mov_b32_e32 v0, s34
3792 ; SI-NEXT: s_waitcnt vmcnt(0)
3793 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
3794 ; SI-NEXT: s_waitcnt vmcnt(0)
3795 ; SI-NEXT: buffer_wbinvl1
3796 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3797 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3798 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3799 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3800 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3801 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3802 ; SI-NEXT: s_setpc_b64 s[30:31]
3804 ; VI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3806 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3807 ; VI-NEXT: s_add_u32 s34, s4, 16
3808 ; VI-NEXT: s_addc_u32 s35, s5, 0
3809 ; VI-NEXT: v_mov_b32_e32 v0, s34
3810 ; VI-NEXT: v_mov_b32_e32 v1, s35
3811 ; VI-NEXT: v_mov_b32_e32 v2, s6
3812 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3813 ; VI-NEXT: s_waitcnt vmcnt(0)
3814 ; VI-NEXT: buffer_wbinvl1_vol
3815 ; VI-NEXT: s_setpc_b64 s[30:31]
3817 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3819 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3820 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3821 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3822 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
3823 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3824 ; GFX9-NEXT: buffer_wbinvl1_vol
3825 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3826 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3827 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw xor` with a constant 0 operand on a global
; (addrspace 1) pointer. The expectations for all three targets check an
; atomic *add* of a zero register (buffer_atomic_add / flat_atomic_add /
; global_atomic_add with v2 = 0) rather than an atomic xor.
3831 define i32 @global_atomic_xor_0_i32_ret(ptr addrspace(1) %ptr) {
3832 ; SI-LABEL: global_atomic_xor_0_i32_ret:
3834 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3835 ; SI-NEXT: s_mov_b32 s7, 0xf000
3836 ; SI-NEXT: s_mov_b32 s6, 0
3837 ; SI-NEXT: v_mov_b32_e32 v2, 0
3838 ; SI-NEXT: s_mov_b32 s4, s6
3839 ; SI-NEXT: s_mov_b32 s5, s6
3840 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
3841 ; SI-NEXT: s_waitcnt vmcnt(0)
3842 ; SI-NEXT: buffer_wbinvl1
3843 ; SI-NEXT: v_mov_b32_e32 v0, v2
3844 ; SI-NEXT: s_waitcnt expcnt(0)
3845 ; SI-NEXT: s_setpc_b64 s[30:31]
3847 ; VI-LABEL: global_atomic_xor_0_i32_ret:
3849 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850 ; VI-NEXT: v_mov_b32_e32 v2, 0
3851 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
3852 ; VI-NEXT: s_waitcnt vmcnt(0)
3853 ; VI-NEXT: buffer_wbinvl1_vol
3854 ; VI-NEXT: s_setpc_b64 s[30:31]
3856 ; GFX9-LABEL: global_atomic_xor_0_i32_ret:
3858 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3859 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3860 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
3861 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3862 ; GFX9-NEXT: buffer_wbinvl1_vol
3863 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3864 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 0 seq_cst
; Non-returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes),
; with `!amdgpu.no.remote.memory !0` attached to the atomic (the definition of
; !0 is outside this excerpt). The expectations check a single native atomic
; xor on every target, with no glc since the result is unused; the SI and
; GFX9 forms carry `offset:16`, while the VI form adds 16 to the address
; first.
3868 define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3869 ; SI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3871 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3872 ; SI-NEXT: s_mov_b32 s6, 0
3873 ; SI-NEXT: s_mov_b32 s7, 0xf000
3874 ; SI-NEXT: s_mov_b32 s4, s6
3875 ; SI-NEXT: s_mov_b32 s5, s6
3876 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
3877 ; SI-NEXT: s_waitcnt vmcnt(0)
3878 ; SI-NEXT: buffer_wbinvl1
3879 ; SI-NEXT: s_waitcnt expcnt(0)
3880 ; SI-NEXT: s_setpc_b64 s[30:31]
3882 ; VI-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3884 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3885 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3886 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3887 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3888 ; VI-NEXT: s_waitcnt vmcnt(0)
3889 ; VI-NEXT: buffer_wbinvl1_vol
3890 ; VI-NEXT: s_setpc_b64 s[30:31]
3892 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory:
3894 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3895 ; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16
3896 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3897 ; GFX9-NEXT: buffer_wbinvl1_vol
3898 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3899 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3900 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning seq_cst `atomicrmw xor` through a GEP of 4 x i32 (16 bytes), with
; `!amdgpu.no.remote.memory !0` attached to the atomic (the definition of !0
; is outside this excerpt). The expectations check a single native atomic xor
; with glc on every target; the SI form returns the old value in v2 and
; copies it to v0, while the VI and GFX9 forms return directly in v0.
3904 define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
3905 ; SI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3907 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3908 ; SI-NEXT: s_mov_b32 s6, 0
3909 ; SI-NEXT: s_mov_b32 s7, 0xf000
3910 ; SI-NEXT: s_mov_b32 s4, s6
3911 ; SI-NEXT: s_mov_b32 s5, s6
3912 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3913 ; SI-NEXT: s_waitcnt vmcnt(0)
3914 ; SI-NEXT: buffer_wbinvl1
3915 ; SI-NEXT: v_mov_b32_e32 v0, v2
3916 ; SI-NEXT: s_waitcnt expcnt(0)
3917 ; SI-NEXT: s_setpc_b64 s[30:31]
3919 ; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3921 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3922 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3923 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3924 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3925 ; VI-NEXT: s_waitcnt vmcnt(0)
3926 ; VI-NEXT: buffer_wbinvl1_vol
3927 ; VI-NEXT: s_setpc_b64 s[30:31]
3929 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3931 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3932 ; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc
3933 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3934 ; GFX9-NEXT: buffer_wbinvl1_vol
3935 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3936 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3937 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3941 ; ---------------------------------------------------------------------
3943 ; ---------------------------------------------------------------------
; Non-returning seq_cst `atomicrmw max` (signed) on a global (addrspace 1)
; pointer held in VGPRs. Unlike the or/xor tests, the expectations do not
; check a single native atomic: each target checks a loop between the
; %atomicrmw.start and %atomicrmw.end blocks that loads the current value,
; computes v_max_i32 against the operand, and issues an atomic cmpswap with
; glc. Lanes whose cmpswap returned the compared value are accumulated into a
; mask and removed from exec; the loop branches back while exec is non-zero.
3945 define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3946 ; SI-LABEL: global_atomic_max_i32_noret:
3948 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3949 ; SI-NEXT: s_mov_b32 s6, 0
3950 ; SI-NEXT: s_mov_b32 s7, 0xf000
3951 ; SI-NEXT: s_mov_b32 s4, s6
3952 ; SI-NEXT: s_mov_b32 s5, s6
3953 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
3954 ; SI-NEXT: s_mov_b64 s[8:9], 0
3955 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start
3956 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3957 ; SI-NEXT: s_waitcnt vmcnt(0)
3958 ; SI-NEXT: v_max_i32_e32 v3, v4, v2
3959 ; SI-NEXT: s_waitcnt expcnt(0)
3960 ; SI-NEXT: v_mov_b32_e32 v6, v4
3961 ; SI-NEXT: v_mov_b32_e32 v5, v3
3962 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
3963 ; SI-NEXT: s_waitcnt vmcnt(0)
3964 ; SI-NEXT: buffer_wbinvl1
3965 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
3966 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
3967 ; SI-NEXT: v_mov_b32_e32 v4, v5
3968 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
3969 ; SI-NEXT: s_cbranch_execnz .LBB83_1
3970 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3971 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
3972 ; SI-NEXT: s_waitcnt expcnt(0)
3973 ; SI-NEXT: s_setpc_b64 s[30:31]
3975 ; VI-LABEL: global_atomic_max_i32_noret:
3977 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3978 ; VI-NEXT: flat_load_dword v4, v[0:1]
3979 ; VI-NEXT: s_mov_b64 s[4:5], 0
3980 ; VI-NEXT: .LBB83_1: ; %atomicrmw.start
3981 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3982 ; VI-NEXT: s_waitcnt vmcnt(0)
3983 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
3984 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3985 ; VI-NEXT: s_waitcnt vmcnt(0)
3986 ; VI-NEXT: buffer_wbinvl1_vol
3987 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3988 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3989 ; VI-NEXT: v_mov_b32_e32 v4, v3
3990 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
3991 ; VI-NEXT: s_cbranch_execnz .LBB83_1
3992 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3993 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
3994 ; VI-NEXT: s_setpc_b64 s[30:31]
3996 ; GFX9-LABEL: global_atomic_max_i32_noret:
3998 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3999 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
4000 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4001 ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start
4002 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4003 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4004 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4005 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
4006 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4007 ; GFX9-NEXT: buffer_wbinvl1_vol
4008 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4009 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4010 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4011 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4012 ; GFX9-NEXT: s_cbranch_execnz .LBB83_1
4013 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4014 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4015 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4016 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
; Non-returning seq_cst `atomicrmw max` (signed) through a GEP of 4 x i32
; (16 bytes) from a global pointer held in VGPRs. The expectations check a
; cmpswap loop (%atomicrmw.start to %atomicrmw.end) on every target rather
; than a single native atomic. The SI and GFX9 forms put `offset:16` on both
; the initial load and the cmpswap; the VI form adds 16 to v[0:1] once,
; before the loop, and uses the adjusted address for both.
4020 define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
4021 ; SI-LABEL: global_atomic_max_i32_noret_offset:
4023 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4024 ; SI-NEXT: s_mov_b32 s6, 0
4025 ; SI-NEXT: s_mov_b32 s7, 0xf000
4026 ; SI-NEXT: s_mov_b32 s4, s6
4027 ; SI-NEXT: s_mov_b32 s5, s6
4028 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
4029 ; SI-NEXT: s_mov_b64 s[8:9], 0
4030 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start
4031 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4032 ; SI-NEXT: s_waitcnt vmcnt(0)
4033 ; SI-NEXT: v_max_i32_e32 v3, v4, v2
4034 ; SI-NEXT: s_waitcnt expcnt(0)
4035 ; SI-NEXT: v_mov_b32_e32 v6, v4
4036 ; SI-NEXT: v_mov_b32_e32 v5, v3
4037 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
4038 ; SI-NEXT: s_waitcnt vmcnt(0)
4039 ; SI-NEXT: buffer_wbinvl1
4040 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
4041 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4042 ; SI-NEXT: v_mov_b32_e32 v4, v5
4043 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4044 ; SI-NEXT: s_cbranch_execnz .LBB84_1
4045 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4046 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4047 ; SI-NEXT: s_waitcnt expcnt(0)
4048 ; SI-NEXT: s_setpc_b64 s[30:31]
4050 ; VI-LABEL: global_atomic_max_i32_noret_offset:
4052 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4053 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
4054 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4055 ; VI-NEXT: flat_load_dword v4, v[0:1]
4056 ; VI-NEXT: s_mov_b64 s[4:5], 0
4057 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start
4058 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4059 ; VI-NEXT: s_waitcnt vmcnt(0)
4060 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
4061 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4062 ; VI-NEXT: s_waitcnt vmcnt(0)
4063 ; VI-NEXT: buffer_wbinvl1_vol
4064 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4065 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4066 ; VI-NEXT: v_mov_b32_e32 v4, v3
4067 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4068 ; VI-NEXT: s_cbranch_execnz .LBB84_1
4069 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4070 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4071 ; VI-NEXT: s_setpc_b64 s[30:31]
4073 ; GFX9-LABEL: global_atomic_max_i32_noret_offset:
4075 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4076 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
4077 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4078 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
4079 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4080 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4081 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4082 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
4083 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4084 ; GFX9-NEXT: buffer_wbinvl1_vol
4085 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4086 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4087 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4088 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4089 ; GFX9-NEXT: s_cbranch_execnz .LBB84_1
4090 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4091 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4092 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4093 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4094 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw max` (signed) on a global (addrspace 1) pointer
; held in VGPRs. The expectations check a cmpswap loop (%atomicrmw.start to
; %atomicrmw.end) on every target rather than a single native atomic: the
; previously observed value is copied aside, v_max_i32 produces the new
; value, and an atomic cmpswap with glc is retried while any lane remains in
; exec. After the loop the value returned by the last cmpswap (v3) is copied
; to v0 as the function result.
4098 define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
4099 ; SI-LABEL: global_atomic_max_i32_ret:
4101 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4102 ; SI-NEXT: s_mov_b32 s6, 0
4103 ; SI-NEXT: s_mov_b32 s7, 0xf000
4104 ; SI-NEXT: s_mov_b32 s4, s6
4105 ; SI-NEXT: s_mov_b32 s5, s6
4106 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
4107 ; SI-NEXT: s_mov_b64 s[8:9], 0
4108 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start
4109 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4110 ; SI-NEXT: s_waitcnt vmcnt(0)
4111 ; SI-NEXT: v_mov_b32_e32 v5, v3
4112 ; SI-NEXT: s_waitcnt expcnt(0)
4113 ; SI-NEXT: v_max_i32_e32 v4, v5, v2
4114 ; SI-NEXT: v_mov_b32_e32 v3, v4
4115 ; SI-NEXT: v_mov_b32_e32 v4, v5
4116 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
4117 ; SI-NEXT: s_waitcnt vmcnt(0)
4118 ; SI-NEXT: buffer_wbinvl1
4119 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
4120 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4121 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4122 ; SI-NEXT: s_cbranch_execnz .LBB85_1
4123 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4124 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4125 ; SI-NEXT: v_mov_b32_e32 v0, v3
4126 ; SI-NEXT: s_waitcnt expcnt(0)
4127 ; SI-NEXT: s_setpc_b64 s[30:31]
4129 ; VI-LABEL: global_atomic_max_i32_ret:
4131 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4132 ; VI-NEXT: flat_load_dword v3, v[0:1]
4133 ; VI-NEXT: s_mov_b64 s[4:5], 0
4134 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start
4135 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4136 ; VI-NEXT: s_waitcnt vmcnt(0)
4137 ; VI-NEXT: v_mov_b32_e32 v4, v3
4138 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
4139 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4140 ; VI-NEXT: s_waitcnt vmcnt(0)
4141 ; VI-NEXT: buffer_wbinvl1_vol
4142 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4143 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4144 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4145 ; VI-NEXT: s_cbranch_execnz .LBB85_1
4146 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4147 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4148 ; VI-NEXT: v_mov_b32_e32 v0, v3
4149 ; VI-NEXT: s_setpc_b64 s[30:31]
4151 ; GFX9-LABEL: global_atomic_max_i32_ret:
4153 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4154 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
4155 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4156 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
4157 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4159 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4160 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4161 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
4162 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4163 ; GFX9-NEXT: buffer_wbinvl1_vol
4164 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4165 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4166 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4167 ; GFX9-NEXT: s_cbranch_execnz .LBB85_1
4168 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4169 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4170 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
4171 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4172 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
; Same as the returning signed-max test, but the atomicrmw operates on
; %out + 4 i32 elements (a 16-byte offset). The SI and GFX9 checks expect the
; offset folded into the memory instructions as offset:16. The VI checks
; expect the 64-bit address to be formed first with v_add_u32/v_addc_u32 into
; v[3:4], and the loop result to land directly in v0 (no trailing copy).
4176 define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
4177 ; SI-LABEL: global_atomic_max_i32_ret_offset:
4179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4180 ; SI-NEXT: s_mov_b32 s6, 0
4181 ; SI-NEXT: s_mov_b32 s7, 0xf000
4182 ; SI-NEXT: s_mov_b32 s4, s6
4183 ; SI-NEXT: s_mov_b32 s5, s6
4184 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
4185 ; SI-NEXT: s_mov_b64 s[8:9], 0
4186 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start
4187 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4188 ; SI-NEXT: s_waitcnt vmcnt(0)
4189 ; SI-NEXT: v_mov_b32_e32 v5, v3
4190 ; SI-NEXT: s_waitcnt expcnt(0)
4191 ; SI-NEXT: v_max_i32_e32 v4, v5, v2
4192 ; SI-NEXT: v_mov_b32_e32 v3, v4
4193 ; SI-NEXT: v_mov_b32_e32 v4, v5
4194 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
4195 ; SI-NEXT: s_waitcnt vmcnt(0)
4196 ; SI-NEXT: buffer_wbinvl1
4197 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
4198 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4199 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4200 ; SI-NEXT: s_cbranch_execnz .LBB86_1
4201 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4202 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4203 ; SI-NEXT: v_mov_b32_e32 v0, v3
4204 ; SI-NEXT: s_waitcnt expcnt(0)
4205 ; SI-NEXT: s_setpc_b64 s[30:31]
4207 ; VI-LABEL: global_atomic_max_i32_ret_offset:
4209 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4210 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
4211 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4212 ; VI-NEXT: flat_load_dword v0, v[3:4]
4213 ; VI-NEXT: s_mov_b64 s[4:5], 0
4214 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start
4215 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4216 ; VI-NEXT: s_waitcnt vmcnt(0)
4217 ; VI-NEXT: v_mov_b32_e32 v1, v0
4218 ; VI-NEXT: v_max_i32_e32 v0, v1, v2
4219 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4220 ; VI-NEXT: s_waitcnt vmcnt(0)
4221 ; VI-NEXT: buffer_wbinvl1_vol
4222 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4223 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4224 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4225 ; VI-NEXT: s_cbranch_execnz .LBB86_1
4226 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4227 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4228 ; VI-NEXT: s_setpc_b64 s[30:31]
4230 ; GFX9-LABEL: global_atomic_max_i32_ret_offset:
4232 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4233 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
4234 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4235 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
4236 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4238 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4239 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4240 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
4241 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4242 ; GFX9-NEXT: buffer_wbinvl1_vol
4243 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4244 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4245 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4246 ; GFX9-NEXT: s_cbranch_execnz .LBB86_1
4247 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4248 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4249 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
4250 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Element index 4 of i32 is the 16-byte offset seen in the checks above.
4251 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4252 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) with the amdgpu_gfx calling convention and
; both arguments marked inreg; the checks show the pointer in s[4:5] and the
; input in s6. The function is declared void.
; Expected code is a cmpswap loop on every target:
;  - SI run: s6/s7 are saved into lanes 0/1 of v4 (v4 itself is spilled to the
;    stack), the input is copied to s34, and s6/s7 are then overwritten so
;    that s[4:7] can serve as the resource operand of the buffer instructions;
;    s6/s7 and v4 are restored after the loop.
;  - VI run: the pointer is copied into v[0:1] for the flat instructions.
;  - GFX9 run: the global instructions take s[4:5] directly, with a zero
;    VGPR (v2) as the vector address operand.
4256 define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4257 ; SI-LABEL: global_atomic_max_i32_noret_scalar:
4259 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4260 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4261 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
4262 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4263 ; SI-NEXT: s_waitcnt expcnt(0)
4264 ; SI-NEXT: v_writelane_b32 v4, s6, 0
4265 ; SI-NEXT: v_writelane_b32 v4, s7, 1
4266 ; SI-NEXT: s_mov_b32 s34, s6
4267 ; SI-NEXT: s_mov_b32 s7, 0xf000
4268 ; SI-NEXT: s_mov_b32 s6, -1
4269 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
4270 ; SI-NEXT: s_mov_b64 s[36:37], 0
4271 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start
4272 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4273 ; SI-NEXT: s_waitcnt vmcnt(0)
4274 ; SI-NEXT: v_max_i32_e32 v0, s34, v1
4275 ; SI-NEXT: s_waitcnt expcnt(0)
4276 ; SI-NEXT: v_mov_b32_e32 v3, v1
4277 ; SI-NEXT: v_mov_b32_e32 v2, v0
4278 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4279 ; SI-NEXT: s_waitcnt vmcnt(0)
4280 ; SI-NEXT: buffer_wbinvl1
4281 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4282 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4283 ; SI-NEXT: v_mov_b32_e32 v1, v2
4284 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4285 ; SI-NEXT: s_cbranch_execnz .LBB87_1
4286 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4287 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4288 ; SI-NEXT: v_readlane_b32 s7, v4, 1
4289 ; SI-NEXT: v_readlane_b32 s6, v4, 0
4290 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4291 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
4292 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4293 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4294 ; SI-NEXT: s_setpc_b64 s[30:31]
4296 ; VI-LABEL: global_atomic_max_i32_noret_scalar:
4298 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4299 ; VI-NEXT: v_mov_b32_e32 v0, s4
4300 ; VI-NEXT: v_mov_b32_e32 v1, s5
4301 ; VI-NEXT: flat_load_dword v3, v[0:1]
4302 ; VI-NEXT: s_mov_b64 s[34:35], 0
4303 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start
4304 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4305 ; VI-NEXT: s_waitcnt vmcnt(0)
4306 ; VI-NEXT: v_max_i32_e32 v2, s6, v3
4307 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4308 ; VI-NEXT: s_waitcnt vmcnt(0)
4309 ; VI-NEXT: buffer_wbinvl1_vol
4310 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4311 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4312 ; VI-NEXT: v_mov_b32_e32 v3, v2
4313 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4314 ; VI-NEXT: s_cbranch_execnz .LBB87_1
4315 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4316 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4317 ; VI-NEXT: s_setpc_b64 s[30:31]
4319 ; GFX9-LABEL: global_atomic_max_i32_noret_scalar:
4321 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4322 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4323 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
4324 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4325 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
4326 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4328 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4329 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
4330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4331 ; GFX9-NEXT: buffer_wbinvl1_vol
4332 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4333 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4334 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4335 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4336 ; GFX9-NEXT: s_cbranch_execnz .LBB87_1
4337 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4338 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4339 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4340 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx / inreg variant of the signed-max test where the atomicrmw
; targets %out + 4 i32 elements (16 bytes); the function is declared void.
; The SI and GFX9 checks expect offset:16 on the load and on the cmpswap.
; The VI checks expect the address to be computed with s_add_u32/s_addc_u32
; into s[34:35] and copied to v[0:1] before the flat load; s[34:35] is then
; reused as the loop's lane-completion mask.
; As in the non-offset scalar test, the SI checks save s6/s7 in lanes of v4
; before overwriting them for the s[4:7] buffer resource operand.
4344 define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4345 ; SI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4347 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4348 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4349 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
4350 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4351 ; SI-NEXT: s_waitcnt expcnt(0)
4352 ; SI-NEXT: v_writelane_b32 v4, s6, 0
4353 ; SI-NEXT: v_writelane_b32 v4, s7, 1
4354 ; SI-NEXT: s_mov_b32 s34, s6
4355 ; SI-NEXT: s_mov_b32 s7, 0xf000
4356 ; SI-NEXT: s_mov_b32 s6, -1
4357 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
4358 ; SI-NEXT: s_mov_b64 s[36:37], 0
4359 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start
4360 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4361 ; SI-NEXT: s_waitcnt vmcnt(0)
4362 ; SI-NEXT: v_max_i32_e32 v0, s34, v1
4363 ; SI-NEXT: s_waitcnt expcnt(0)
4364 ; SI-NEXT: v_mov_b32_e32 v3, v1
4365 ; SI-NEXT: v_mov_b32_e32 v2, v0
4366 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4367 ; SI-NEXT: s_waitcnt vmcnt(0)
4368 ; SI-NEXT: buffer_wbinvl1
4369 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4370 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4371 ; SI-NEXT: v_mov_b32_e32 v1, v2
4372 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4373 ; SI-NEXT: s_cbranch_execnz .LBB88_1
4374 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4375 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4376 ; SI-NEXT: v_readlane_b32 s7, v4, 1
4377 ; SI-NEXT: v_readlane_b32 s6, v4, 0
4378 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4379 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
4380 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4381 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4382 ; SI-NEXT: s_setpc_b64 s[30:31]
4384 ; VI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4386 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4387 ; VI-NEXT: s_add_u32 s34, s4, 16
4388 ; VI-NEXT: s_addc_u32 s35, s5, 0
4389 ; VI-NEXT: v_mov_b32_e32 v0, s34
4390 ; VI-NEXT: v_mov_b32_e32 v1, s35
4391 ; VI-NEXT: flat_load_dword v3, v[0:1]
4392 ; VI-NEXT: s_mov_b64 s[34:35], 0
4393 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start
4394 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4395 ; VI-NEXT: s_waitcnt vmcnt(0)
4396 ; VI-NEXT: v_max_i32_e32 v2, s6, v3
4397 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4398 ; VI-NEXT: s_waitcnt vmcnt(0)
4399 ; VI-NEXT: buffer_wbinvl1_vol
4400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4401 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4402 ; VI-NEXT: v_mov_b32_e32 v3, v2
4403 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4404 ; VI-NEXT: s_cbranch_execnz .LBB88_1
4405 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4406 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4407 ; VI-NEXT: s_setpc_b64 s[30:31]
4409 ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar:
4411 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4412 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4413 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
4414 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4415 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
4416 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4417 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4418 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4419 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
4420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4421 ; GFX9-NEXT: buffer_wbinvl1_vol
4422 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4423 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4424 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4425 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4426 ; GFX9-NEXT: s_cbranch_execnz .LBB88_1
4427 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4428 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4429 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Element index 4 of i32 is the 16-byte offset seen in the checks above.
4430 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4431 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
; Returning variant of the amdgpu_gfx / inreg signed-max test: the atomicrmw
; result is bound to %result and the function is declared to return i32.
; In every expected sequence the cmpswap writes its result to v0, so no copy
; follows the loop.
;  - SI run: s6/s7 are saved in lanes 0/1 of v3 (v3 is spilled to the stack)
;    before being overwritten for the s[4:7] buffer resource operand.
;  - VI run: the pointer is copied into v[0:1] for the initial flat load and
;    again into v[1:2] for the cmpswap, since v0 receives the loaded value.
;  - GFX9 run: s[4:5] is used directly with a zero VGPR (v1) as the vector
;    address operand.
4435 define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4436 ; SI-LABEL: global_atomic_max_i32_ret_scalar:
4438 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4439 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4440 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
4441 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4442 ; SI-NEXT: s_waitcnt expcnt(0)
4443 ; SI-NEXT: v_writelane_b32 v3, s6, 0
4444 ; SI-NEXT: v_writelane_b32 v3, s7, 1
4445 ; SI-NEXT: s_mov_b32 s34, s6
4446 ; SI-NEXT: s_mov_b32 s7, 0xf000
4447 ; SI-NEXT: s_mov_b32 s6, -1
4448 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
4449 ; SI-NEXT: s_mov_b64 s[36:37], 0
4450 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start
4451 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4452 ; SI-NEXT: s_waitcnt vmcnt(0)
4453 ; SI-NEXT: v_mov_b32_e32 v2, v0
4454 ; SI-NEXT: s_waitcnt expcnt(0)
4455 ; SI-NEXT: v_max_i32_e32 v1, s34, v2
4456 ; SI-NEXT: v_mov_b32_e32 v0, v1
4457 ; SI-NEXT: v_mov_b32_e32 v1, v2
4458 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
4459 ; SI-NEXT: s_waitcnt vmcnt(0)
4460 ; SI-NEXT: buffer_wbinvl1
4461 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
4462 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4463 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4464 ; SI-NEXT: s_cbranch_execnz .LBB89_1
4465 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4466 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4467 ; SI-NEXT: v_readlane_b32 s7, v3, 1
4468 ; SI-NEXT: v_readlane_b32 s6, v3, 0
4469 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4470 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
4471 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4472 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4473 ; SI-NEXT: s_setpc_b64 s[30:31]
4475 ; VI-LABEL: global_atomic_max_i32_ret_scalar:
4477 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4478 ; VI-NEXT: v_mov_b32_e32 v0, s4
4479 ; VI-NEXT: v_mov_b32_e32 v1, s5
4480 ; VI-NEXT: flat_load_dword v0, v[0:1]
4481 ; VI-NEXT: v_mov_b32_e32 v1, s4
4482 ; VI-NEXT: s_mov_b64 s[34:35], 0
4483 ; VI-NEXT: v_mov_b32_e32 v2, s5
4484 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start
4485 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4486 ; VI-NEXT: s_waitcnt vmcnt(0)
4487 ; VI-NEXT: v_mov_b32_e32 v4, v0
4488 ; VI-NEXT: v_max_i32_e32 v3, s6, v4
4489 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4490 ; VI-NEXT: s_waitcnt vmcnt(0)
4491 ; VI-NEXT: buffer_wbinvl1_vol
4492 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4493 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4494 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4495 ; VI-NEXT: s_cbranch_execnz .LBB89_1
4496 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4497 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4498 ; VI-NEXT: s_setpc_b64 s[30:31]
4500 ; GFX9-LABEL: global_atomic_max_i32_ret_scalar:
4502 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4503 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4504 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
4505 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4506 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
4507 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4508 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4509 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4510 ; GFX9-NEXT: v_max_i32_e32 v2, s6, v3
4511 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
4512 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4513 ; GFX9-NEXT: buffer_wbinvl1_vol
4514 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4515 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4516 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4517 ; GFX9-NEXT: s_cbranch_execnz .LBB89_1
4518 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4519 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4520 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4521 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning amdgpu_gfx / inreg signed-max test on %out + 4 i32 elements
; (16 bytes); the function is declared to return i32 and every expected
; sequence leaves the cmpswap result in v0.
; The SI and GFX9 checks expect offset:16 on the load and on the cmpswap.
; The VI checks expect s_add_u32/s_addc_u32 to build the address in s[34:35],
; which is copied to v[1:2] and used for both the flat load and the cmpswap.
; The SI checks also save s6/s7 in lanes of v3 before overwriting them for
; the s[4:7] buffer resource operand.
4525 define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4526 ; SI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4528 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4530 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
4531 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4532 ; SI-NEXT: s_waitcnt expcnt(0)
4533 ; SI-NEXT: v_writelane_b32 v3, s6, 0
4534 ; SI-NEXT: v_writelane_b32 v3, s7, 1
4535 ; SI-NEXT: s_mov_b32 s34, s6
4536 ; SI-NEXT: s_mov_b32 s7, 0xf000
4537 ; SI-NEXT: s_mov_b32 s6, -1
4538 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
4539 ; SI-NEXT: s_mov_b64 s[36:37], 0
4540 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start
4541 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4542 ; SI-NEXT: s_waitcnt vmcnt(0)
4543 ; SI-NEXT: v_mov_b32_e32 v2, v0
4544 ; SI-NEXT: s_waitcnt expcnt(0)
4545 ; SI-NEXT: v_max_i32_e32 v1, s34, v2
4546 ; SI-NEXT: v_mov_b32_e32 v0, v1
4547 ; SI-NEXT: v_mov_b32_e32 v1, v2
4548 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4549 ; SI-NEXT: s_waitcnt vmcnt(0)
4550 ; SI-NEXT: buffer_wbinvl1
4551 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
4552 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4553 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4554 ; SI-NEXT: s_cbranch_execnz .LBB90_1
4555 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4556 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4557 ; SI-NEXT: v_readlane_b32 s7, v3, 1
4558 ; SI-NEXT: v_readlane_b32 s6, v3, 0
4559 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4560 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
4561 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4562 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4563 ; SI-NEXT: s_setpc_b64 s[30:31]
4565 ; VI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4567 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4568 ; VI-NEXT: s_add_u32 s34, s4, 16
4569 ; VI-NEXT: s_addc_u32 s35, s5, 0
4570 ; VI-NEXT: v_mov_b32_e32 v1, s34
4571 ; VI-NEXT: v_mov_b32_e32 v2, s35
4572 ; VI-NEXT: flat_load_dword v0, v[1:2]
4573 ; VI-NEXT: s_mov_b64 s[34:35], 0
4574 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start
4575 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4576 ; VI-NEXT: s_waitcnt vmcnt(0)
4577 ; VI-NEXT: v_mov_b32_e32 v4, v0
4578 ; VI-NEXT: v_max_i32_e32 v3, s6, v4
4579 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4580 ; VI-NEXT: s_waitcnt vmcnt(0)
4581 ; VI-NEXT: buffer_wbinvl1_vol
4582 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4583 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4584 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4585 ; VI-NEXT: s_cbranch_execnz .LBB90_1
4586 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4587 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4588 ; VI-NEXT: s_setpc_b64 s[30:31]
4590 ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar:
4592 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4593 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4594 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
4595 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4596 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
4597 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4598 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4599 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4600 ; GFX9-NEXT: v_max_i32_e32 v2, s6, v3
4601 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
4602 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4603 ; GFX9-NEXT: buffer_wbinvl1_vol
4604 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4605 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4606 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4607 ; GFX9-NEXT: s_cbranch_execnz .LBB90_1
4608 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4609 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4610 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Element index 4 of i32 is the 16-byte offset seen in the checks above.
4611 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4612 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_kernel test: signed-max atomicrmw (seq_cst) on
; %out + %index + 4 i32 elements, with the result (%tmp0) otherwise unused.
; Expected code on every target:
;  - the arguments are fetched with a single s_load_dwordx4;
;  - %index is sign-extended (s_ashr_i32 by 31), shifted left by 2 and added
;    to %out with s_add_u32/s_addc_u32;
;  - the initial value is read with a scalar s_load_dword (immediate 0x4 in
;    the SI checks, 0x10 in the VI and GFX9 checks);
;  - a cmpswap loop then applies v_max_i32 until every lane has succeeded.
; The SI and GFX9 checks fold the remaining 16 bytes into offset:16 on the
; cmpswap; the VI checks add 16 to the scalar address before copying it into
; v[0:1].
4616 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
4617 ; SI-LABEL: atomic_max_i32_addr64_offset:
4618 ; SI: ; %bb.0: ; %entry
4619 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
4620 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4621 ; SI-NEXT: s_ashr_i32 s5, s3, 31
4622 ; SI-NEXT: s_mov_b32 s4, s3
4623 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4624 ; SI-NEXT: s_add_u32 s4, s0, s4
4625 ; SI-NEXT: s_addc_u32 s5, s1, s5
4626 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
4627 ; SI-NEXT: s_mov_b64 s[0:1], 0
4628 ; SI-NEXT: s_mov_b32 s7, 0xf000
4629 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4630 ; SI-NEXT: v_mov_b32_e32 v1, s3
4631 ; SI-NEXT: s_mov_b32 s6, -1
4632 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start
4633 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4634 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
4635 ; SI-NEXT: s_waitcnt expcnt(0)
4636 ; SI-NEXT: v_mov_b32_e32 v3, v1
4637 ; SI-NEXT: v_mov_b32_e32 v2, v0
4638 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4639 ; SI-NEXT: s_waitcnt vmcnt(0)
4640 ; SI-NEXT: buffer_wbinvl1
4641 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4642 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4643 ; SI-NEXT: v_mov_b32_e32 v1, v2
4644 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4645 ; SI-NEXT: s_cbranch_execnz .LBB91_1
4646 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4649 ; VI-LABEL: atomic_max_i32_addr64_offset:
4650 ; VI: ; %bb.0: ; %entry
4651 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
4652 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4653 ; VI-NEXT: s_ashr_i32 s5, s3, 31
4654 ; VI-NEXT: s_mov_b32 s4, s3
4655 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4656 ; VI-NEXT: s_add_u32 s4, s0, s4
4657 ; VI-NEXT: s_addc_u32 s5, s1, s5
4658 ; VI-NEXT: s_load_dword s3, s[4:5], 0x10
4659 ; VI-NEXT: s_add_u32 s4, s4, 16
4660 ; VI-NEXT: s_addc_u32 s5, s5, 0
4661 ; VI-NEXT: v_mov_b32_e32 v0, s4
4662 ; VI-NEXT: s_mov_b64 s[0:1], 0
4663 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4664 ; VI-NEXT: v_mov_b32_e32 v3, s3
4665 ; VI-NEXT: v_mov_b32_e32 v1, s5
4666 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start
4667 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4668 ; VI-NEXT: v_max_i32_e32 v2, s2, v3
4669 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4670 ; VI-NEXT: s_waitcnt vmcnt(0)
4671 ; VI-NEXT: buffer_wbinvl1_vol
4672 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4673 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4674 ; VI-NEXT: v_mov_b32_e32 v3, v2
4675 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4676 ; VI-NEXT: s_cbranch_execnz .LBB91_1
4677 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4680 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
4681 ; GFX9: ; %bb.0: ; %entry
4682 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
4683 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
4684 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4685 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4686 ; GFX9-NEXT: s_ashr_i32 s1, s7, 31
4687 ; GFX9-NEXT: s_mov_b32 s0, s7
4688 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
4689 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4690 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4691 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
4692 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4693 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
4694 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
4695 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4696 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4697 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
4698 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4699 ; GFX9-NEXT: buffer_wbinvl1_vol
4700 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4701 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4702 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4703 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
4704 ; GFX9-NEXT: s_cbranch_execnz .LBB91_1
4705 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4706 ; GFX9-NEXT: s_endpgm
; Address = %out + %index elements, then a further 4 elements (16 bytes).
4708 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4709 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4710 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_kernel test: signed-max atomicrmw (seq_cst) on
; %out + %index + 4 i32 elements, whose old value (%tmp0) is then stored to
; %out2.
; Expected code on every target:
;  - the arguments are fetched with an s_load_dwordx2 and an s_load_dwordx4;
;  - %index is sign-extended, shifted left by 2 and added to %out;
;  - the initial value is read with a scalar s_load_dword (immediate 0x4 in
;    the SI checks, 0x10 in the VI and GFX9 checks);
;  - a cmpswap loop applies v_max_i32 until every lane has succeeded;
;  - after the loop exec is restored and the cmpswap result is stored through
;    %out2 (buffer_store_dword / flat_store_dword / global_store_dword).
; The SI and GFX9 checks fold the constant 16 bytes into offset:16 on the
; cmpswap; the VI checks add 16 to the scalar address first.
4714 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4715 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
4716 ; SI: ; %bb.0: ; %entry
4717 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
4718 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
4719 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4720 ; SI-NEXT: s_ashr_i32 s5, s9, 31
4721 ; SI-NEXT: s_mov_b32 s4, s9
4722 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4723 ; SI-NEXT: s_add_u32 s4, s0, s4
4724 ; SI-NEXT: s_addc_u32 s5, s1, s5
4725 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
4726 ; SI-NEXT: s_mov_b64 s[0:1], 0
4727 ; SI-NEXT: s_mov_b32 s7, 0xf000
4728 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4729 ; SI-NEXT: v_mov_b32_e32 v1, s6
4730 ; SI-NEXT: s_mov_b32 s6, -1
4731 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start
4732 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4733 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
4734 ; SI-NEXT: s_waitcnt expcnt(0)
4735 ; SI-NEXT: v_mov_b32_e32 v3, v1
4736 ; SI-NEXT: v_mov_b32_e32 v2, v0
4737 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4738 ; SI-NEXT: s_waitcnt vmcnt(0)
4739 ; SI-NEXT: buffer_wbinvl1
4740 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4741 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4742 ; SI-NEXT: v_mov_b32_e32 v1, v2
4743 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4744 ; SI-NEXT: s_cbranch_execnz .LBB92_1
4745 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4746 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
4747 ; SI-NEXT: s_mov_b32 s7, 0xf000
4748 ; SI-NEXT: s_mov_b32 s6, -1
4749 ; SI-NEXT: s_mov_b32 s4, s2
4750 ; SI-NEXT: s_mov_b32 s5, s3
4751 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
4754 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
4755 ; VI: ; %bb.0: ; %entry
4756 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
4757 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
4758 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4759 ; VI-NEXT: s_ashr_i32 s7, s5, 31
4760 ; VI-NEXT: s_mov_b32 s6, s5
4761 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
4762 ; VI-NEXT: s_add_u32 s6, s0, s6
4763 ; VI-NEXT: s_addc_u32 s7, s1, s7
4764 ; VI-NEXT: s_load_dword s5, s[6:7], 0x10
4765 ; VI-NEXT: s_add_u32 s6, s6, 16
4766 ; VI-NEXT: s_addc_u32 s7, s7, 0
4767 ; VI-NEXT: v_mov_b32_e32 v0, s6
4768 ; VI-NEXT: s_mov_b64 s[0:1], 0
4769 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4770 ; VI-NEXT: v_mov_b32_e32 v2, s5
4771 ; VI-NEXT: v_mov_b32_e32 v1, s7
4772 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start
4773 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4774 ; VI-NEXT: v_mov_b32_e32 v3, v2
4775 ; VI-NEXT: v_max_i32_e32 v2, s4, v3
4776 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4777 ; VI-NEXT: s_waitcnt vmcnt(0)
4778 ; VI-NEXT: buffer_wbinvl1_vol
4779 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4780 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4781 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4782 ; VI-NEXT: s_cbranch_execnz .LBB92_1
4783 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4784 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
4785 ; VI-NEXT: v_mov_b32_e32 v0, s2
4786 ; VI-NEXT: v_mov_b32_e32 v1, s3
4787 ; VI-NEXT: flat_store_dword v[0:1], v2
4790 ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
4791 ; GFX9: ; %bb.0: ; %entry
4792 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
4793 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
4794 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4795 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4796 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
4797 ; GFX9-NEXT: s_mov_b32 s2, s1
4798 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
4799 ; GFX9-NEXT: s_add_u32 s2, s4, s2
4800 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
4801 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10
4802 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4803 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4804 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
4805 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
4806 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4807 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4808 ; GFX9-NEXT: v_max_i32_e32 v2, s0, v3
4809 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc
4810 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4811 ; GFX9-NEXT: buffer_wbinvl1_vol
4812 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4813 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4814 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4815 ; GFX9-NEXT: s_cbranch_execnz .LBB92_1
4816 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4817 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4818 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4819 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
4820 ; GFX9-NEXT: s_endpgm
; Address = %out + %index elements, then a further 4 elements (16 bytes);
; the value returned by the atomic is written to %out2.
4822 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4823 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4824 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4825 store i32 %tmp0, ptr addrspace(1) %out2
; amdgpu_kernel test: signed-max atomicrmw (seq_cst) on %out + %index i32
; elements with no additional constant offset; the result (%tmp0) is
; otherwise unused.
; Expected code on every target:
;  - the arguments are fetched with a single s_load_dwordx4;
;  - %index is sign-extended, shifted left by 2 and added to %out;
;  - the initial value is read with a scalar s_load_dword at immediate 0x0;
;  - a cmpswap loop (no offset operand) applies v_max_i32 until every lane
;    has succeeded.
; The VI checks copy the computed scalar address into v[0:1] for the flat
; cmpswap; the GFX9 checks use it directly with a zero VGPR (v2).
4829 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) {
4830 ; SI-LABEL: atomic_max_i32_addr64:
4831 ; SI: ; %bb.0: ; %entry
4832 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
4833 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4834 ; SI-NEXT: s_ashr_i32 s5, s3, 31
4835 ; SI-NEXT: s_mov_b32 s4, s3
4836 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4837 ; SI-NEXT: s_add_u32 s4, s0, s4
4838 ; SI-NEXT: s_addc_u32 s5, s1, s5
4839 ; SI-NEXT: s_load_dword s3, s[4:5], 0x0
4840 ; SI-NEXT: s_mov_b64 s[0:1], 0
4841 ; SI-NEXT: s_mov_b32 s7, 0xf000
4842 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4843 ; SI-NEXT: v_mov_b32_e32 v1, s3
4844 ; SI-NEXT: s_mov_b32 s6, -1
4845 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start
4846 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4847 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
4848 ; SI-NEXT: s_waitcnt expcnt(0)
4849 ; SI-NEXT: v_mov_b32_e32 v3, v1
4850 ; SI-NEXT: v_mov_b32_e32 v2, v0
4851 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4852 ; SI-NEXT: s_waitcnt vmcnt(0)
4853 ; SI-NEXT: buffer_wbinvl1
4854 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4855 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4856 ; SI-NEXT: v_mov_b32_e32 v1, v2
4857 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4858 ; SI-NEXT: s_cbranch_execnz .LBB93_1
4859 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4862 ; VI-LABEL: atomic_max_i32_addr64:
4863 ; VI: ; %bb.0: ; %entry
4864 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
4865 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4866 ; VI-NEXT: s_ashr_i32 s5, s3, 31
4867 ; VI-NEXT: s_mov_b32 s4, s3
4868 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4869 ; VI-NEXT: s_add_u32 s4, s0, s4
4870 ; VI-NEXT: s_addc_u32 s5, s1, s5
4871 ; VI-NEXT: s_load_dword s3, s[4:5], 0x0
4872 ; VI-NEXT: v_mov_b32_e32 v0, s4
4873 ; VI-NEXT: s_mov_b64 s[0:1], 0
4874 ; VI-NEXT: v_mov_b32_e32 v1, s5
4875 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4876 ; VI-NEXT: v_mov_b32_e32 v3, s3
4877 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start
4878 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4879 ; VI-NEXT: v_max_i32_e32 v2, s2, v3
4880 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4881 ; VI-NEXT: s_waitcnt vmcnt(0)
4882 ; VI-NEXT: buffer_wbinvl1_vol
4883 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4884 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4885 ; VI-NEXT: v_mov_b32_e32 v3, v2
4886 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4887 ; VI-NEXT: s_cbranch_execnz .LBB93_1
4888 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4891 ; GFX9-LABEL: atomic_max_i32_addr64:
4892 ; GFX9: ; %bb.0: ; %entry
4893 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
4894 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
4895 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4896 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4897 ; GFX9-NEXT: s_ashr_i32 s1, s7, 31
4898 ; GFX9-NEXT: s_mov_b32 s0, s7
4899 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
4900 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4901 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4902 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
4903 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4904 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
4905 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
4906 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4907 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4908 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
4909 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4910 ; GFX9-NEXT: buffer_wbinvl1_vol
4911 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4912 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4913 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4914 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
4915 ; GFX9-NEXT: s_cbranch_execnz .LBB93_1
4916 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4917 ; GFX9-NEXT: s_endpgm
; Address = %out + %index elements; no constant offset in this variant.
4919 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4920 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
; Kernel test: signed atomicrmw max (seq_cst) on the i32 element %out[%index],
; with the atomicrmw result stored to %out2.
; For every RUN configuration the expected code is a compare-and-swap retry
; loop (%atomicrmw.start) built around v_max_i32, followed by the store of the
; value produced by the cmpswap.
4924 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4925 ; SI-LABEL: atomic_max_i32_ret_addr64:
4926 ; SI: ; %bb.0: ; %entry
4927 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
4928 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
4929 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4930 ; SI-NEXT: s_ashr_i32 s5, s9, 31
4931 ; SI-NEXT: s_mov_b32 s4, s9
4932 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4933 ; SI-NEXT: s_add_u32 s4, s0, s4
4934 ; SI-NEXT: s_addc_u32 s5, s1, s5
4935 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
4936 ; SI-NEXT: s_mov_b64 s[0:1], 0
4937 ; SI-NEXT: s_mov_b32 s7, 0xf000
4938 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4939 ; SI-NEXT: v_mov_b32_e32 v1, s6
4940 ; SI-NEXT: s_mov_b32 s6, -1
4941 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start
4942 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4943 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
4944 ; SI-NEXT: s_waitcnt expcnt(0)
4945 ; SI-NEXT: v_mov_b32_e32 v3, v1
4946 ; SI-NEXT: v_mov_b32_e32 v2, v0
4947 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4948 ; SI-NEXT: s_waitcnt vmcnt(0)
4949 ; SI-NEXT: buffer_wbinvl1
4950 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4951 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4952 ; SI-NEXT: v_mov_b32_e32 v1, v2
4953 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4954 ; SI-NEXT: s_cbranch_execnz .LBB94_1
4955 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4956 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
4957 ; SI-NEXT: s_mov_b32 s7, 0xf000
4958 ; SI-NEXT: s_mov_b32 s6, -1
4959 ; SI-NEXT: s_mov_b32 s4, s2
4960 ; SI-NEXT: s_mov_b32 s5, s3
4961 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
4964 ; VI-LABEL: atomic_max_i32_ret_addr64:
4965 ; VI: ; %bb.0: ; %entry
4966 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
4967 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
4968 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4969 ; VI-NEXT: s_ashr_i32 s7, s5, 31
4970 ; VI-NEXT: s_mov_b32 s6, s5
4971 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
4972 ; VI-NEXT: s_add_u32 s6, s0, s6
4973 ; VI-NEXT: s_addc_u32 s7, s1, s7
4974 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
4975 ; VI-NEXT: v_mov_b32_e32 v0, s6
4976 ; VI-NEXT: s_mov_b64 s[0:1], 0
4977 ; VI-NEXT: v_mov_b32_e32 v1, s7
4978 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4979 ; VI-NEXT: v_mov_b32_e32 v2, s5
4980 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start
4981 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4982 ; VI-NEXT: v_mov_b32_e32 v3, v2
4983 ; VI-NEXT: v_max_i32_e32 v2, s4, v3
4984 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4985 ; VI-NEXT: s_waitcnt vmcnt(0)
4986 ; VI-NEXT: buffer_wbinvl1_vol
4987 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4988 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4989 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4990 ; VI-NEXT: s_cbranch_execnz .LBB94_1
4991 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4992 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
4993 ; VI-NEXT: v_mov_b32_e32 v0, s2
4994 ; VI-NEXT: v_mov_b32_e32 v1, s3
4995 ; VI-NEXT: flat_store_dword v[0:1], v2
4998 ; GFX9-LABEL: atomic_max_i32_ret_addr64:
4999 ; GFX9: ; %bb.0: ; %entry
5000 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
5001 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
5002 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5003 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5004 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
5005 ; GFX9-NEXT: s_mov_b32 s2, s1
5006 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
5007 ; GFX9-NEXT: s_add_u32 s2, s4, s2
5008 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
5009 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
5010 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5011 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5012 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
5013 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
5014 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5015 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5016 ; GFX9-NEXT: v_max_i32_e32 v2, s0, v3
5017 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc
5018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5019 ; GFX9-NEXT: buffer_wbinvl1_vol
5020 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5021 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5022 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5023 ; GFX9-NEXT: s_cbranch_execnz .LBB94_1
5024 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5025 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5026 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5027 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
5028 ; GFX9-NEXT: s_endpgm
5030 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5031 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
5032 store i32 %tmp0, ptr addrspace(1) %out2
; Signed atomicrmw max (seq_cst) whose result is unused, applied 4 i32
; elements (16 bytes) past %out and tagged with !amdgpu.no.remote.memory.
; The checks below still expect a compare-and-swap retry loop around v_max_i32
; for every RUN configuration. SI and GFX9 carry the 16-byte offset in the
; memory instruction, while VI adds it to the address first.
5036 define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
5037 ; SI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5039 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5040 ; SI-NEXT: s_mov_b32 s6, 0
5041 ; SI-NEXT: s_mov_b32 s7, 0xf000
5042 ; SI-NEXT: s_mov_b32 s4, s6
5043 ; SI-NEXT: s_mov_b32 s5, s6
5044 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
5045 ; SI-NEXT: s_mov_b64 s[8:9], 0
5046 ; SI-NEXT: .LBB95_1: ; %atomicrmw.start
5047 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5048 ; SI-NEXT: s_waitcnt vmcnt(0)
5049 ; SI-NEXT: v_max_i32_e32 v3, v4, v2
5050 ; SI-NEXT: s_waitcnt expcnt(0)
5051 ; SI-NEXT: v_mov_b32_e32 v6, v4
5052 ; SI-NEXT: v_mov_b32_e32 v5, v3
5053 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
5054 ; SI-NEXT: s_waitcnt vmcnt(0)
5055 ; SI-NEXT: buffer_wbinvl1
5056 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
5057 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5058 ; SI-NEXT: v_mov_b32_e32 v4, v5
5059 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5060 ; SI-NEXT: s_cbranch_execnz .LBB95_1
5061 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5062 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5063 ; SI-NEXT: s_waitcnt expcnt(0)
5064 ; SI-NEXT: s_setpc_b64 s[30:31]
5066 ; VI-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5068 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5069 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
5070 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5071 ; VI-NEXT: flat_load_dword v4, v[0:1]
5072 ; VI-NEXT: s_mov_b64 s[4:5], 0
5073 ; VI-NEXT: .LBB95_1: ; %atomicrmw.start
5074 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5075 ; VI-NEXT: s_waitcnt vmcnt(0)
5076 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
5077 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5078 ; VI-NEXT: s_waitcnt vmcnt(0)
5079 ; VI-NEXT: buffer_wbinvl1_vol
5080 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5081 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5082 ; VI-NEXT: v_mov_b32_e32 v4, v3
5083 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5084 ; VI-NEXT: s_cbranch_execnz .LBB95_1
5085 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5086 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5087 ; VI-NEXT: s_setpc_b64 s[30:31]
5089 ; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
5091 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5092 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
5093 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5094 ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
5095 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5096 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5097 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
5098 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5099 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5100 ; GFX9-NEXT: buffer_wbinvl1_vol
5101 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5102 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5103 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5104 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5105 ; GFX9-NEXT: s_cbranch_execnz .LBB95_1
5106 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5107 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5108 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5109 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
5110 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-producing variant (i32 return type) of the signed atomicrmw max test
; with !amdgpu.no.remote.memory: the operation targets the i32 that is 4
; elements (16 bytes) past %out, with seq_cst ordering.
; The expected code is a compare-and-swap retry loop around v_max_i32 that
; leaves the cmpswap result in v0 before s_setpc_b64 (via an explicit copy from
; v3 on SI and GFX9; VI produces it in v0 directly).
5114 define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
5115 ; SI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5117 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5118 ; SI-NEXT: s_mov_b32 s6, 0
5119 ; SI-NEXT: s_mov_b32 s7, 0xf000
5120 ; SI-NEXT: s_mov_b32 s4, s6
5121 ; SI-NEXT: s_mov_b32 s5, s6
5122 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
5123 ; SI-NEXT: s_mov_b64 s[8:9], 0
5124 ; SI-NEXT: .LBB96_1: ; %atomicrmw.start
5125 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5126 ; SI-NEXT: s_waitcnt vmcnt(0)
5127 ; SI-NEXT: v_mov_b32_e32 v5, v3
5128 ; SI-NEXT: s_waitcnt expcnt(0)
5129 ; SI-NEXT: v_max_i32_e32 v4, v5, v2
5130 ; SI-NEXT: v_mov_b32_e32 v3, v4
5131 ; SI-NEXT: v_mov_b32_e32 v4, v5
5132 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
5133 ; SI-NEXT: s_waitcnt vmcnt(0)
5134 ; SI-NEXT: buffer_wbinvl1
5135 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
5136 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5137 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5138 ; SI-NEXT: s_cbranch_execnz .LBB96_1
5139 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5140 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5141 ; SI-NEXT: v_mov_b32_e32 v0, v3
5142 ; SI-NEXT: s_waitcnt expcnt(0)
5143 ; SI-NEXT: s_setpc_b64 s[30:31]
5145 ; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5147 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5148 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
5149 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5150 ; VI-NEXT: flat_load_dword v0, v[3:4]
5151 ; VI-NEXT: s_mov_b64 s[4:5], 0
5152 ; VI-NEXT: .LBB96_1: ; %atomicrmw.start
5153 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5154 ; VI-NEXT: s_waitcnt vmcnt(0)
5155 ; VI-NEXT: v_mov_b32_e32 v1, v0
5156 ; VI-NEXT: v_max_i32_e32 v0, v1, v2
5157 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5158 ; VI-NEXT: s_waitcnt vmcnt(0)
5159 ; VI-NEXT: buffer_wbinvl1_vol
5160 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5161 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5162 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5163 ; VI-NEXT: s_cbranch_execnz .LBB96_1
5164 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5165 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5166 ; VI-NEXT: s_setpc_b64 s[30:31]
5168 ; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
5170 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5171 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
5172 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5173 ; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
5174 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5175 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5176 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5177 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
5178 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5179 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5180 ; GFX9-NEXT: buffer_wbinvl1_vol
5181 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5182 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5183 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5184 ; GFX9-NEXT: s_cbranch_execnz .LBB96_1
5185 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5186 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5187 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
5188 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5189 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
5190 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5194 ; ---------------------------------------------------------------------
5196 ; ---------------------------------------------------------------------
; Unsigned atomicrmw umax (seq_cst) directly on %ptr, result unused.
; For every RUN configuration the expected code loads the current value once
; and then runs a compare-and-swap retry loop (%atomicrmw.start) that computes
; the candidate with v_max_u32.
5198 define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
5199 ; SI-LABEL: global_atomic_umax_i32_noret:
5201 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5202 ; SI-NEXT: s_mov_b32 s6, 0
5203 ; SI-NEXT: s_mov_b32 s7, 0xf000
5204 ; SI-NEXT: s_mov_b32 s4, s6
5205 ; SI-NEXT: s_mov_b32 s5, s6
5206 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
5207 ; SI-NEXT: s_mov_b64 s[8:9], 0
5208 ; SI-NEXT: .LBB97_1: ; %atomicrmw.start
5209 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5210 ; SI-NEXT: s_waitcnt vmcnt(0)
5211 ; SI-NEXT: v_max_u32_e32 v3, v4, v2
5212 ; SI-NEXT: s_waitcnt expcnt(0)
5213 ; SI-NEXT: v_mov_b32_e32 v6, v4
5214 ; SI-NEXT: v_mov_b32_e32 v5, v3
5215 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
5216 ; SI-NEXT: s_waitcnt vmcnt(0)
5217 ; SI-NEXT: buffer_wbinvl1
5218 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
5219 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5220 ; SI-NEXT: v_mov_b32_e32 v4, v5
5221 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5222 ; SI-NEXT: s_cbranch_execnz .LBB97_1
5223 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5224 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5225 ; SI-NEXT: s_waitcnt expcnt(0)
5226 ; SI-NEXT: s_setpc_b64 s[30:31]
5228 ; VI-LABEL: global_atomic_umax_i32_noret:
5230 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5231 ; VI-NEXT: flat_load_dword v4, v[0:1]
5232 ; VI-NEXT: s_mov_b64 s[4:5], 0
5233 ; VI-NEXT: .LBB97_1: ; %atomicrmw.start
5234 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5235 ; VI-NEXT: s_waitcnt vmcnt(0)
5236 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5237 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5238 ; VI-NEXT: s_waitcnt vmcnt(0)
5239 ; VI-NEXT: buffer_wbinvl1_vol
5240 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5241 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5242 ; VI-NEXT: v_mov_b32_e32 v4, v3
5243 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5244 ; VI-NEXT: s_cbranch_execnz .LBB97_1
5245 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5246 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5247 ; VI-NEXT: s_setpc_b64 s[30:31]
5249 ; GFX9-LABEL: global_atomic_umax_i32_noret:
5251 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5252 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
5253 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5254 ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start
5255 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5256 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5257 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5258 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5259 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5260 ; GFX9-NEXT: buffer_wbinvl1_vol
5261 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5262 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5263 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5264 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5265 ; GFX9-NEXT: s_cbranch_execnz .LBB97_1
5266 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5267 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5268 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5269 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
; Unsigned atomicrmw umax (seq_cst), result unused, on the i32 that is 4
; elements (16 bytes) past %out.
; The expected code is a compare-and-swap retry loop around v_max_u32. SI and
; GFX9 carry the 16-byte offset in the load and cmpswap instructions, while VI
; adds 16 to the address registers before the loop.
5273 define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
5274 ; SI-LABEL: global_atomic_umax_i32_noret_offset:
5276 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5277 ; SI-NEXT: s_mov_b32 s6, 0
5278 ; SI-NEXT: s_mov_b32 s7, 0xf000
5279 ; SI-NEXT: s_mov_b32 s4, s6
5280 ; SI-NEXT: s_mov_b32 s5, s6
5281 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
5282 ; SI-NEXT: s_mov_b64 s[8:9], 0
5283 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start
5284 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5285 ; SI-NEXT: s_waitcnt vmcnt(0)
5286 ; SI-NEXT: v_max_u32_e32 v3, v4, v2
5287 ; SI-NEXT: s_waitcnt expcnt(0)
5288 ; SI-NEXT: v_mov_b32_e32 v6, v4
5289 ; SI-NEXT: v_mov_b32_e32 v5, v3
5290 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
5291 ; SI-NEXT: s_waitcnt vmcnt(0)
5292 ; SI-NEXT: buffer_wbinvl1
5293 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
5294 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5295 ; SI-NEXT: v_mov_b32_e32 v4, v5
5296 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5297 ; SI-NEXT: s_cbranch_execnz .LBB98_1
5298 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5299 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5300 ; SI-NEXT: s_waitcnt expcnt(0)
5301 ; SI-NEXT: s_setpc_b64 s[30:31]
5303 ; VI-LABEL: global_atomic_umax_i32_noret_offset:
5305 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5306 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
5307 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5308 ; VI-NEXT: flat_load_dword v4, v[0:1]
5309 ; VI-NEXT: s_mov_b64 s[4:5], 0
5310 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start
5311 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5312 ; VI-NEXT: s_waitcnt vmcnt(0)
5313 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5314 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5315 ; VI-NEXT: s_waitcnt vmcnt(0)
5316 ; VI-NEXT: buffer_wbinvl1_vol
5317 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5318 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5319 ; VI-NEXT: v_mov_b32_e32 v4, v3
5320 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5321 ; VI-NEXT: s_cbranch_execnz .LBB98_1
5322 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5323 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5324 ; VI-NEXT: s_setpc_b64 s[30:31]
5326 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset:
5328 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5329 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
5330 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5331 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
5332 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5333 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5334 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5335 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5336 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5337 ; GFX9-NEXT: buffer_wbinvl1_vol
5338 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5339 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5340 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5341 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5342 ; GFX9-NEXT: s_cbranch_execnz .LBB98_1
5343 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5344 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5345 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5346 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5347 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
; Value-producing variant (i32 return type) of the unsigned atomicrmw umax
; test: seq_cst umax directly on %ptr, with the result named %result.
; For every RUN configuration the expected code is a compare-and-swap retry
; loop around v_max_u32, after which the cmpswap result is copied from v3 into
; v0 before s_setpc_b64.
5351 define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
5352 ; SI-LABEL: global_atomic_umax_i32_ret:
5354 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5355 ; SI-NEXT: s_mov_b32 s6, 0
5356 ; SI-NEXT: s_mov_b32 s7, 0xf000
5357 ; SI-NEXT: s_mov_b32 s4, s6
5358 ; SI-NEXT: s_mov_b32 s5, s6
5359 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
5360 ; SI-NEXT: s_mov_b64 s[8:9], 0
5361 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start
5362 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5363 ; SI-NEXT: s_waitcnt vmcnt(0)
5364 ; SI-NEXT: v_mov_b32_e32 v5, v3
5365 ; SI-NEXT: s_waitcnt expcnt(0)
5366 ; SI-NEXT: v_max_u32_e32 v4, v5, v2
5367 ; SI-NEXT: v_mov_b32_e32 v3, v4
5368 ; SI-NEXT: v_mov_b32_e32 v4, v5
5369 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
5370 ; SI-NEXT: s_waitcnt vmcnt(0)
5371 ; SI-NEXT: buffer_wbinvl1
5372 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
5373 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5374 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5375 ; SI-NEXT: s_cbranch_execnz .LBB99_1
5376 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5377 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5378 ; SI-NEXT: v_mov_b32_e32 v0, v3
5379 ; SI-NEXT: s_waitcnt expcnt(0)
5380 ; SI-NEXT: s_setpc_b64 s[30:31]
5382 ; VI-LABEL: global_atomic_umax_i32_ret:
5384 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5385 ; VI-NEXT: flat_load_dword v3, v[0:1]
5386 ; VI-NEXT: s_mov_b64 s[4:5], 0
5387 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start
5388 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5389 ; VI-NEXT: s_waitcnt vmcnt(0)
5390 ; VI-NEXT: v_mov_b32_e32 v4, v3
5391 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5392 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5393 ; VI-NEXT: s_waitcnt vmcnt(0)
5394 ; VI-NEXT: buffer_wbinvl1_vol
5395 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5396 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5397 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5398 ; VI-NEXT: s_cbranch_execnz .LBB99_1
5399 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5400 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5401 ; VI-NEXT: v_mov_b32_e32 v0, v3
5402 ; VI-NEXT: s_setpc_b64 s[30:31]
5404 ; GFX9-LABEL: global_atomic_umax_i32_ret:
5406 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5407 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
5408 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5409 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
5410 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5411 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5412 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5413 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5414 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5415 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5416 ; GFX9-NEXT: buffer_wbinvl1_vol
5417 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5418 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5419 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5420 ; GFX9-NEXT: s_cbranch_execnz .LBB99_1
5421 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5422 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5423 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
5424 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5425 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
; Value-producing variant (i32 return type) of the unsigned atomicrmw umax
; test with an address offset: seq_cst umax on the i32 that is 4 elements
; (16 bytes) past %out.
; The expected code is a compare-and-swap retry loop around v_max_u32 that
; leaves the cmpswap result in v0 before s_setpc_b64 (via an explicit copy from
; v3 on SI and GFX9; VI produces it in v0 directly and adds the 16-byte offset
; to the address up front).
5429 define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
5430 ; SI-LABEL: global_atomic_umax_i32_ret_offset:
5432 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5433 ; SI-NEXT: s_mov_b32 s6, 0
5434 ; SI-NEXT: s_mov_b32 s7, 0xf000
5435 ; SI-NEXT: s_mov_b32 s4, s6
5436 ; SI-NEXT: s_mov_b32 s5, s6
5437 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
5438 ; SI-NEXT: s_mov_b64 s[8:9], 0
5439 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start
5440 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5441 ; SI-NEXT: s_waitcnt vmcnt(0)
5442 ; SI-NEXT: v_mov_b32_e32 v5, v3
5443 ; SI-NEXT: s_waitcnt expcnt(0)
5444 ; SI-NEXT: v_max_u32_e32 v4, v5, v2
5445 ; SI-NEXT: v_mov_b32_e32 v3, v4
5446 ; SI-NEXT: v_mov_b32_e32 v4, v5
5447 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
5448 ; SI-NEXT: s_waitcnt vmcnt(0)
5449 ; SI-NEXT: buffer_wbinvl1
5450 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
5451 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5452 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5453 ; SI-NEXT: s_cbranch_execnz .LBB100_1
5454 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5455 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5456 ; SI-NEXT: v_mov_b32_e32 v0, v3
5457 ; SI-NEXT: s_waitcnt expcnt(0)
5458 ; SI-NEXT: s_setpc_b64 s[30:31]
5460 ; VI-LABEL: global_atomic_umax_i32_ret_offset:
5462 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5463 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
5464 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5465 ; VI-NEXT: flat_load_dword v0, v[3:4]
5466 ; VI-NEXT: s_mov_b64 s[4:5], 0
5467 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start
5468 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5469 ; VI-NEXT: s_waitcnt vmcnt(0)
5470 ; VI-NEXT: v_mov_b32_e32 v1, v0
5471 ; VI-NEXT: v_max_u32_e32 v0, v1, v2
5472 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5473 ; VI-NEXT: s_waitcnt vmcnt(0)
5474 ; VI-NEXT: buffer_wbinvl1_vol
5475 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5476 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5477 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5478 ; VI-NEXT: s_cbranch_execnz .LBB100_1
5479 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5480 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5481 ; VI-NEXT: s_setpc_b64 s[30:31]
5483 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset:
5485 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5486 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
5487 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5488 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
5489 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5490 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5491 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5492 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5493 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5494 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5495 ; GFX9-NEXT: buffer_wbinvl1_vol
5496 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5497 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5498 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5499 ; GFX9-NEXT: s_cbranch_execnz .LBB100_1
5500 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5501 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5502 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
5503 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5504 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5505 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg: unsigned atomicrmw umax
; (seq_cst) directly on %ptr, result unused.
; In the expected code the pointer arrives in s[4:5] and %in in s6, and the
; operation is a compare-and-swap retry loop around v_max_u32.
; On SI the expected code overwrites s6/s7 to form the s[4:7] operand of the
; buffer instructions, so it first copies %in to s34 and saves s6/s7 in lanes
; of v4 (v4 itself is spilled to the stack), then restores them before
; returning.
5509 define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5510 ; SI-LABEL: global_atomic_umax_i32_noret_scalar:
5512 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5513 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5514 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
5515 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5516 ; SI-NEXT: s_waitcnt expcnt(0)
5517 ; SI-NEXT: v_writelane_b32 v4, s6, 0
5518 ; SI-NEXT: v_writelane_b32 v4, s7, 1
5519 ; SI-NEXT: s_mov_b32 s34, s6
5520 ; SI-NEXT: s_mov_b32 s7, 0xf000
5521 ; SI-NEXT: s_mov_b32 s6, -1
5522 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
5523 ; SI-NEXT: s_mov_b64 s[36:37], 0
5524 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start
5525 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5526 ; SI-NEXT: s_waitcnt vmcnt(0)
5527 ; SI-NEXT: v_max_u32_e32 v0, s34, v1
5528 ; SI-NEXT: s_waitcnt expcnt(0)
5529 ; SI-NEXT: v_mov_b32_e32 v3, v1
5530 ; SI-NEXT: v_mov_b32_e32 v2, v0
5531 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
5532 ; SI-NEXT: s_waitcnt vmcnt(0)
5533 ; SI-NEXT: buffer_wbinvl1
5534 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5535 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5536 ; SI-NEXT: v_mov_b32_e32 v1, v2
5537 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5538 ; SI-NEXT: s_cbranch_execnz .LBB101_1
5539 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5540 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5541 ; SI-NEXT: v_readlane_b32 s7, v4, 1
5542 ; SI-NEXT: v_readlane_b32 s6, v4, 0
5543 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5544 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
5545 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5546 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5547 ; SI-NEXT: s_setpc_b64 s[30:31]
5549 ; VI-LABEL: global_atomic_umax_i32_noret_scalar:
5551 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5552 ; VI-NEXT: v_mov_b32_e32 v0, s4
5553 ; VI-NEXT: v_mov_b32_e32 v1, s5
5554 ; VI-NEXT: flat_load_dword v3, v[0:1]
5555 ; VI-NEXT: s_mov_b64 s[34:35], 0
5556 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start
5557 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5558 ; VI-NEXT: s_waitcnt vmcnt(0)
5559 ; VI-NEXT: v_max_u32_e32 v2, s6, v3
5560 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5561 ; VI-NEXT: s_waitcnt vmcnt(0)
5562 ; VI-NEXT: buffer_wbinvl1_vol
5563 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5564 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5565 ; VI-NEXT: v_mov_b32_e32 v3, v2
5566 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5567 ; VI-NEXT: s_cbranch_execnz .LBB101_1
5568 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5569 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5570 ; VI-NEXT: s_setpc_b64 s[30:31]
5572 ; GFX9-LABEL: global_atomic_umax_i32_noret_scalar:
5574 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5575 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5576 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
5577 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5578 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
5579 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5580 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5581 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
5582 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
5583 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5584 ; GFX9-NEXT: buffer_wbinvl1_vol
5585 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5586 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5587 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5588 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5589 ; GFX9-NEXT: s_cbranch_execnz .LBB101_1
5590 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5591 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5592 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5593 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg and an address offset:
; unsigned atomicrmw umax (seq_cst), result unused, on the i32 that is
; 4 elements (16 bytes) past %out.
; In the expected code the pointer arrives in s[4:5] and %in in s6, and the
; operation is a compare-and-swap retry loop around v_max_u32. SI and GFX9
; carry the 16-byte offset in the memory instructions; VI adds it with scalar
; adds before moving the address into v[0:1].
; On SI the expected code overwrites s6/s7 to form the s[4:7] operand of the
; buffer instructions, so it first copies %in to s34 and saves s6/s7 in lanes
; of v4 (v4 itself is spilled to the stack), then restores them before
; returning.
5597 define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5598 ; SI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5600 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5601 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5602 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
5603 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5604 ; SI-NEXT: s_waitcnt expcnt(0)
5605 ; SI-NEXT: v_writelane_b32 v4, s6, 0
5606 ; SI-NEXT: v_writelane_b32 v4, s7, 1
5607 ; SI-NEXT: s_mov_b32 s34, s6
5608 ; SI-NEXT: s_mov_b32 s7, 0xf000
5609 ; SI-NEXT: s_mov_b32 s6, -1
5610 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
5611 ; SI-NEXT: s_mov_b64 s[36:37], 0
5612 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start
5613 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5614 ; SI-NEXT: s_waitcnt vmcnt(0)
5615 ; SI-NEXT: v_max_u32_e32 v0, s34, v1
5616 ; SI-NEXT: s_waitcnt expcnt(0)
5617 ; SI-NEXT: v_mov_b32_e32 v3, v1
5618 ; SI-NEXT: v_mov_b32_e32 v2, v0
5619 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5620 ; SI-NEXT: s_waitcnt vmcnt(0)
5621 ; SI-NEXT: buffer_wbinvl1
5622 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5623 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5624 ; SI-NEXT: v_mov_b32_e32 v1, v2
5625 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5626 ; SI-NEXT: s_cbranch_execnz .LBB102_1
5627 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5628 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5629 ; SI-NEXT: v_readlane_b32 s7, v4, 1
5630 ; SI-NEXT: v_readlane_b32 s6, v4, 0
5631 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5632 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
5633 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5634 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5635 ; SI-NEXT: s_setpc_b64 s[30:31]
5637 ; VI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5639 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5640 ; VI-NEXT: s_add_u32 s34, s4, 16
5641 ; VI-NEXT: s_addc_u32 s35, s5, 0
5642 ; VI-NEXT: v_mov_b32_e32 v0, s34
5643 ; VI-NEXT: v_mov_b32_e32 v1, s35
5644 ; VI-NEXT: flat_load_dword v3, v[0:1]
5645 ; VI-NEXT: s_mov_b64 s[34:35], 0
5646 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start
5647 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5648 ; VI-NEXT: s_waitcnt vmcnt(0)
5649 ; VI-NEXT: v_max_u32_e32 v2, s6, v3
5650 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5651 ; VI-NEXT: s_waitcnt vmcnt(0)
5652 ; VI-NEXT: buffer_wbinvl1_vol
5653 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5654 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5655 ; VI-NEXT: v_mov_b32_e32 v3, v2
5656 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5657 ; VI-NEXT: s_cbranch_execnz .LBB102_1
5658 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5659 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5660 ; VI-NEXT: s_setpc_b64 s[30:31]
5662 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5664 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5665 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5666 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
5667 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5668 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
5669 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5670 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5671 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
5672 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
5673 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5674 ; GFX9-NEXT: buffer_wbinvl1_vol
5675 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5676 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5677 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5678 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5679 ; GFX9-NEXT: s_cbranch_execnz .LBB102_1
5680 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5681 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5682 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5683 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5684 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
; i32-returning seq_cst `atomicrmw umax` on a global (addrspace 1) pointer, with
; both the pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention. The expected output for every run line is a compare-and-swap
; retry loop built around v_max_u32:
;   * the SI checks use buffer_atomic_cmpswap and additionally save/restore
;     s6 and s7 through lanes 0 and 1 of v3 (v3 itself is spilled to the stack),
;   * the VI checks copy the pointer into VGPRs and use flat_atomic_cmpswap,
;   * the GFX9 checks use global_atomic_cmpswap with s[4:5] as the base.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5688 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5689 ; SI-LABEL: global_atomic_umax_i32_ret_scalar:
5691 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5692 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5693 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
5694 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5695 ; SI-NEXT: s_waitcnt expcnt(0)
5696 ; SI-NEXT: v_writelane_b32 v3, s6, 0
5697 ; SI-NEXT: v_writelane_b32 v3, s7, 1
5698 ; SI-NEXT: s_mov_b32 s34, s6
5699 ; SI-NEXT: s_mov_b32 s7, 0xf000
5700 ; SI-NEXT: s_mov_b32 s6, -1
5701 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
5702 ; SI-NEXT: s_mov_b64 s[36:37], 0
5703 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start
5704 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5705 ; SI-NEXT: s_waitcnt vmcnt(0)
5706 ; SI-NEXT: v_mov_b32_e32 v2, v0
5707 ; SI-NEXT: s_waitcnt expcnt(0)
5708 ; SI-NEXT: v_max_u32_e32 v1, s34, v2
5709 ; SI-NEXT: v_mov_b32_e32 v0, v1
5710 ; SI-NEXT: v_mov_b32_e32 v1, v2
5711 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
5712 ; SI-NEXT: s_waitcnt vmcnt(0)
5713 ; SI-NEXT: buffer_wbinvl1
5714 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
5715 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5716 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5717 ; SI-NEXT: s_cbranch_execnz .LBB103_1
5718 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5719 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5720 ; SI-NEXT: v_readlane_b32 s7, v3, 1
5721 ; SI-NEXT: v_readlane_b32 s6, v3, 0
5722 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5723 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
5724 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5725 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5726 ; SI-NEXT: s_setpc_b64 s[30:31]
5728 ; VI-LABEL: global_atomic_umax_i32_ret_scalar:
5730 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5731 ; VI-NEXT: v_mov_b32_e32 v0, s4
5732 ; VI-NEXT: v_mov_b32_e32 v1, s5
5733 ; VI-NEXT: flat_load_dword v0, v[0:1]
5734 ; VI-NEXT: v_mov_b32_e32 v1, s4
5735 ; VI-NEXT: s_mov_b64 s[34:35], 0
5736 ; VI-NEXT: v_mov_b32_e32 v2, s5
5737 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start
5738 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5739 ; VI-NEXT: s_waitcnt vmcnt(0)
5740 ; VI-NEXT: v_mov_b32_e32 v4, v0
5741 ; VI-NEXT: v_max_u32_e32 v3, s6, v4
5742 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5743 ; VI-NEXT: s_waitcnt vmcnt(0)
5744 ; VI-NEXT: buffer_wbinvl1_vol
5745 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5746 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5747 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5748 ; VI-NEXT: s_cbranch_execnz .LBB103_1
5749 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5750 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5751 ; VI-NEXT: s_setpc_b64 s[30:31]
5753 ; GFX9-LABEL: global_atomic_umax_i32_ret_scalar:
5755 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5756 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5757 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
5758 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5759 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
5760 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5761 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5762 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5763 ; GFX9-NEXT: v_max_u32_e32 v2, s6, v3
5764 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
5765 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5766 ; GFX9-NEXT: buffer_wbinvl1_vol
5767 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5768 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5769 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5770 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1
5771 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5772 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5773 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5774 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
; i32-returning seq_cst `atomicrmw umax` under amdgpu_gfx with `inreg`
; arguments, applied 4 i32 elements (16 bytes) past %out. All three run lines
; expect a compare-and-swap retry loop around v_max_u32. The SI and GFX9
; checks fold the displacement into the memory instructions as offset:16,
; while the VI checks add 16 to the scalar pointer (s_add_u32 / s_addc_u32)
; before the loop and then use flat instructions without an offset.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5778 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5779 ; SI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5781 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5782 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5783 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
5784 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5785 ; SI-NEXT: s_waitcnt expcnt(0)
5786 ; SI-NEXT: v_writelane_b32 v3, s6, 0
5787 ; SI-NEXT: v_writelane_b32 v3, s7, 1
5788 ; SI-NEXT: s_mov_b32 s34, s6
5789 ; SI-NEXT: s_mov_b32 s7, 0xf000
5790 ; SI-NEXT: s_mov_b32 s6, -1
5791 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
5792 ; SI-NEXT: s_mov_b64 s[36:37], 0
5793 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start
5794 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5795 ; SI-NEXT: s_waitcnt vmcnt(0)
5796 ; SI-NEXT: v_mov_b32_e32 v2, v0
5797 ; SI-NEXT: s_waitcnt expcnt(0)
5798 ; SI-NEXT: v_max_u32_e32 v1, s34, v2
5799 ; SI-NEXT: v_mov_b32_e32 v0, v1
5800 ; SI-NEXT: v_mov_b32_e32 v1, v2
5801 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5802 ; SI-NEXT: s_waitcnt vmcnt(0)
5803 ; SI-NEXT: buffer_wbinvl1
5804 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
5805 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5806 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5807 ; SI-NEXT: s_cbranch_execnz .LBB104_1
5808 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5809 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5810 ; SI-NEXT: v_readlane_b32 s7, v3, 1
5811 ; SI-NEXT: v_readlane_b32 s6, v3, 0
5812 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5813 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
5814 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5815 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5816 ; SI-NEXT: s_setpc_b64 s[30:31]
5818 ; VI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5820 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5821 ; VI-NEXT: s_add_u32 s34, s4, 16
5822 ; VI-NEXT: s_addc_u32 s35, s5, 0
5823 ; VI-NEXT: v_mov_b32_e32 v1, s34
5824 ; VI-NEXT: v_mov_b32_e32 v2, s35
5825 ; VI-NEXT: flat_load_dword v0, v[1:2]
5826 ; VI-NEXT: s_mov_b64 s[34:35], 0
5827 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start
5828 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5829 ; VI-NEXT: s_waitcnt vmcnt(0)
5830 ; VI-NEXT: v_mov_b32_e32 v4, v0
5831 ; VI-NEXT: v_max_u32_e32 v3, s6, v4
5832 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5833 ; VI-NEXT: s_waitcnt vmcnt(0)
5834 ; VI-NEXT: buffer_wbinvl1_vol
5835 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5836 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5837 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5838 ; VI-NEXT: s_cbranch_execnz .LBB104_1
5839 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5840 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5841 ; VI-NEXT: s_setpc_b64 s[30:31]
5843 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5845 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5846 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5847 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
5848 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5849 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
5850 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5851 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5852 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5853 ; GFX9-NEXT: v_max_u32_e32 v2, s6, v3
5854 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
5855 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5856 ; GFX9-NEXT: buffer_wbinvl1_vol
5857 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5858 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5859 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5860 ; GFX9-NEXT: s_cbranch_execnz .LBB104_1
5861 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5862 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5863 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5864 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5865 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
; Kernel variant of seq_cst `atomicrmw umax` whose address is
; %out + %index elements + 4 elements (16 bytes); no visible instruction uses
; the atomicrmw result. In the expected output the kernel arguments come from
; one s_load_dwordx4, the index is sign-extended (s_ashr_i32 by 31) and scaled
; by 4 (s_lshl_b64 by 2), and the initial memory value is fetched with a
; scalar s_load_dword before entering the v_max_u32 / cmpswap retry loop.
; The SI and GFX9 checks keep the constant displacement as offset:16 on the
; cmpswap; the VI checks add 16 to the scalar address first.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5869 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
5870 ; SI-LABEL: atomic_umax_i32_addr64_offset:
5871 ; SI: ; %bb.0: ; %entry
5872 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
5873 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5874 ; SI-NEXT: s_ashr_i32 s5, s3, 31
5875 ; SI-NEXT: s_mov_b32 s4, s3
5876 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5877 ; SI-NEXT: s_add_u32 s4, s0, s4
5878 ; SI-NEXT: s_addc_u32 s5, s1, s5
5879 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
5880 ; SI-NEXT: s_mov_b64 s[0:1], 0
5881 ; SI-NEXT: s_mov_b32 s7, 0xf000
5882 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5883 ; SI-NEXT: v_mov_b32_e32 v1, s3
5884 ; SI-NEXT: s_mov_b32 s6, -1
5885 ; SI-NEXT: .LBB105_1: ; %atomicrmw.start
5886 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5887 ; SI-NEXT: v_max_u32_e32 v0, s2, v1
5888 ; SI-NEXT: s_waitcnt expcnt(0)
5889 ; SI-NEXT: v_mov_b32_e32 v3, v1
5890 ; SI-NEXT: v_mov_b32_e32 v2, v0
5891 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5892 ; SI-NEXT: s_waitcnt vmcnt(0)
5893 ; SI-NEXT: buffer_wbinvl1
5894 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5895 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5896 ; SI-NEXT: v_mov_b32_e32 v1, v2
5897 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5898 ; SI-NEXT: s_cbranch_execnz .LBB105_1
5899 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5902 ; VI-LABEL: atomic_umax_i32_addr64_offset:
5903 ; VI: ; %bb.0: ; %entry
5904 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
5905 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5906 ; VI-NEXT: s_ashr_i32 s5, s3, 31
5907 ; VI-NEXT: s_mov_b32 s4, s3
5908 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5909 ; VI-NEXT: s_add_u32 s4, s0, s4
5910 ; VI-NEXT: s_addc_u32 s5, s1, s5
5911 ; VI-NEXT: s_load_dword s3, s[4:5], 0x10
5912 ; VI-NEXT: s_add_u32 s4, s4, 16
5913 ; VI-NEXT: s_addc_u32 s5, s5, 0
5914 ; VI-NEXT: v_mov_b32_e32 v0, s4
5915 ; VI-NEXT: s_mov_b64 s[0:1], 0
5916 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5917 ; VI-NEXT: v_mov_b32_e32 v3, s3
5918 ; VI-NEXT: v_mov_b32_e32 v1, s5
5919 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start
5920 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5921 ; VI-NEXT: v_max_u32_e32 v2, s2, v3
5922 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5923 ; VI-NEXT: s_waitcnt vmcnt(0)
5924 ; VI-NEXT: buffer_wbinvl1_vol
5925 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5926 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5927 ; VI-NEXT: v_mov_b32_e32 v3, v2
5928 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5929 ; VI-NEXT: s_cbranch_execnz .LBB105_1
5930 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5933 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
5934 ; GFX9: ; %bb.0: ; %entry
5935 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
5936 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
5937 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5938 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5939 ; GFX9-NEXT: s_ashr_i32 s1, s7, 31
5940 ; GFX9-NEXT: s_mov_b32 s0, s7
5941 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
5942 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5943 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5944 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
5945 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5946 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5947 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
5948 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5949 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
5950 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
5951 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5952 ; GFX9-NEXT: buffer_wbinvl1_vol
5953 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5954 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5955 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5956 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
5957 ; GFX9-NEXT: s_cbranch_execnz .LBB105_1
5958 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5959 ; GFX9-NEXT: s_endpgm
5961 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5962 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
5963 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
; Kernel variant of seq_cst `atomicrmw umax` at
; %out + %index elements + 4 elements (16 bytes) whose result (%tmp0) is
; stored to %out2. The expected output loads the kernel arguments with an
; s_load_dwordx2 plus an s_load_dwordx4, builds the indexed address with
; s_ashr_i32 / s_lshl_b64 / s_add_u32 / s_addc_u32, runs a v_max_u32 / cmpswap
; retry loop, and after the loop restores exec and stores the value returned
; by the last cmpswap (buffer_store_dword, flat_store_dword or
; global_store_dword depending on the run line).
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5967 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
5968 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
5969 ; SI: ; %bb.0: ; %entry
5970 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
5971 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
5972 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5973 ; SI-NEXT: s_ashr_i32 s5, s9, 31
5974 ; SI-NEXT: s_mov_b32 s4, s9
5975 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5976 ; SI-NEXT: s_add_u32 s4, s0, s4
5977 ; SI-NEXT: s_addc_u32 s5, s1, s5
5978 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
5979 ; SI-NEXT: s_mov_b64 s[0:1], 0
5980 ; SI-NEXT: s_mov_b32 s7, 0xf000
5981 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5982 ; SI-NEXT: v_mov_b32_e32 v1, s6
5983 ; SI-NEXT: s_mov_b32 s6, -1
5984 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start
5985 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5986 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
5987 ; SI-NEXT: s_waitcnt expcnt(0)
5988 ; SI-NEXT: v_mov_b32_e32 v3, v1
5989 ; SI-NEXT: v_mov_b32_e32 v2, v0
5990 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5991 ; SI-NEXT: s_waitcnt vmcnt(0)
5992 ; SI-NEXT: buffer_wbinvl1
5993 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5994 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5995 ; SI-NEXT: v_mov_b32_e32 v1, v2
5996 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5997 ; SI-NEXT: s_cbranch_execnz .LBB106_1
5998 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5999 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
6000 ; SI-NEXT: s_mov_b32 s7, 0xf000
6001 ; SI-NEXT: s_mov_b32 s6, -1
6002 ; SI-NEXT: s_mov_b32 s4, s2
6003 ; SI-NEXT: s_mov_b32 s5, s3
6004 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
6007 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
6008 ; VI: ; %bb.0: ; %entry
6009 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
6010 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
6011 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6012 ; VI-NEXT: s_ashr_i32 s7, s5, 31
6013 ; VI-NEXT: s_mov_b32 s6, s5
6014 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
6015 ; VI-NEXT: s_add_u32 s6, s0, s6
6016 ; VI-NEXT: s_addc_u32 s7, s1, s7
6017 ; VI-NEXT: s_load_dword s5, s[6:7], 0x10
6018 ; VI-NEXT: s_add_u32 s6, s6, 16
6019 ; VI-NEXT: s_addc_u32 s7, s7, 0
6020 ; VI-NEXT: v_mov_b32_e32 v0, s6
6021 ; VI-NEXT: s_mov_b64 s[0:1], 0
6022 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6023 ; VI-NEXT: v_mov_b32_e32 v2, s5
6024 ; VI-NEXT: v_mov_b32_e32 v1, s7
6025 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start
6026 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6027 ; VI-NEXT: v_mov_b32_e32 v3, v2
6028 ; VI-NEXT: v_max_u32_e32 v2, s4, v3
6029 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6030 ; VI-NEXT: s_waitcnt vmcnt(0)
6031 ; VI-NEXT: buffer_wbinvl1_vol
6032 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6033 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6034 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6035 ; VI-NEXT: s_cbranch_execnz .LBB106_1
6036 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6037 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
6038 ; VI-NEXT: v_mov_b32_e32 v0, s2
6039 ; VI-NEXT: v_mov_b32_e32 v1, s3
6040 ; VI-NEXT: flat_store_dword v[0:1], v2
6043 ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
6044 ; GFX9: ; %bb.0: ; %entry
6045 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
6046 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
6047 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6048 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6049 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
6050 ; GFX9-NEXT: s_mov_b32 s2, s1
6051 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
6052 ; GFX9-NEXT: s_add_u32 s2, s4, s2
6053 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
6054 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10
6055 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6056 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6057 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
6058 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
6059 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6060 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6061 ; GFX9-NEXT: v_max_u32_e32 v2, s0, v3
6062 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc
6063 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6064 ; GFX9-NEXT: buffer_wbinvl1_vol
6065 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6066 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6067 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6068 ; GFX9-NEXT: s_cbranch_execnz .LBB106_1
6069 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6070 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6071 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6072 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
6073 ; GFX9-NEXT: s_endpgm
6075 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
6076 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
6077 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
6078 store i32 %tmp0, ptr addrspace(1) %out2
; Kernel variant of seq_cst `atomicrmw umax` at %out + %index elements (no
; constant displacement) whose result (%tmp0) is stored to %out2. The expected
; output matches the shape of the other indexed kernel tests in this file
; section: scalar argument loads, sign-extend and scale the index, a scalar
; load of the initial value at offset 0x0, a v_max_u32 / cmpswap retry loop
; with no offset on the cmpswap, then a store of the value returned by the
; final cmpswap.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6082 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
6083 ; SI-LABEL: atomic_umax_i32_ret_addr64:
6084 ; SI: ; %bb.0: ; %entry
6085 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
6086 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
6087 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6088 ; SI-NEXT: s_ashr_i32 s5, s9, 31
6089 ; SI-NEXT: s_mov_b32 s4, s9
6090 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6091 ; SI-NEXT: s_add_u32 s4, s0, s4
6092 ; SI-NEXT: s_addc_u32 s5, s1, s5
6093 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
6094 ; SI-NEXT: s_mov_b64 s[0:1], 0
6095 ; SI-NEXT: s_mov_b32 s7, 0xf000
6096 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6097 ; SI-NEXT: v_mov_b32_e32 v1, s6
6098 ; SI-NEXT: s_mov_b32 s6, -1
6099 ; SI-NEXT: .LBB107_1: ; %atomicrmw.start
6100 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6101 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
6102 ; SI-NEXT: s_waitcnt expcnt(0)
6103 ; SI-NEXT: v_mov_b32_e32 v3, v1
6104 ; SI-NEXT: v_mov_b32_e32 v2, v0
6105 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
6106 ; SI-NEXT: s_waitcnt vmcnt(0)
6107 ; SI-NEXT: buffer_wbinvl1
6108 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
6109 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6110 ; SI-NEXT: v_mov_b32_e32 v1, v2
6111 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6112 ; SI-NEXT: s_cbranch_execnz .LBB107_1
6113 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6114 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
6115 ; SI-NEXT: s_mov_b32 s7, 0xf000
6116 ; SI-NEXT: s_mov_b32 s6, -1
6117 ; SI-NEXT: s_mov_b32 s4, s2
6118 ; SI-NEXT: s_mov_b32 s5, s3
6119 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
6122 ; VI-LABEL: atomic_umax_i32_ret_addr64:
6123 ; VI: ; %bb.0: ; %entry
6124 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
6125 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
6126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6127 ; VI-NEXT: s_ashr_i32 s7, s5, 31
6128 ; VI-NEXT: s_mov_b32 s6, s5
6129 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
6130 ; VI-NEXT: s_add_u32 s6, s0, s6
6131 ; VI-NEXT: s_addc_u32 s7, s1, s7
6132 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
6133 ; VI-NEXT: v_mov_b32_e32 v0, s6
6134 ; VI-NEXT: s_mov_b64 s[0:1], 0
6135 ; VI-NEXT: v_mov_b32_e32 v1, s7
6136 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6137 ; VI-NEXT: v_mov_b32_e32 v2, s5
6138 ; VI-NEXT: .LBB107_1: ; %atomicrmw.start
6139 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6140 ; VI-NEXT: v_mov_b32_e32 v3, v2
6141 ; VI-NEXT: v_max_u32_e32 v2, s4, v3
6142 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6143 ; VI-NEXT: s_waitcnt vmcnt(0)
6144 ; VI-NEXT: buffer_wbinvl1_vol
6145 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6146 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6147 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6148 ; VI-NEXT: s_cbranch_execnz .LBB107_1
6149 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6150 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
6151 ; VI-NEXT: v_mov_b32_e32 v0, s2
6152 ; VI-NEXT: v_mov_b32_e32 v1, s3
6153 ; VI-NEXT: flat_store_dword v[0:1], v2
6156 ; GFX9-LABEL: atomic_umax_i32_ret_addr64:
6157 ; GFX9: ; %bb.0: ; %entry
6158 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
6159 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
6160 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6161 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6162 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
6163 ; GFX9-NEXT: s_mov_b32 s2, s1
6164 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
6165 ; GFX9-NEXT: s_add_u32 s2, s4, s2
6166 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
6167 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
6168 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6169 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6170 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
6171 ; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start
6172 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6173 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6174 ; GFX9-NEXT: v_max_u32_e32 v2, s0, v3
6175 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc
6176 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6177 ; GFX9-NEXT: buffer_wbinvl1_vol
6178 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6179 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6180 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6181 ; GFX9-NEXT: s_cbranch_execnz .LBB107_1
6182 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6183 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6184 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6185 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
6186 ; GFX9-NEXT: s_endpgm
6188 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
6189 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
6190 store i32 %tmp0, ptr addrspace(1) %out2
; Void seq_cst `atomicrmw umax` at 4 i32 elements (16 bytes) past %out, with
; the atomicrmw tagged `!amdgpu.no.remote.memory !0` (the definition of !0 is
; outside this part of the file). The recorded output for all three run lines
; is still a v_max_u32 / compare-and-swap retry loop. The SI and GFX9 checks
; encode the displacement as offset:16; the VI checks add 16 to the VGPR
; pointer pair (v_add_u32 / v_addc_u32) before the loop.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6194 define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
6195 ; SI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6197 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6198 ; SI-NEXT: s_mov_b32 s6, 0
6199 ; SI-NEXT: s_mov_b32 s7, 0xf000
6200 ; SI-NEXT: s_mov_b32 s4, s6
6201 ; SI-NEXT: s_mov_b32 s5, s6
6202 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6203 ; SI-NEXT: s_mov_b64 s[8:9], 0
6204 ; SI-NEXT: .LBB108_1: ; %atomicrmw.start
6205 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6206 ; SI-NEXT: s_waitcnt vmcnt(0)
6207 ; SI-NEXT: v_max_u32_e32 v3, v4, v2
6208 ; SI-NEXT: s_waitcnt expcnt(0)
6209 ; SI-NEXT: v_mov_b32_e32 v6, v4
6210 ; SI-NEXT: v_mov_b32_e32 v5, v3
6211 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6212 ; SI-NEXT: s_waitcnt vmcnt(0)
6213 ; SI-NEXT: buffer_wbinvl1
6214 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6215 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6216 ; SI-NEXT: v_mov_b32_e32 v4, v5
6217 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6218 ; SI-NEXT: s_cbranch_execnz .LBB108_1
6219 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6220 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6221 ; SI-NEXT: s_waitcnt expcnt(0)
6222 ; SI-NEXT: s_setpc_b64 s[30:31]
6224 ; VI-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6226 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6227 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6228 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6229 ; VI-NEXT: flat_load_dword v4, v[0:1]
6230 ; VI-NEXT: s_mov_b64 s[4:5], 0
6231 ; VI-NEXT: .LBB108_1: ; %atomicrmw.start
6232 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6233 ; VI-NEXT: s_waitcnt vmcnt(0)
6234 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
6235 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6236 ; VI-NEXT: s_waitcnt vmcnt(0)
6237 ; VI-NEXT: buffer_wbinvl1_vol
6238 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6239 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6240 ; VI-NEXT: v_mov_b32_e32 v4, v3
6241 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6242 ; VI-NEXT: s_cbranch_execnz .LBB108_1
6243 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6244 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6245 ; VI-NEXT: s_setpc_b64 s[30:31]
6247 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
6249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6250 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
6251 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6252 ; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start
6253 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6254 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6255 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
6256 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6257 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6258 ; GFX9-NEXT: buffer_wbinvl1_vol
6259 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6260 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6261 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6262 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6263 ; GFX9-NEXT: s_cbranch_execnz .LBB108_1
6264 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6265 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6266 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6267 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6268 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; i32-returning seq_cst `atomicrmw umax` at 4 i32 elements (16 bytes) past
; %out, with the atomicrmw tagged `!amdgpu.no.remote.memory !0` (the
; definition of !0 is outside this part of the file). The recorded output for
; all three run lines is still a v_max_u32 / compare-and-swap retry loop.
; The SI and GFX9 checks copy the value returned by the last cmpswap (v3)
; into v0 after the loop; in the VI checks the cmpswap already writes v0.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6272 define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
6273 ; SI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6275 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6276 ; SI-NEXT: s_mov_b32 s6, 0
6277 ; SI-NEXT: s_mov_b32 s7, 0xf000
6278 ; SI-NEXT: s_mov_b32 s4, s6
6279 ; SI-NEXT: s_mov_b32 s5, s6
6280 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6281 ; SI-NEXT: s_mov_b64 s[8:9], 0
6282 ; SI-NEXT: .LBB109_1: ; %atomicrmw.start
6283 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6284 ; SI-NEXT: s_waitcnt vmcnt(0)
6285 ; SI-NEXT: v_mov_b32_e32 v5, v3
6286 ; SI-NEXT: s_waitcnt expcnt(0)
6287 ; SI-NEXT: v_max_u32_e32 v4, v5, v2
6288 ; SI-NEXT: v_mov_b32_e32 v3, v4
6289 ; SI-NEXT: v_mov_b32_e32 v4, v5
6290 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
6291 ; SI-NEXT: s_waitcnt vmcnt(0)
6292 ; SI-NEXT: buffer_wbinvl1
6293 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6294 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6295 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6296 ; SI-NEXT: s_cbranch_execnz .LBB109_1
6297 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6298 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6299 ; SI-NEXT: v_mov_b32_e32 v0, v3
6300 ; SI-NEXT: s_waitcnt expcnt(0)
6301 ; SI-NEXT: s_setpc_b64 s[30:31]
6303 ; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6305 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6306 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
6307 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6308 ; VI-NEXT: flat_load_dword v0, v[3:4]
6309 ; VI-NEXT: s_mov_b64 s[4:5], 0
6310 ; VI-NEXT: .LBB109_1: ; %atomicrmw.start
6311 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6312 ; VI-NEXT: s_waitcnt vmcnt(0)
6313 ; VI-NEXT: v_mov_b32_e32 v1, v0
6314 ; VI-NEXT: v_max_u32_e32 v0, v1, v2
6315 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6316 ; VI-NEXT: s_waitcnt vmcnt(0)
6317 ; VI-NEXT: buffer_wbinvl1_vol
6318 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6319 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6320 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6321 ; VI-NEXT: s_cbranch_execnz .LBB109_1
6322 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6323 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6324 ; VI-NEXT: s_setpc_b64 s[30:31]
6326 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
6328 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6329 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
6330 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6331 ; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start
6332 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6333 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6334 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6335 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
6336 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6337 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6338 ; GFX9-NEXT: buffer_wbinvl1_vol
6339 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6340 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6341 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6342 ; GFX9-NEXT: s_cbranch_execnz .LBB109_1
6343 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6344 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6345 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6346 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6347 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6348 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6352 ; ---------------------------------------------------------------------
6354 ; ---------------------------------------------------------------------
; Void seq_cst `atomicrmw umin` directly on %ptr (no offset). The expected
; output for all three run lines is a compare-and-swap retry loop around
; v_min_u32: an initial load of the current value, then min / cmpswap /
; compare until every lane's cmpswap returned the value it expected. The SI
; checks use addr64 buffer instructions with a descriptor built in s[4:7],
; the VI checks use flat instructions and the GFX9 checks use global
; instructions.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6356 define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
6357 ; SI-LABEL: global_atomic_umin_i32_noret:
6359 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6360 ; SI-NEXT: s_mov_b32 s6, 0
6361 ; SI-NEXT: s_mov_b32 s7, 0xf000
6362 ; SI-NEXT: s_mov_b32 s4, s6
6363 ; SI-NEXT: s_mov_b32 s5, s6
6364 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
6365 ; SI-NEXT: s_mov_b64 s[8:9], 0
6366 ; SI-NEXT: .LBB110_1: ; %atomicrmw.start
6367 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6368 ; SI-NEXT: s_waitcnt vmcnt(0)
6369 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
6370 ; SI-NEXT: s_waitcnt expcnt(0)
6371 ; SI-NEXT: v_mov_b32_e32 v6, v4
6372 ; SI-NEXT: v_mov_b32_e32 v5, v3
6373 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
6374 ; SI-NEXT: s_waitcnt vmcnt(0)
6375 ; SI-NEXT: buffer_wbinvl1
6376 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6377 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6378 ; SI-NEXT: v_mov_b32_e32 v4, v5
6379 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6380 ; SI-NEXT: s_cbranch_execnz .LBB110_1
6381 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6382 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6383 ; SI-NEXT: s_waitcnt expcnt(0)
6384 ; SI-NEXT: s_setpc_b64 s[30:31]
6386 ; VI-LABEL: global_atomic_umin_i32_noret:
6388 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6389 ; VI-NEXT: flat_load_dword v4, v[0:1]
6390 ; VI-NEXT: s_mov_b64 s[4:5], 0
6391 ; VI-NEXT: .LBB110_1: ; %atomicrmw.start
6392 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6393 ; VI-NEXT: s_waitcnt vmcnt(0)
6394 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6395 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6396 ; VI-NEXT: s_waitcnt vmcnt(0)
6397 ; VI-NEXT: buffer_wbinvl1_vol
6398 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6399 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6400 ; VI-NEXT: v_mov_b32_e32 v4, v3
6401 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6402 ; VI-NEXT: s_cbranch_execnz .LBB110_1
6403 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6404 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6405 ; VI-NEXT: s_setpc_b64 s[30:31]
6407 ; GFX9-LABEL: global_atomic_umin_i32_noret:
6409 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6410 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
6411 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6412 ; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start
6413 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6414 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6415 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6416 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6417 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6418 ; GFX9-NEXT: buffer_wbinvl1_vol
6419 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6420 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6421 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6422 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6423 ; GFX9-NEXT: s_cbranch_execnz .LBB110_1
6424 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6425 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6426 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6427 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
; Void seq_cst `atomicrmw umin` at 4 i32 elements (16 bytes) past %out. The
; expected output for all three run lines is a v_min_u32 / compare-and-swap
; retry loop. The SI and GFX9 checks carry the displacement as offset:16 on
; both the initial load and the cmpswap; the VI checks add 16 to the VGPR
; pointer pair (v_add_u32 / v_addc_u32) up front and use flat instructions
; without an offset.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6431 define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
6432 ; SI-LABEL: global_atomic_umin_i32_noret_offset:
6434 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6435 ; SI-NEXT: s_mov_b32 s6, 0
6436 ; SI-NEXT: s_mov_b32 s7, 0xf000
6437 ; SI-NEXT: s_mov_b32 s4, s6
6438 ; SI-NEXT: s_mov_b32 s5, s6
6439 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6440 ; SI-NEXT: s_mov_b64 s[8:9], 0
6441 ; SI-NEXT: .LBB111_1: ; %atomicrmw.start
6442 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6443 ; SI-NEXT: s_waitcnt vmcnt(0)
6444 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
6445 ; SI-NEXT: s_waitcnt expcnt(0)
6446 ; SI-NEXT: v_mov_b32_e32 v6, v4
6447 ; SI-NEXT: v_mov_b32_e32 v5, v3
6448 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6449 ; SI-NEXT: s_waitcnt vmcnt(0)
6450 ; SI-NEXT: buffer_wbinvl1
6451 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6452 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6453 ; SI-NEXT: v_mov_b32_e32 v4, v5
6454 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6455 ; SI-NEXT: s_cbranch_execnz .LBB111_1
6456 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6457 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6458 ; SI-NEXT: s_waitcnt expcnt(0)
6459 ; SI-NEXT: s_setpc_b64 s[30:31]
6461 ; VI-LABEL: global_atomic_umin_i32_noret_offset:
6463 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6464 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6465 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6466 ; VI-NEXT: flat_load_dword v4, v[0:1]
6467 ; VI-NEXT: s_mov_b64 s[4:5], 0
6468 ; VI-NEXT: .LBB111_1: ; %atomicrmw.start
6469 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6470 ; VI-NEXT: s_waitcnt vmcnt(0)
6471 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6472 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6473 ; VI-NEXT: s_waitcnt vmcnt(0)
6474 ; VI-NEXT: buffer_wbinvl1_vol
6475 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6476 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6477 ; VI-NEXT: v_mov_b32_e32 v4, v3
6478 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6479 ; VI-NEXT: s_cbranch_execnz .LBB111_1
6480 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6481 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6482 ; VI-NEXT: s_setpc_b64 s[30:31]
6484 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset:
6486 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6487 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
6488 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6489 ; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
6490 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6492 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6493 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6494 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6495 ; GFX9-NEXT: buffer_wbinvl1_vol
6496 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6497 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6498 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6499 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6500 ; GFX9-NEXT: s_cbranch_execnz .LBB111_1
6501 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6502 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6503 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6504 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6505 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) directly on the global pointer %ptr, in a
; function that returns i32. The checks for all three llc configurations
; expect an initial load followed by a v_min_u32 + cmpswap retry loop, and
; then a copy of the value returned by the last cmpswap (v3) into v0 just
; before the s_setpc_b64 return.
6509 define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
6510 ; SI-LABEL: global_atomic_umin_i32_ret:
6512 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6513 ; SI-NEXT: s_mov_b32 s6, 0
6514 ; SI-NEXT: s_mov_b32 s7, 0xf000
6515 ; SI-NEXT: s_mov_b32 s4, s6
6516 ; SI-NEXT: s_mov_b32 s5, s6
6517 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
6518 ; SI-NEXT: s_mov_b64 s[8:9], 0
6519 ; SI-NEXT: .LBB112_1: ; %atomicrmw.start
6520 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6521 ; SI-NEXT: s_waitcnt vmcnt(0)
6522 ; SI-NEXT: v_mov_b32_e32 v5, v3
6523 ; SI-NEXT: s_waitcnt expcnt(0)
6524 ; SI-NEXT: v_min_u32_e32 v4, v5, v2
6525 ; SI-NEXT: v_mov_b32_e32 v3, v4
6526 ; SI-NEXT: v_mov_b32_e32 v4, v5
6527 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
6528 ; SI-NEXT: s_waitcnt vmcnt(0)
6529 ; SI-NEXT: buffer_wbinvl1
6530 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6531 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6532 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6533 ; SI-NEXT: s_cbranch_execnz .LBB112_1
6534 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6535 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6536 ; SI-NEXT: v_mov_b32_e32 v0, v3
6537 ; SI-NEXT: s_waitcnt expcnt(0)
6538 ; SI-NEXT: s_setpc_b64 s[30:31]
6540 ; VI-LABEL: global_atomic_umin_i32_ret:
6542 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6543 ; VI-NEXT: flat_load_dword v3, v[0:1]
6544 ; VI-NEXT: s_mov_b64 s[4:5], 0
6545 ; VI-NEXT: .LBB112_1: ; %atomicrmw.start
6546 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6547 ; VI-NEXT: s_waitcnt vmcnt(0)
6548 ; VI-NEXT: v_mov_b32_e32 v4, v3
6549 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6550 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6551 ; VI-NEXT: s_waitcnt vmcnt(0)
6552 ; VI-NEXT: buffer_wbinvl1_vol
6553 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6554 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6555 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6556 ; VI-NEXT: s_cbranch_execnz .LBB112_1
6557 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6558 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6559 ; VI-NEXT: v_mov_b32_e32 v0, v3
6560 ; VI-NEXT: s_setpc_b64 s[30:31]
6562 ; GFX9-LABEL: global_atomic_umin_i32_ret:
6564 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6565 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
6566 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6567 ; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
6568 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6569 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6570 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6571 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6572 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6573 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6574 ; GFX9-NEXT: buffer_wbinvl1_vol
6575 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6576 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6577 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6578 ; GFX9-NEXT: s_cbranch_execnz .LBB112_1
6579 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6580 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6581 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6582 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6583 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) on a global pointer taken at i32 element
; index 4 of %out, in a function that returns i32. All three sets of checks
; expect a load + v_min_u32 + cmpswap retry loop. The SI and GFX9 checks keep
; the displacement as offset:16 on the memory instructions and copy v3 into
; v0 after the loop; the VI checks compute the address into v[3:4] first and
; let the cmpswap write its result straight into v0, so no trailing copy is
; expected there.
6587 define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
6588 ; SI-LABEL: global_atomic_umin_i32_ret_offset:
6590 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6591 ; SI-NEXT: s_mov_b32 s6, 0
6592 ; SI-NEXT: s_mov_b32 s7, 0xf000
6593 ; SI-NEXT: s_mov_b32 s4, s6
6594 ; SI-NEXT: s_mov_b32 s5, s6
6595 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6596 ; SI-NEXT: s_mov_b64 s[8:9], 0
6597 ; SI-NEXT: .LBB113_1: ; %atomicrmw.start
6598 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6599 ; SI-NEXT: s_waitcnt vmcnt(0)
6600 ; SI-NEXT: v_mov_b32_e32 v5, v3
6601 ; SI-NEXT: s_waitcnt expcnt(0)
6602 ; SI-NEXT: v_min_u32_e32 v4, v5, v2
6603 ; SI-NEXT: v_mov_b32_e32 v3, v4
6604 ; SI-NEXT: v_mov_b32_e32 v4, v5
6605 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
6606 ; SI-NEXT: s_waitcnt vmcnt(0)
6607 ; SI-NEXT: buffer_wbinvl1
6608 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6609 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6610 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6611 ; SI-NEXT: s_cbranch_execnz .LBB113_1
6612 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6613 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6614 ; SI-NEXT: v_mov_b32_e32 v0, v3
6615 ; SI-NEXT: s_waitcnt expcnt(0)
6616 ; SI-NEXT: s_setpc_b64 s[30:31]
6618 ; VI-LABEL: global_atomic_umin_i32_ret_offset:
6620 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6621 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
6622 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6623 ; VI-NEXT: flat_load_dword v0, v[3:4]
6624 ; VI-NEXT: s_mov_b64 s[4:5], 0
6625 ; VI-NEXT: .LBB113_1: ; %atomicrmw.start
6626 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6627 ; VI-NEXT: s_waitcnt vmcnt(0)
6628 ; VI-NEXT: v_mov_b32_e32 v1, v0
6629 ; VI-NEXT: v_min_u32_e32 v0, v1, v2
6630 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6631 ; VI-NEXT: s_waitcnt vmcnt(0)
6632 ; VI-NEXT: buffer_wbinvl1_vol
6633 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6634 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6635 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6636 ; VI-NEXT: s_cbranch_execnz .LBB113_1
6637 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6638 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6639 ; VI-NEXT: s_setpc_b64 s[30:31]
6641 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset:
6643 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6644 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
6645 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6646 ; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
6647 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6648 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6649 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6650 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6651 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6652 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6653 ; GFX9-NEXT: buffer_wbinvl1_vol
6654 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6655 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6656 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6657 ; GFX9-NEXT: s_cbranch_execnz .LBB113_1
6658 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6659 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6660 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6661 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6662 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6663 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) with the result unused, using the
; amdgpu_gfx calling convention with both arguments marked inreg. In the
; checks the pointer is read from s[4:5] and %in from s6. All three sets of
; checks expect a load + v_min_u32 + cmpswap retry loop.
; In the SI checks, s6/s7 are overwritten (-1 / 0xf000) to complete the
; s[4:7] operand of the buffer instructions, so their incoming values are
; first saved into lanes 0/1 of v4 (itself spilled to the stack around the
; function) and restored with v_readlane after the loop; %in is kept in s34.
; The VI checks copy the pointer into v[0:1] for the flat instructions, and
; the GFX9 checks address through s[4:5] with a zero VGPR offset (v2).
6667 define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6668 ; SI-LABEL: global_atomic_umin_i32_noret_scalar:
6670 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6671 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6672 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
6673 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6674 ; SI-NEXT: s_waitcnt expcnt(0)
6675 ; SI-NEXT: v_writelane_b32 v4, s6, 0
6676 ; SI-NEXT: v_writelane_b32 v4, s7, 1
6677 ; SI-NEXT: s_mov_b32 s34, s6
6678 ; SI-NEXT: s_mov_b32 s7, 0xf000
6679 ; SI-NEXT: s_mov_b32 s6, -1
6680 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
6681 ; SI-NEXT: s_mov_b64 s[36:37], 0
6682 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start
6683 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6684 ; SI-NEXT: s_waitcnt vmcnt(0)
6685 ; SI-NEXT: v_min_u32_e32 v0, s34, v1
6686 ; SI-NEXT: s_waitcnt expcnt(0)
6687 ; SI-NEXT: v_mov_b32_e32 v3, v1
6688 ; SI-NEXT: v_mov_b32_e32 v2, v0
6689 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
6690 ; SI-NEXT: s_waitcnt vmcnt(0)
6691 ; SI-NEXT: buffer_wbinvl1
6692 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
6693 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6694 ; SI-NEXT: v_mov_b32_e32 v1, v2
6695 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6696 ; SI-NEXT: s_cbranch_execnz .LBB114_1
6697 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6698 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6699 ; SI-NEXT: v_readlane_b32 s7, v4, 1
6700 ; SI-NEXT: v_readlane_b32 s6, v4, 0
6701 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6702 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
6703 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6704 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6705 ; SI-NEXT: s_setpc_b64 s[30:31]
6707 ; VI-LABEL: global_atomic_umin_i32_noret_scalar:
6709 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6710 ; VI-NEXT: v_mov_b32_e32 v0, s4
6711 ; VI-NEXT: v_mov_b32_e32 v1, s5
6712 ; VI-NEXT: flat_load_dword v3, v[0:1]
6713 ; VI-NEXT: s_mov_b64 s[34:35], 0
6714 ; VI-NEXT: .LBB114_1: ; %atomicrmw.start
6715 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6716 ; VI-NEXT: s_waitcnt vmcnt(0)
6717 ; VI-NEXT: v_min_u32_e32 v2, s6, v3
6718 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6719 ; VI-NEXT: s_waitcnt vmcnt(0)
6720 ; VI-NEXT: buffer_wbinvl1_vol
6721 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6722 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6723 ; VI-NEXT: v_mov_b32_e32 v3, v2
6724 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6725 ; VI-NEXT: s_cbranch_execnz .LBB114_1
6726 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6727 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6728 ; VI-NEXT: s_setpc_b64 s[30:31]
6730 ; GFX9-LABEL: global_atomic_umin_i32_noret_scalar:
6732 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6733 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6734 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
6735 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6736 ; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
6737 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6739 ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1
6740 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
6741 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6742 ; GFX9-NEXT: buffer_wbinvl1_vol
6743 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6744 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6745 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
6746 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6747 ; GFX9-NEXT: s_cbranch_execnz .LBB114_1
6748 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6749 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6750 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6751 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) with the result unused, on a pointer taken
; at i32 element index 4 of %out, using the amdgpu_gfx calling convention
; with both arguments marked inreg (pointer read from s[4:5], %in from s6 in
; the checks). All three sets of checks expect a load + v_min_u32 + cmpswap
; retry loop.
; The SI and GFX9 checks keep the displacement as offset:16 on the memory
; instructions; the VI checks compute the address with s_add_u32/s_addc_u32
; into s[34:35] and copy it to v[0:1] before s[34:35] is reused as the loop's
; exit mask. As in the SI checks' prologue/epilogue, s6/s7 are saved to lanes
; of v4 and restored because they are overwritten to complete s[4:7].
6755 define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6756 ; SI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6758 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6759 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6760 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
6761 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6762 ; SI-NEXT: s_waitcnt expcnt(0)
6763 ; SI-NEXT: v_writelane_b32 v4, s6, 0
6764 ; SI-NEXT: v_writelane_b32 v4, s7, 1
6765 ; SI-NEXT: s_mov_b32 s34, s6
6766 ; SI-NEXT: s_mov_b32 s7, 0xf000
6767 ; SI-NEXT: s_mov_b32 s6, -1
6768 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
6769 ; SI-NEXT: s_mov_b64 s[36:37], 0
6770 ; SI-NEXT: .LBB115_1: ; %atomicrmw.start
6771 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6772 ; SI-NEXT: s_waitcnt vmcnt(0)
6773 ; SI-NEXT: v_min_u32_e32 v0, s34, v1
6774 ; SI-NEXT: s_waitcnt expcnt(0)
6775 ; SI-NEXT: v_mov_b32_e32 v3, v1
6776 ; SI-NEXT: v_mov_b32_e32 v2, v0
6777 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
6778 ; SI-NEXT: s_waitcnt vmcnt(0)
6779 ; SI-NEXT: buffer_wbinvl1
6780 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
6781 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6782 ; SI-NEXT: v_mov_b32_e32 v1, v2
6783 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6784 ; SI-NEXT: s_cbranch_execnz .LBB115_1
6785 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6786 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6787 ; SI-NEXT: v_readlane_b32 s7, v4, 1
6788 ; SI-NEXT: v_readlane_b32 s6, v4, 0
6789 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6790 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
6791 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6792 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6793 ; SI-NEXT: s_setpc_b64 s[30:31]
6795 ; VI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6797 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6798 ; VI-NEXT: s_add_u32 s34, s4, 16
6799 ; VI-NEXT: s_addc_u32 s35, s5, 0
6800 ; VI-NEXT: v_mov_b32_e32 v0, s34
6801 ; VI-NEXT: v_mov_b32_e32 v1, s35
6802 ; VI-NEXT: flat_load_dword v3, v[0:1]
6803 ; VI-NEXT: s_mov_b64 s[34:35], 0
6804 ; VI-NEXT: .LBB115_1: ; %atomicrmw.start
6805 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6806 ; VI-NEXT: s_waitcnt vmcnt(0)
6807 ; VI-NEXT: v_min_u32_e32 v2, s6, v3
6808 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6809 ; VI-NEXT: s_waitcnt vmcnt(0)
6810 ; VI-NEXT: buffer_wbinvl1_vol
6811 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6812 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6813 ; VI-NEXT: v_mov_b32_e32 v3, v2
6814 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6815 ; VI-NEXT: s_cbranch_execnz .LBB115_1
6816 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6817 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6818 ; VI-NEXT: s_setpc_b64 s[30:31]
6820 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6822 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6823 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6824 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
6825 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6826 ; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start
6827 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6828 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6829 ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1
6830 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
6831 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6832 ; GFX9-NEXT: buffer_wbinvl1_vol
6833 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6834 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6835 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
6836 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6837 ; GFX9-NEXT: s_cbranch_execnz .LBB115_1
6838 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6839 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6840 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6841 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6842 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) directly on %ptr in a function returning
; i32, using the amdgpu_gfx calling convention with both arguments marked
; inreg (pointer read from s[4:5], %in from s6 in the checks). All three sets
; of checks expect a load + v_min_u32 + cmpswap retry loop whose cmpswap
; writes its result into v0, so no trailing copy into v0 is expected.
; In the SI checks, s6/s7 are saved to lanes 0/1 of v3 and restored after the
; loop because they are overwritten to complete s[4:7]. In the VI checks the
; pointer is first copied to v[0:1] for the initial load and then copied
; again into v[1:2] for the loop, since the load's destination is v0.
6846 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6847 ; SI-LABEL: global_atomic_umin_i32_ret_scalar:
6849 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6850 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6851 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
6852 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6853 ; SI-NEXT: s_waitcnt expcnt(0)
6854 ; SI-NEXT: v_writelane_b32 v3, s6, 0
6855 ; SI-NEXT: v_writelane_b32 v3, s7, 1
6856 ; SI-NEXT: s_mov_b32 s34, s6
6857 ; SI-NEXT: s_mov_b32 s7, 0xf000
6858 ; SI-NEXT: s_mov_b32 s6, -1
6859 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
6860 ; SI-NEXT: s_mov_b64 s[36:37], 0
6861 ; SI-NEXT: .LBB116_1: ; %atomicrmw.start
6862 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6863 ; SI-NEXT: s_waitcnt vmcnt(0)
6864 ; SI-NEXT: v_mov_b32_e32 v2, v0
6865 ; SI-NEXT: s_waitcnt expcnt(0)
6866 ; SI-NEXT: v_min_u32_e32 v1, s34, v2
6867 ; SI-NEXT: v_mov_b32_e32 v0, v1
6868 ; SI-NEXT: v_mov_b32_e32 v1, v2
6869 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
6870 ; SI-NEXT: s_waitcnt vmcnt(0)
6871 ; SI-NEXT: buffer_wbinvl1
6872 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
6873 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6874 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6875 ; SI-NEXT: s_cbranch_execnz .LBB116_1
6876 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6877 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6878 ; SI-NEXT: v_readlane_b32 s7, v3, 1
6879 ; SI-NEXT: v_readlane_b32 s6, v3, 0
6880 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6881 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
6882 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6883 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6884 ; SI-NEXT: s_setpc_b64 s[30:31]
6886 ; VI-LABEL: global_atomic_umin_i32_ret_scalar:
6888 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6889 ; VI-NEXT: v_mov_b32_e32 v0, s4
6890 ; VI-NEXT: v_mov_b32_e32 v1, s5
6891 ; VI-NEXT: flat_load_dword v0, v[0:1]
6892 ; VI-NEXT: v_mov_b32_e32 v1, s4
6893 ; VI-NEXT: s_mov_b64 s[34:35], 0
6894 ; VI-NEXT: v_mov_b32_e32 v2, s5
6895 ; VI-NEXT: .LBB116_1: ; %atomicrmw.start
6896 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6897 ; VI-NEXT: s_waitcnt vmcnt(0)
6898 ; VI-NEXT: v_mov_b32_e32 v4, v0
6899 ; VI-NEXT: v_min_u32_e32 v3, s6, v4
6900 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6901 ; VI-NEXT: s_waitcnt vmcnt(0)
6902 ; VI-NEXT: buffer_wbinvl1_vol
6903 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6904 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6905 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6906 ; VI-NEXT: s_cbranch_execnz .LBB116_1
6907 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6908 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6909 ; VI-NEXT: s_setpc_b64 s[30:31]
6911 ; GFX9-LABEL: global_atomic_umin_i32_ret_scalar:
6913 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6914 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6915 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
6916 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6917 ; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start
6918 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6920 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6921 ; GFX9-NEXT: v_min_u32_e32 v2, s6, v3
6922 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
6923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6924 ; GFX9-NEXT: buffer_wbinvl1_vol
6925 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6926 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6927 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6928 ; GFX9-NEXT: s_cbranch_execnz .LBB116_1
6929 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6930 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6931 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6932 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) on a pointer taken at i32 element index 4
; of %out, in a function returning i32 and using the amdgpu_gfx calling
; convention with both arguments marked inreg (pointer read from s[4:5], %in
; from s6 in the checks). All three sets of checks expect a load + v_min_u32
; + cmpswap retry loop whose cmpswap writes its result into v0.
; The SI and GFX9 checks keep the displacement as offset:16 on the memory
; instructions; the VI checks compute the address with s_add_u32/s_addc_u32
; and copy it into v[1:2]. In the SI checks, s6/s7 are saved to lanes 0/1 of
; v3 and restored after the loop because they are overwritten to complete
; s[4:7].
6936 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6937 ; SI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6939 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6940 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6941 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
6942 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6943 ; SI-NEXT: s_waitcnt expcnt(0)
6944 ; SI-NEXT: v_writelane_b32 v3, s6, 0
6945 ; SI-NEXT: v_writelane_b32 v3, s7, 1
6946 ; SI-NEXT: s_mov_b32 s34, s6
6947 ; SI-NEXT: s_mov_b32 s7, 0xf000
6948 ; SI-NEXT: s_mov_b32 s6, -1
6949 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
6950 ; SI-NEXT: s_mov_b64 s[36:37], 0
6951 ; SI-NEXT: .LBB117_1: ; %atomicrmw.start
6952 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6953 ; SI-NEXT: s_waitcnt vmcnt(0)
6954 ; SI-NEXT: v_mov_b32_e32 v2, v0
6955 ; SI-NEXT: s_waitcnt expcnt(0)
6956 ; SI-NEXT: v_min_u32_e32 v1, s34, v2
6957 ; SI-NEXT: v_mov_b32_e32 v0, v1
6958 ; SI-NEXT: v_mov_b32_e32 v1, v2
6959 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
6960 ; SI-NEXT: s_waitcnt vmcnt(0)
6961 ; SI-NEXT: buffer_wbinvl1
6962 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
6963 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6964 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6965 ; SI-NEXT: s_cbranch_execnz .LBB117_1
6966 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6967 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6968 ; SI-NEXT: v_readlane_b32 s7, v3, 1
6969 ; SI-NEXT: v_readlane_b32 s6, v3, 0
6970 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6971 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
6972 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6973 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6974 ; SI-NEXT: s_setpc_b64 s[30:31]
6976 ; VI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6978 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6979 ; VI-NEXT: s_add_u32 s34, s4, 16
6980 ; VI-NEXT: s_addc_u32 s35, s5, 0
6981 ; VI-NEXT: v_mov_b32_e32 v1, s34
6982 ; VI-NEXT: v_mov_b32_e32 v2, s35
6983 ; VI-NEXT: flat_load_dword v0, v[1:2]
6984 ; VI-NEXT: s_mov_b64 s[34:35], 0
6985 ; VI-NEXT: .LBB117_1: ; %atomicrmw.start
6986 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6987 ; VI-NEXT: s_waitcnt vmcnt(0)
6988 ; VI-NEXT: v_mov_b32_e32 v4, v0
6989 ; VI-NEXT: v_min_u32_e32 v3, s6, v4
6990 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6991 ; VI-NEXT: s_waitcnt vmcnt(0)
6992 ; VI-NEXT: buffer_wbinvl1_vol
6993 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6994 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6995 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6996 ; VI-NEXT: s_cbranch_execnz .LBB117_1
6997 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6998 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6999 ; VI-NEXT: s_setpc_b64 s[30:31]
7001 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar:
7003 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7004 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7005 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
7006 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7007 ; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start
7008 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7009 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7010 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7011 ; GFX9-NEXT: v_min_u32_e32 v2, s6, v3
7012 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
7013 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7014 ; GFX9-NEXT: buffer_wbinvl1_vol
7015 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7016 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7017 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7018 ; GFX9-NEXT: s_cbranch_execnz .LBB117_1
7019 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7020 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7021 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7022 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7023 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) with the result unused, on a pointer taken
; at i32 element index 4 of %out (the index is typed i64 here), with the
; !amdgpu.no.remote.memory metadata attached to the atomicrmw. With this
; metadata present, all three sets of checks expect a load + v_min_u32 +
; cmpswap retry loop. The SI and GFX9 checks keep the displacement as
; offset:16 on the memory instructions; the VI checks add 16 to the address
; up front with v_add_u32/v_addc_u32.
7027 define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
7028 ; SI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7030 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7031 ; SI-NEXT: s_mov_b32 s6, 0
7032 ; SI-NEXT: s_mov_b32 s7, 0xf000
7033 ; SI-NEXT: s_mov_b32 s4, s6
7034 ; SI-NEXT: s_mov_b32 s5, s6
7035 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
7036 ; SI-NEXT: s_mov_b64 s[8:9], 0
7037 ; SI-NEXT: .LBB118_1: ; %atomicrmw.start
7038 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7039 ; SI-NEXT: s_waitcnt vmcnt(0)
7040 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
7041 ; SI-NEXT: s_waitcnt expcnt(0)
7042 ; SI-NEXT: v_mov_b32_e32 v6, v4
7043 ; SI-NEXT: v_mov_b32_e32 v5, v3
7044 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
7045 ; SI-NEXT: s_waitcnt vmcnt(0)
7046 ; SI-NEXT: buffer_wbinvl1
7047 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
7048 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7049 ; SI-NEXT: v_mov_b32_e32 v4, v5
7050 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7051 ; SI-NEXT: s_cbranch_execnz .LBB118_1
7052 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7053 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7054 ; SI-NEXT: s_waitcnt expcnt(0)
7055 ; SI-NEXT: s_setpc_b64 s[30:31]
7057 ; VI-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7059 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7060 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7061 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7062 ; VI-NEXT: flat_load_dword v4, v[0:1]
7063 ; VI-NEXT: s_mov_b64 s[4:5], 0
7064 ; VI-NEXT: .LBB118_1: ; %atomicrmw.start
7065 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7066 ; VI-NEXT: s_waitcnt vmcnt(0)
7067 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
7068 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7069 ; VI-NEXT: s_waitcnt vmcnt(0)
7070 ; VI-NEXT: buffer_wbinvl1_vol
7071 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7072 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7073 ; VI-NEXT: v_mov_b32_e32 v4, v3
7074 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7075 ; VI-NEXT: s_cbranch_execnz .LBB118_1
7076 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7077 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7078 ; VI-NEXT: s_setpc_b64 s[30:31]
7080 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
7082 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7083 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
7084 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7085 ; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start
7086 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7087 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7088 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
7089 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7090 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7091 ; GFX9-NEXT: buffer_wbinvl1_vol
7092 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7093 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7094 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7095 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7096 ; GFX9-NEXT: s_cbranch_execnz .LBB118_1
7097 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7098 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7099 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7100 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
7101 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Unsigned-min atomicrmw (seq_cst) on a pointer taken at i32 element index 4
; of %out (the index is typed i64 here), in a function returning i32, with
; the !amdgpu.no.remote.memory metadata attached to the atomicrmw. With this
; metadata present, all three sets of checks expect a load + v_min_u32 +
; cmpswap retry loop. The SI and GFX9 checks keep the displacement as
; offset:16 and copy v3 into v0 after the loop; the VI checks compute the
; address into v[3:4] and have the cmpswap write its result straight into
; v0, so no trailing copy is expected there.
7105 define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
7106 ; SI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7108 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7109 ; SI-NEXT: s_mov_b32 s6, 0
7110 ; SI-NEXT: s_mov_b32 s7, 0xf000
7111 ; SI-NEXT: s_mov_b32 s4, s6
7112 ; SI-NEXT: s_mov_b32 s5, s6
7113 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
7114 ; SI-NEXT: s_mov_b64 s[8:9], 0
7115 ; SI-NEXT: .LBB119_1: ; %atomicrmw.start
7116 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7117 ; SI-NEXT: s_waitcnt vmcnt(0)
7118 ; SI-NEXT: v_mov_b32_e32 v5, v3
7119 ; SI-NEXT: s_waitcnt expcnt(0)
7120 ; SI-NEXT: v_min_u32_e32 v4, v5, v2
7121 ; SI-NEXT: v_mov_b32_e32 v3, v4
7122 ; SI-NEXT: v_mov_b32_e32 v4, v5
7123 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
7124 ; SI-NEXT: s_waitcnt vmcnt(0)
7125 ; SI-NEXT: buffer_wbinvl1
7126 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
7127 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7128 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7129 ; SI-NEXT: s_cbranch_execnz .LBB119_1
7130 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7131 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7132 ; SI-NEXT: v_mov_b32_e32 v0, v3
7133 ; SI-NEXT: s_waitcnt expcnt(0)
7134 ; SI-NEXT: s_setpc_b64 s[30:31]
7136 ; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7138 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7139 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
7140 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
7141 ; VI-NEXT: flat_load_dword v0, v[3:4]
7142 ; VI-NEXT: s_mov_b64 s[4:5], 0
7143 ; VI-NEXT: .LBB119_1: ; %atomicrmw.start
7144 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7145 ; VI-NEXT: s_waitcnt vmcnt(0)
7146 ; VI-NEXT: v_mov_b32_e32 v1, v0
7147 ; VI-NEXT: v_min_u32_e32 v0, v1, v2
7148 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7149 ; VI-NEXT: s_waitcnt vmcnt(0)
7150 ; VI-NEXT: buffer_wbinvl1_vol
7151 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7152 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7153 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7154 ; VI-NEXT: s_cbranch_execnz .LBB119_1
7155 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7156 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7157 ; VI-NEXT: s_setpc_b64 s[30:31]
7159 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
7161 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7162 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
7163 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7164 ; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start
7165 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7166 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7167 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7168 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
7169 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7171 ; GFX9-NEXT: buffer_wbinvl1_vol
7172 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7173 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7174 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7175 ; GFX9-NEXT: s_cbranch_execnz .LBB119_1
7176 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7177 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7178 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
7179 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7180 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
7181 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7185 ; ---------------------------------------------------------------------
7187 ; ---------------------------------------------------------------------
; Signed-min atomicrmw (seq_cst) directly on the global pointer %ptr, with
; the result unused. The checks for all three llc configurations expect an
; initial load followed by a retry loop that computes the signed minimum with
; v_min_i32 and commits it with a cmpswap, looping until the value returned
; by the cmpswap equals the value the minimum was computed from.
7189 define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
7190 ; SI-LABEL: global_atomic_min_i32_noret:
7192 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7193 ; SI-NEXT: s_mov_b32 s6, 0
7194 ; SI-NEXT: s_mov_b32 s7, 0xf000
7195 ; SI-NEXT: s_mov_b32 s4, s6
7196 ; SI-NEXT: s_mov_b32 s5, s6
7197 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
7198 ; SI-NEXT: s_mov_b64 s[8:9], 0
7199 ; SI-NEXT: .LBB120_1: ; %atomicrmw.start
7200 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7201 ; SI-NEXT: s_waitcnt vmcnt(0)
7202 ; SI-NEXT: v_min_i32_e32 v3, v4, v2
7203 ; SI-NEXT: s_waitcnt expcnt(0)
7204 ; SI-NEXT: v_mov_b32_e32 v6, v4
7205 ; SI-NEXT: v_mov_b32_e32 v5, v3
7206 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
7207 ; SI-NEXT: s_waitcnt vmcnt(0)
7208 ; SI-NEXT: buffer_wbinvl1
7209 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
7210 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7211 ; SI-NEXT: v_mov_b32_e32 v4, v5
7212 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7213 ; SI-NEXT: s_cbranch_execnz .LBB120_1
7214 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7215 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7216 ; SI-NEXT: s_waitcnt expcnt(0)
7217 ; SI-NEXT: s_setpc_b64 s[30:31]
7219 ; VI-LABEL: global_atomic_min_i32_noret:
7221 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7222 ; VI-NEXT: flat_load_dword v4, v[0:1]
7223 ; VI-NEXT: s_mov_b64 s[4:5], 0
7224 ; VI-NEXT: .LBB120_1: ; %atomicrmw.start
7225 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7226 ; VI-NEXT: s_waitcnt vmcnt(0)
7227 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
7228 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7229 ; VI-NEXT: s_waitcnt vmcnt(0)
7230 ; VI-NEXT: buffer_wbinvl1_vol
7231 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7232 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7233 ; VI-NEXT: v_mov_b32_e32 v4, v3
7234 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7235 ; VI-NEXT: s_cbranch_execnz .LBB120_1
7236 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7237 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7238 ; VI-NEXT: s_setpc_b64 s[30:31]
7240 ; GFX9-LABEL: global_atomic_min_i32_noret:
7242 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7243 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
7244 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7245 ; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start
7246 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7247 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7248 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
7249 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
7250 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7251 ; GFX9-NEXT: buffer_wbinvl1_vol
7252 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7253 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7254 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7255 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7256 ; GFX9-NEXT: s_cbranch_execnz .LBB120_1
7257 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7258 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7259 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7260 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) with an unused result on a global
; (addrspace 1) pointer advanced by 4 x i32. Every check prefix expects an
; initial load followed by a v_min_i32 + cmpswap retry loop. SI and GFX9 carry
; the 16-byte displacement as offset:16; VI adds 16 to v[0:1] before the load.
7264 define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
7265 ; SI-LABEL: global_atomic_min_i32_noret_offset:
7267 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7268 ; SI-NEXT: s_mov_b32 s6, 0
7269 ; SI-NEXT: s_mov_b32 s7, 0xf000
7270 ; SI-NEXT: s_mov_b32 s4, s6
7271 ; SI-NEXT: s_mov_b32 s5, s6
7272 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
7273 ; SI-NEXT: s_mov_b64 s[8:9], 0
7274 ; SI-NEXT: .LBB121_1: ; %atomicrmw.start
7275 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7276 ; SI-NEXT: s_waitcnt vmcnt(0)
7277 ; SI-NEXT: v_min_i32_e32 v3, v4, v2
7278 ; SI-NEXT: s_waitcnt expcnt(0)
7279 ; SI-NEXT: v_mov_b32_e32 v6, v4
7280 ; SI-NEXT: v_mov_b32_e32 v5, v3
7281 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
7282 ; SI-NEXT: s_waitcnt vmcnt(0)
7283 ; SI-NEXT: buffer_wbinvl1
7284 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
7285 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7286 ; SI-NEXT: v_mov_b32_e32 v4, v5
7287 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7288 ; SI-NEXT: s_cbranch_execnz .LBB121_1
7289 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7290 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7291 ; SI-NEXT: s_waitcnt expcnt(0)
7292 ; SI-NEXT: s_setpc_b64 s[30:31]
7294 ; VI-LABEL: global_atomic_min_i32_noret_offset:
7296 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7297 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7298 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7299 ; VI-NEXT: flat_load_dword v4, v[0:1]
7300 ; VI-NEXT: s_mov_b64 s[4:5], 0
7301 ; VI-NEXT: .LBB121_1: ; %atomicrmw.start
7302 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7303 ; VI-NEXT: s_waitcnt vmcnt(0)
7304 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
7305 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7306 ; VI-NEXT: s_waitcnt vmcnt(0)
7307 ; VI-NEXT: buffer_wbinvl1_vol
7308 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7309 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7310 ; VI-NEXT: v_mov_b32_e32 v4, v3
7311 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7312 ; VI-NEXT: s_cbranch_execnz .LBB121_1
7313 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7314 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7315 ; VI-NEXT: s_setpc_b64 s[30:31]
7317 ; GFX9-LABEL: global_atomic_min_i32_noret_offset:
7319 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7320 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
7321 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7322 ; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start
7323 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7325 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
7326 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7328 ; GFX9-NEXT: buffer_wbinvl1_vol
7329 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7330 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7331 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7332 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7333 ; GFX9-NEXT: s_cbranch_execnz .LBB121_1
7334 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7335 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7336 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %out + 4 x i32 is the 16-byte displacement that appears in the checks.
7337 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7338 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) directly on %ptr, in a function that returns
; i32. Every check prefix expects a load followed by a v_min_i32 + cmpswap
; retry loop; after the loop the cmpswap result (v3) is copied into v0.
7342 define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
7343 ; SI-LABEL: global_atomic_min_i32_ret:
7345 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7346 ; SI-NEXT: s_mov_b32 s6, 0
7347 ; SI-NEXT: s_mov_b32 s7, 0xf000
7348 ; SI-NEXT: s_mov_b32 s4, s6
7349 ; SI-NEXT: s_mov_b32 s5, s6
7350 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
7351 ; SI-NEXT: s_mov_b64 s[8:9], 0
7352 ; SI-NEXT: .LBB122_1: ; %atomicrmw.start
7353 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7354 ; SI-NEXT: s_waitcnt vmcnt(0)
7355 ; SI-NEXT: v_mov_b32_e32 v5, v3
7356 ; SI-NEXT: s_waitcnt expcnt(0)
7357 ; SI-NEXT: v_min_i32_e32 v4, v5, v2
7358 ; SI-NEXT: v_mov_b32_e32 v3, v4
7359 ; SI-NEXT: v_mov_b32_e32 v4, v5
7360 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
7361 ; SI-NEXT: s_waitcnt vmcnt(0)
7362 ; SI-NEXT: buffer_wbinvl1
7363 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
7364 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7365 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7366 ; SI-NEXT: s_cbranch_execnz .LBB122_1
7367 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7368 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7369 ; SI-NEXT: v_mov_b32_e32 v0, v3
7370 ; SI-NEXT: s_waitcnt expcnt(0)
7371 ; SI-NEXT: s_setpc_b64 s[30:31]
7373 ; VI-LABEL: global_atomic_min_i32_ret:
7375 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7376 ; VI-NEXT: flat_load_dword v3, v[0:1]
7377 ; VI-NEXT: s_mov_b64 s[4:5], 0
7378 ; VI-NEXT: .LBB122_1: ; %atomicrmw.start
7379 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7380 ; VI-NEXT: s_waitcnt vmcnt(0)
7381 ; VI-NEXT: v_mov_b32_e32 v4, v3
7382 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
7383 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7384 ; VI-NEXT: s_waitcnt vmcnt(0)
7385 ; VI-NEXT: buffer_wbinvl1_vol
7386 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7387 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7388 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7389 ; VI-NEXT: s_cbranch_execnz .LBB122_1
7390 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7391 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7392 ; VI-NEXT: v_mov_b32_e32 v0, v3
7393 ; VI-NEXT: s_setpc_b64 s[30:31]
7395 ; GFX9-LABEL: global_atomic_min_i32_ret:
7397 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7398 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
7399 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7400 ; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start
7401 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7402 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7403 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7404 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
7405 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
7406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7407 ; GFX9-NEXT: buffer_wbinvl1_vol
7408 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7409 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7410 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7411 ; GFX9-NEXT: s_cbranch_execnz .LBB122_1
7412 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7413 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7414 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
7415 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7416 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) on %out advanced by 4 x i32, in a function
; that returns i32. SI and GFX9 carry the displacement as offset:16 and copy
; the loop result (v3) into v0. VI forms the address in v[3:4] and runs the
; loop directly on v0, so its checks contain no trailing copy.
7420 define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
7421 ; SI-LABEL: global_atomic_min_i32_ret_offset:
7423 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424 ; SI-NEXT: s_mov_b32 s6, 0
7425 ; SI-NEXT: s_mov_b32 s7, 0xf000
7426 ; SI-NEXT: s_mov_b32 s4, s6
7427 ; SI-NEXT: s_mov_b32 s5, s6
7428 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
7429 ; SI-NEXT: s_mov_b64 s[8:9], 0
7430 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start
7431 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7432 ; SI-NEXT: s_waitcnt vmcnt(0)
7433 ; SI-NEXT: v_mov_b32_e32 v5, v3
7434 ; SI-NEXT: s_waitcnt expcnt(0)
7435 ; SI-NEXT: v_min_i32_e32 v4, v5, v2
7436 ; SI-NEXT: v_mov_b32_e32 v3, v4
7437 ; SI-NEXT: v_mov_b32_e32 v4, v5
7438 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
7439 ; SI-NEXT: s_waitcnt vmcnt(0)
7440 ; SI-NEXT: buffer_wbinvl1
7441 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
7442 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7443 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7444 ; SI-NEXT: s_cbranch_execnz .LBB123_1
7445 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7446 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7447 ; SI-NEXT: v_mov_b32_e32 v0, v3
7448 ; SI-NEXT: s_waitcnt expcnt(0)
7449 ; SI-NEXT: s_setpc_b64 s[30:31]
7451 ; VI-LABEL: global_atomic_min_i32_ret_offset:
7453 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7454 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
7455 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
7456 ; VI-NEXT: flat_load_dword v0, v[3:4]
7457 ; VI-NEXT: s_mov_b64 s[4:5], 0
7458 ; VI-NEXT: .LBB123_1: ; %atomicrmw.start
7459 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7460 ; VI-NEXT: s_waitcnt vmcnt(0)
7461 ; VI-NEXT: v_mov_b32_e32 v1, v0
7462 ; VI-NEXT: v_min_i32_e32 v0, v1, v2
7463 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7464 ; VI-NEXT: s_waitcnt vmcnt(0)
7465 ; VI-NEXT: buffer_wbinvl1_vol
7466 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7467 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7468 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7469 ; VI-NEXT: s_cbranch_execnz .LBB123_1
7470 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7471 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7472 ; VI-NEXT: s_setpc_b64 s[30:31]
7474 ; GFX9-LABEL: global_atomic_min_i32_ret_offset:
7476 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7477 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
7478 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7479 ; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start
7480 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7481 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7482 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7483 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
7484 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7485 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7486 ; GFX9-NEXT: buffer_wbinvl1_vol
7487 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7488 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7489 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7490 ; GFX9-NEXT: s_cbranch_execnz .LBB123_1
7491 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7492 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7493 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
7494 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %out + 4 x i32 is the 16-byte displacement that appears in the checks.
7495 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7496 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) with an unused result, using the amdgpu_gfx
; calling convention with both arguments marked inreg. The checks read the
; pointer from s[4:5] and the operand from s6. SI overwrites s6/s7 to complete
; the s[4:7] operand of its buffer instructions, so it first saves them into
; lanes of v4 (itself spilled to the stack) and restores them before returning.
7500 define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7501 ; SI-LABEL: global_atomic_min_i32_noret_scalar:
7503 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7504 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7505 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
7506 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7507 ; SI-NEXT: s_waitcnt expcnt(0)
7508 ; SI-NEXT: v_writelane_b32 v4, s6, 0
7509 ; SI-NEXT: v_writelane_b32 v4, s7, 1
7510 ; SI-NEXT: s_mov_b32 s34, s6
7511 ; SI-NEXT: s_mov_b32 s7, 0xf000
7512 ; SI-NEXT: s_mov_b32 s6, -1
7513 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
7514 ; SI-NEXT: s_mov_b64 s[36:37], 0
7515 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start
7516 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7517 ; SI-NEXT: s_waitcnt vmcnt(0)
7518 ; SI-NEXT: v_min_i32_e32 v0, s34, v1
7519 ; SI-NEXT: s_waitcnt expcnt(0)
7520 ; SI-NEXT: v_mov_b32_e32 v3, v1
7521 ; SI-NEXT: v_mov_b32_e32 v2, v0
7522 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
7523 ; SI-NEXT: s_waitcnt vmcnt(0)
7524 ; SI-NEXT: buffer_wbinvl1
7525 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7526 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7527 ; SI-NEXT: v_mov_b32_e32 v1, v2
7528 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7529 ; SI-NEXT: s_cbranch_execnz .LBB124_1
7530 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7531 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7532 ; SI-NEXT: v_readlane_b32 s7, v4, 1
7533 ; SI-NEXT: v_readlane_b32 s6, v4, 0
7534 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7535 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
7536 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7537 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7538 ; SI-NEXT: s_setpc_b64 s[30:31]
7540 ; VI-LABEL: global_atomic_min_i32_noret_scalar:
7542 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7543 ; VI-NEXT: v_mov_b32_e32 v0, s4
7544 ; VI-NEXT: v_mov_b32_e32 v1, s5
7545 ; VI-NEXT: flat_load_dword v3, v[0:1]
7546 ; VI-NEXT: s_mov_b64 s[34:35], 0
7547 ; VI-NEXT: .LBB124_1: ; %atomicrmw.start
7548 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7549 ; VI-NEXT: s_waitcnt vmcnt(0)
7550 ; VI-NEXT: v_min_i32_e32 v2, s6, v3
7551 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7552 ; VI-NEXT: s_waitcnt vmcnt(0)
7553 ; VI-NEXT: buffer_wbinvl1_vol
7554 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7555 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7556 ; VI-NEXT: v_mov_b32_e32 v3, v2
7557 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7558 ; VI-NEXT: s_cbranch_execnz .LBB124_1
7559 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7560 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7561 ; VI-NEXT: s_setpc_b64 s[30:31]
7563 ; GFX9-LABEL: global_atomic_min_i32_noret_scalar:
7565 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7566 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7567 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
7568 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7569 ; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start
7570 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7571 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7572 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
7573 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
7574 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7575 ; GFX9-NEXT: buffer_wbinvl1_vol
7576 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7577 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7578 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7579 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7580 ; GFX9-NEXT: s_cbranch_execnz .LBB124_1
7581 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7582 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7583 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7584 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) with an unused result on an inreg pointer
; advanced by 4 x i32, using the amdgpu_gfx calling convention. SI and GFX9
; carry the displacement as offset:16; VI forms the address with
; s_add_u32/s_addc_u32 before copying it into v[0:1]. SI saves and restores
; s6/s7 through lanes of v4 because it overwrites them for the s[4:7] operand.
7588 define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7589 ; SI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7591 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7592 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7593 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
7594 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7595 ; SI-NEXT: s_waitcnt expcnt(0)
7596 ; SI-NEXT: v_writelane_b32 v4, s6, 0
7597 ; SI-NEXT: v_writelane_b32 v4, s7, 1
7598 ; SI-NEXT: s_mov_b32 s34, s6
7599 ; SI-NEXT: s_mov_b32 s7, 0xf000
7600 ; SI-NEXT: s_mov_b32 s6, -1
7601 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
7602 ; SI-NEXT: s_mov_b64 s[36:37], 0
7603 ; SI-NEXT: .LBB125_1: ; %atomicrmw.start
7604 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7605 ; SI-NEXT: s_waitcnt vmcnt(0)
7606 ; SI-NEXT: v_min_i32_e32 v0, s34, v1
7607 ; SI-NEXT: s_waitcnt expcnt(0)
7608 ; SI-NEXT: v_mov_b32_e32 v3, v1
7609 ; SI-NEXT: v_mov_b32_e32 v2, v0
7610 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7611 ; SI-NEXT: s_waitcnt vmcnt(0)
7612 ; SI-NEXT: buffer_wbinvl1
7613 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7614 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7615 ; SI-NEXT: v_mov_b32_e32 v1, v2
7616 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7617 ; SI-NEXT: s_cbranch_execnz .LBB125_1
7618 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7619 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7620 ; SI-NEXT: v_readlane_b32 s7, v4, 1
7621 ; SI-NEXT: v_readlane_b32 s6, v4, 0
7622 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7623 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
7624 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7625 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7626 ; SI-NEXT: s_setpc_b64 s[30:31]
7628 ; VI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7630 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7631 ; VI-NEXT: s_add_u32 s34, s4, 16
7632 ; VI-NEXT: s_addc_u32 s35, s5, 0
7633 ; VI-NEXT: v_mov_b32_e32 v0, s34
7634 ; VI-NEXT: v_mov_b32_e32 v1, s35
7635 ; VI-NEXT: flat_load_dword v3, v[0:1]
7636 ; VI-NEXT: s_mov_b64 s[34:35], 0
7637 ; VI-NEXT: .LBB125_1: ; %atomicrmw.start
7638 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7639 ; VI-NEXT: s_waitcnt vmcnt(0)
7640 ; VI-NEXT: v_min_i32_e32 v2, s6, v3
7641 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7642 ; VI-NEXT: s_waitcnt vmcnt(0)
7643 ; VI-NEXT: buffer_wbinvl1_vol
7644 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7645 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7646 ; VI-NEXT: v_mov_b32_e32 v3, v2
7647 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7648 ; VI-NEXT: s_cbranch_execnz .LBB125_1
7649 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7650 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7651 ; VI-NEXT: s_setpc_b64 s[30:31]
7653 ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar:
7655 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7656 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7657 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
7658 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7659 ; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start
7660 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7661 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7662 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
7663 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
7664 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7665 ; GFX9-NEXT: buffer_wbinvl1_vol
7666 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7667 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7668 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7669 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7670 ; GFX9-NEXT: s_cbranch_execnz .LBB125_1
7671 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7672 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7673 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %out + 4 x i32 is the 16-byte displacement that appears in the checks.
7674 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7675 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) on an inreg pointer, in an amdgpu_gfx
; function that returns i32. In all three sets of checks the cmpswap result
; ends up in v0 with no extra copy after the loop. SI saves and restores s6/s7
; through lanes of v3 because it overwrites them for the s[4:7] buffer operand.
7679 define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7680 ; SI-LABEL: global_atomic_min_i32_ret_scalar:
7682 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7683 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7684 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
7685 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7686 ; SI-NEXT: s_waitcnt expcnt(0)
7687 ; SI-NEXT: v_writelane_b32 v3, s6, 0
7688 ; SI-NEXT: v_writelane_b32 v3, s7, 1
7689 ; SI-NEXT: s_mov_b32 s34, s6
7690 ; SI-NEXT: s_mov_b32 s7, 0xf000
7691 ; SI-NEXT: s_mov_b32 s6, -1
7692 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
7693 ; SI-NEXT: s_mov_b64 s[36:37], 0
7694 ; SI-NEXT: .LBB126_1: ; %atomicrmw.start
7695 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7696 ; SI-NEXT: s_waitcnt vmcnt(0)
7697 ; SI-NEXT: v_mov_b32_e32 v2, v0
7698 ; SI-NEXT: s_waitcnt expcnt(0)
7699 ; SI-NEXT: v_min_i32_e32 v1, s34, v2
7700 ; SI-NEXT: v_mov_b32_e32 v0, v1
7701 ; SI-NEXT: v_mov_b32_e32 v1, v2
7702 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
7703 ; SI-NEXT: s_waitcnt vmcnt(0)
7704 ; SI-NEXT: buffer_wbinvl1
7705 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
7706 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7707 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7708 ; SI-NEXT: s_cbranch_execnz .LBB126_1
7709 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7710 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7711 ; SI-NEXT: v_readlane_b32 s7, v3, 1
7712 ; SI-NEXT: v_readlane_b32 s6, v3, 0
7713 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7714 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
7715 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7716 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7717 ; SI-NEXT: s_setpc_b64 s[30:31]
7719 ; VI-LABEL: global_atomic_min_i32_ret_scalar:
7721 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7722 ; VI-NEXT: v_mov_b32_e32 v0, s4
7723 ; VI-NEXT: v_mov_b32_e32 v1, s5
7724 ; VI-NEXT: flat_load_dword v0, v[0:1]
7725 ; VI-NEXT: v_mov_b32_e32 v1, s4
7726 ; VI-NEXT: s_mov_b64 s[34:35], 0
7727 ; VI-NEXT: v_mov_b32_e32 v2, s5
7728 ; VI-NEXT: .LBB126_1: ; %atomicrmw.start
7729 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7730 ; VI-NEXT: s_waitcnt vmcnt(0)
7731 ; VI-NEXT: v_mov_b32_e32 v4, v0
7732 ; VI-NEXT: v_min_i32_e32 v3, s6, v4
7733 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
7734 ; VI-NEXT: s_waitcnt vmcnt(0)
7735 ; VI-NEXT: buffer_wbinvl1_vol
7736 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
7737 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7738 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7739 ; VI-NEXT: s_cbranch_execnz .LBB126_1
7740 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7741 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7742 ; VI-NEXT: s_setpc_b64 s[30:31]
7744 ; GFX9-LABEL: global_atomic_min_i32_ret_scalar:
7746 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7747 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7748 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
7749 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7750 ; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
7751 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7752 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7753 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7754 ; GFX9-NEXT: v_min_i32_e32 v2, s6, v3
7755 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
7756 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7757 ; GFX9-NEXT: buffer_wbinvl1_vol
7758 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7759 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7760 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7761 ; GFX9-NEXT: s_cbranch_execnz .LBB126_1
7762 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7763 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7764 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7765 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) on an inreg pointer advanced by 4 x i32, in
; an amdgpu_gfx function that returns i32. SI and GFX9 carry the displacement
; as offset:16; VI forms the address with s_add_u32/s_addc_u32. The cmpswap
; result ends up in v0 in all three sets of checks. SI saves and restores
; s6/s7 through lanes of v3 because it overwrites them for the s[4:7] operand.
7769 define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7770 ; SI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7772 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7773 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7774 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
7775 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7776 ; SI-NEXT: s_waitcnt expcnt(0)
7777 ; SI-NEXT: v_writelane_b32 v3, s6, 0
7778 ; SI-NEXT: v_writelane_b32 v3, s7, 1
7779 ; SI-NEXT: s_mov_b32 s34, s6
7780 ; SI-NEXT: s_mov_b32 s7, 0xf000
7781 ; SI-NEXT: s_mov_b32 s6, -1
7782 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
7783 ; SI-NEXT: s_mov_b64 s[36:37], 0
7784 ; SI-NEXT: .LBB127_1: ; %atomicrmw.start
7785 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7786 ; SI-NEXT: s_waitcnt vmcnt(0)
7787 ; SI-NEXT: v_mov_b32_e32 v2, v0
7788 ; SI-NEXT: s_waitcnt expcnt(0)
7789 ; SI-NEXT: v_min_i32_e32 v1, s34, v2
7790 ; SI-NEXT: v_mov_b32_e32 v0, v1
7791 ; SI-NEXT: v_mov_b32_e32 v1, v2
7792 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
7793 ; SI-NEXT: s_waitcnt vmcnt(0)
7794 ; SI-NEXT: buffer_wbinvl1
7795 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
7796 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7797 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7798 ; SI-NEXT: s_cbranch_execnz .LBB127_1
7799 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7800 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7801 ; SI-NEXT: v_readlane_b32 s7, v3, 1
7802 ; SI-NEXT: v_readlane_b32 s6, v3, 0
7803 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7804 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
7805 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7806 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7807 ; SI-NEXT: s_setpc_b64 s[30:31]
7809 ; VI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7811 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7812 ; VI-NEXT: s_add_u32 s34, s4, 16
7813 ; VI-NEXT: s_addc_u32 s35, s5, 0
7814 ; VI-NEXT: v_mov_b32_e32 v1, s34
7815 ; VI-NEXT: v_mov_b32_e32 v2, s35
7816 ; VI-NEXT: flat_load_dword v0, v[1:2]
7817 ; VI-NEXT: s_mov_b64 s[34:35], 0
7818 ; VI-NEXT: .LBB127_1: ; %atomicrmw.start
7819 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7820 ; VI-NEXT: s_waitcnt vmcnt(0)
7821 ; VI-NEXT: v_mov_b32_e32 v4, v0
7822 ; VI-NEXT: v_min_i32_e32 v3, s6, v4
7823 ; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
7824 ; VI-NEXT: s_waitcnt vmcnt(0)
7825 ; VI-NEXT: buffer_wbinvl1_vol
7826 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
7827 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7828 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7829 ; VI-NEXT: s_cbranch_execnz .LBB127_1
7830 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7831 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7832 ; VI-NEXT: s_setpc_b64 s[30:31]
7834 ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar:
7836 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7837 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7838 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
7839 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7840 ; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
7841 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7842 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7843 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7844 ; GFX9-NEXT: v_min_i32_e32 v2, s6, v3
7845 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
7846 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7847 ; GFX9-NEXT: buffer_wbinvl1_vol
7848 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7849 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7850 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7851 ; GFX9-NEXT: s_cbranch_execnz .LBB127_1
7852 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7853 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7854 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %out + 4 x i32 is the 16-byte displacement that appears in the checks.
7855 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7856 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Kernel variant: signed-min atomicrmw (seq_cst) with an unused result on
; %out[%index + 4]. The checks load the kernel arguments with s_load_dwordx4,
; sign-extend the i32 index (s_ashr_i32 ..., 31), scale it by 4 (s_lshl_b64
; ..., 2) and add it to the base pointer. The initial value is fetched with
; s_load_dword and then fed to the v_min_i32 + cmpswap retry loop.
7860 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
7861 ; SI-LABEL: atomic_min_i32_addr64_offset:
7862 ; SI: ; %bb.0: ; %entry
7863 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
7864 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7865 ; SI-NEXT: s_ashr_i32 s5, s3, 31
7866 ; SI-NEXT: s_mov_b32 s4, s3
7867 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7868 ; SI-NEXT: s_add_u32 s4, s0, s4
7869 ; SI-NEXT: s_addc_u32 s5, s1, s5
7870 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
7871 ; SI-NEXT: s_mov_b64 s[0:1], 0
7872 ; SI-NEXT: s_mov_b32 s7, 0xf000
7873 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7874 ; SI-NEXT: v_mov_b32_e32 v1, s3
7875 ; SI-NEXT: s_mov_b32 s6, -1
7876 ; SI-NEXT: .LBB128_1: ; %atomicrmw.start
7877 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7878 ; SI-NEXT: v_min_i32_e32 v0, s2, v1
7879 ; SI-NEXT: s_waitcnt expcnt(0)
7880 ; SI-NEXT: v_mov_b32_e32 v3, v1
7881 ; SI-NEXT: v_mov_b32_e32 v2, v0
7882 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7883 ; SI-NEXT: s_waitcnt vmcnt(0)
7884 ; SI-NEXT: buffer_wbinvl1
7885 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7886 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7887 ; SI-NEXT: v_mov_b32_e32 v1, v2
7888 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7889 ; SI-NEXT: s_cbranch_execnz .LBB128_1
7890 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7893 ; VI-LABEL: atomic_min_i32_addr64_offset:
7894 ; VI: ; %bb.0: ; %entry
7895 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
7896 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7897 ; VI-NEXT: s_ashr_i32 s5, s3, 31
7898 ; VI-NEXT: s_mov_b32 s4, s3
7899 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7900 ; VI-NEXT: s_add_u32 s4, s0, s4
7901 ; VI-NEXT: s_addc_u32 s5, s1, s5
7902 ; VI-NEXT: s_load_dword s3, s[4:5], 0x10
7903 ; VI-NEXT: s_add_u32 s4, s4, 16
7904 ; VI-NEXT: s_addc_u32 s5, s5, 0
7905 ; VI-NEXT: v_mov_b32_e32 v0, s4
7906 ; VI-NEXT: s_mov_b64 s[0:1], 0
7907 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7908 ; VI-NEXT: v_mov_b32_e32 v3, s3
7909 ; VI-NEXT: v_mov_b32_e32 v1, s5
7910 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start
7911 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7912 ; VI-NEXT: v_min_i32_e32 v2, s2, v3
7913 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7914 ; VI-NEXT: s_waitcnt vmcnt(0)
7915 ; VI-NEXT: buffer_wbinvl1_vol
7916 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7917 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7918 ; VI-NEXT: v_mov_b32_e32 v3, v2
7919 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7920 ; VI-NEXT: s_cbranch_execnz .LBB128_1
7921 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7924 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
7925 ; GFX9: ; %bb.0: ; %entry
7926 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
7927 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
7928 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7929 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7930 ; GFX9-NEXT: s_ashr_i32 s1, s7, 31
7931 ; GFX9-NEXT: s_mov_b32 s0, s7
7932 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
7933 ; GFX9-NEXT: s_add_u32 s0, s4, s0
7934 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
7935 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10
7936 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7937 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
7938 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
7939 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7940 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
7941 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7942 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7943 ; GFX9-NEXT: buffer_wbinvl1_vol
7944 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7945 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
7946 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7947 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
7948 ; GFX9-NEXT: s_cbranch_execnz .LBB128_1
7949 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7950 ; GFX9-NEXT: s_endpgm
; Two-step address: a variable element index, then a constant 4 x i32
; (16-byte) displacement.
7952 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
7953 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
7954 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Kernel variant that uses the atomic's result: signed-min atomicrmw (seq_cst)
; on %out[%index + 4], with the returned value then stored to %out2. The
; checks compute the address as in the no-return kernel (sign-extended index
; scaled by 4, plus a 16-byte displacement), run the v_min_i32 + cmpswap retry
; loop, and after the loop emit a single dword store of the loop result.
7958 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
7959 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
7960 ; SI: ; %bb.0: ; %entry
7961 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
7962 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
7963 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7964 ; SI-NEXT: s_ashr_i32 s5, s9, 31
7965 ; SI-NEXT: s_mov_b32 s4, s9
7966 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7967 ; SI-NEXT: s_add_u32 s4, s0, s4
7968 ; SI-NEXT: s_addc_u32 s5, s1, s5
7969 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
7970 ; SI-NEXT: s_mov_b64 s[0:1], 0
7971 ; SI-NEXT: s_mov_b32 s7, 0xf000
7972 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7973 ; SI-NEXT: v_mov_b32_e32 v1, s6
7974 ; SI-NEXT: s_mov_b32 s6, -1
7975 ; SI-NEXT: .LBB129_1: ; %atomicrmw.start
7976 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7977 ; SI-NEXT: v_min_i32_e32 v0, s8, v1
7978 ; SI-NEXT: s_waitcnt expcnt(0)
7979 ; SI-NEXT: v_mov_b32_e32 v3, v1
7980 ; SI-NEXT: v_mov_b32_e32 v2, v0
7981 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7982 ; SI-NEXT: s_waitcnt vmcnt(0)
7983 ; SI-NEXT: buffer_wbinvl1
7984 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7985 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7986 ; SI-NEXT: v_mov_b32_e32 v1, v2
7987 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7988 ; SI-NEXT: s_cbranch_execnz .LBB129_1
7989 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7990 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
7991 ; SI-NEXT: s_mov_b32 s7, 0xf000
7992 ; SI-NEXT: s_mov_b32 s6, -1
7993 ; SI-NEXT: s_mov_b32 s4, s2
7994 ; SI-NEXT: s_mov_b32 s5, s3
7995 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
7998 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
7999 ; VI: ; %bb.0: ; %entry
8000 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
8001 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
8002 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8003 ; VI-NEXT: s_ashr_i32 s7, s5, 31
8004 ; VI-NEXT: s_mov_b32 s6, s5
8005 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
8006 ; VI-NEXT: s_add_u32 s6, s0, s6
8007 ; VI-NEXT: s_addc_u32 s7, s1, s7
8008 ; VI-NEXT: s_load_dword s5, s[6:7], 0x10
8009 ; VI-NEXT: s_add_u32 s6, s6, 16
8010 ; VI-NEXT: s_addc_u32 s7, s7, 0
8011 ; VI-NEXT: v_mov_b32_e32 v0, s6
8012 ; VI-NEXT: s_mov_b64 s[0:1], 0
8013 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8014 ; VI-NEXT: v_mov_b32_e32 v2, s5
8015 ; VI-NEXT: v_mov_b32_e32 v1, s7
8016 ; VI-NEXT: .LBB129_1: ; %atomicrmw.start
8017 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8018 ; VI-NEXT: v_mov_b32_e32 v3, v2
8019 ; VI-NEXT: v_min_i32_e32 v2, s4, v3
8020 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8021 ; VI-NEXT: s_waitcnt vmcnt(0)
8022 ; VI-NEXT: buffer_wbinvl1_vol
8023 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
8024 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8025 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8026 ; VI-NEXT: s_cbranch_execnz .LBB129_1
8027 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8028 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
8029 ; VI-NEXT: v_mov_b32_e32 v0, s2
8030 ; VI-NEXT: v_mov_b32_e32 v1, s3
8031 ; VI-NEXT: flat_store_dword v[0:1], v2
8034 ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
8035 ; GFX9: ; %bb.0: ; %entry
8036 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
8037 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
8038 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
8039 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8040 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
8041 ; GFX9-NEXT: s_mov_b32 s2, s1
8042 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
8043 ; GFX9-NEXT: s_add_u32 s2, s4, s2
8044 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
8045 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x10
8046 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
8047 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8048 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
8049 ; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start
8050 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8051 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
8052 ; GFX9-NEXT: v_min_i32_e32 v2, s0, v3
8053 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] offset:16 glc
8054 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8055 ; GFX9-NEXT: buffer_wbinvl1_vol
8056 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
8057 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8058 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
8059 ; GFX9-NEXT: s_cbranch_execnz .LBB129_1
8060 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8061 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
8062 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
8063 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
8064 ; GFX9-NEXT: s_endpgm
; Two-step address: a variable element index, then a constant 4 x i32
; (16-byte) displacement.
8066 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
8067 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
8068 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
; Storing the atomic's result keeps it live, so the returning form is exercised.
8069 store i32 %tmp0, ptr addrspace(1) %out2
; Kernel test: seq_cst signed `atomicrmw min` of %in into the i32 at %out. The
; value produced by the atomic is not used in the visible IR.
; Every check block below expects a compare-and-swap retry loop instead of a
; single atomic-min instruction: an initial s_load_dword of the old value,
; v_min_i32 to form the new value, a *_atomic_cmpswap with glc, and a
; v_cmp_eq_u32 / s_cbranch_execnz pair that branches back to .LBB130_1.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
8073 define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
8074 ; SI-LABEL: atomic_min_i32:
8075 ; SI: ; %bb.0: ; %entry
8076 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
8077 ; SI-NEXT: s_load_dword s6, s[2:3], 0xb
8078 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8079 ; SI-NEXT: s_load_dword s2, s[0:1], 0x0
8080 ; SI-NEXT: s_mov_b64 s[4:5], 0
8081 ; SI-NEXT: s_mov_b32 s3, 0xf000
8082 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8083 ; SI-NEXT: v_mov_b32_e32 v1, s2
8084 ; SI-NEXT: s_mov_b32 s2, -1
8085 ; SI-NEXT: .LBB130_1: ; %atomicrmw.start
8086 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8087 ; SI-NEXT: v_min_i32_e32 v0, s6, v1
8088 ; SI-NEXT: s_waitcnt expcnt(0)
8089 ; SI-NEXT: v_mov_b32_e32 v3, v1
8090 ; SI-NEXT: v_mov_b32_e32 v2, v0
8091 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
8092 ; SI-NEXT: s_waitcnt vmcnt(0)
8093 ; SI-NEXT: buffer_wbinvl1
8094 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
8095 ; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8096 ; SI-NEXT: v_mov_b32_e32 v1, v2
8097 ; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8098 ; SI-NEXT: s_cbranch_execnz .LBB130_1
8099 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8102 ; VI-LABEL: atomic_min_i32:
8103 ; VI: ; %bb.0: ; %entry
8104 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
8105 ; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
8106 ; VI-NEXT: s_mov_b64 s[0:1], 0
8107 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8108 ; VI-NEXT: s_load_dword s3, s[4:5], 0x0
8109 ; VI-NEXT: v_mov_b32_e32 v0, s4
8110 ; VI-NEXT: v_mov_b32_e32 v1, s5
8111 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8112 ; VI-NEXT: v_mov_b32_e32 v3, s3
8113 ; VI-NEXT: .LBB130_1: ; %atomicrmw.start
8114 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8115 ; VI-NEXT: v_min_i32_e32 v2, s2, v3
8116 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8117 ; VI-NEXT: s_waitcnt vmcnt(0)
8118 ; VI-NEXT: buffer_wbinvl1_vol
8119 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
8120 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8121 ; VI-NEXT: v_mov_b32_e32 v3, v2
8122 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8123 ; VI-NEXT: s_cbranch_execnz .LBB130_1
8124 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8127 ; GFX9-LABEL: atomic_min_i32:
8128 ; GFX9: ; %bb.0: ; %entry
8129 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8130 ; GFX9-NEXT: s_load_dword s4, s[2:3], 0x2c
8131 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
8132 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8133 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8134 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0
8135 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8136 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
8137 ; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start
8138 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8139 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
8140 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
8141 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8142 ; GFX9-NEXT: buffer_wbinvl1_vol
8143 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
8144 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8145 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
8146 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
8147 ; GFX9-NEXT: s_cbranch_execnz .LBB130_1
8148 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8149 ; GFX9-NEXT: s_endpgm
8151 %tmp0 = atomicrmw min ptr addrspace(1) %out, i32 %in seq_cst
; Kernel test: seq_cst signed `atomicrmw min` of %in on the i32 element
; %out[%index] (address built by a getelementptr with an i32 index); the value
; returned by the atomic is stored to %out2.
; The checks expect the index to be sign-extended and scaled by 4 in scalar
; registers (s_ashr_i32 ..., 31 followed by s_lshl_b64 ..., 2) and added to the
; base pointer, then a cmpswap retry loop looping on .LBB131_1, and finally a
; store of the loop result once exec has been restored with s_or_b64.
8155 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
8156 ; SI-LABEL: atomic_min_i32_ret_addr64:
8157 ; SI: ; %bb.0: ; %entry
8158 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
8159 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
8160 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8161 ; SI-NEXT: s_ashr_i32 s5, s9, 31
8162 ; SI-NEXT: s_mov_b32 s4, s9
8163 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
8164 ; SI-NEXT: s_add_u32 s4, s0, s4
8165 ; SI-NEXT: s_addc_u32 s5, s1, s5
8166 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
8167 ; SI-NEXT: s_mov_b64 s[0:1], 0
8168 ; SI-NEXT: s_mov_b32 s7, 0xf000
8169 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8170 ; SI-NEXT: v_mov_b32_e32 v1, s6
8171 ; SI-NEXT: s_mov_b32 s6, -1
8172 ; SI-NEXT: .LBB131_1: ; %atomicrmw.start
8173 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8174 ; SI-NEXT: v_min_i32_e32 v0, s8, v1
8175 ; SI-NEXT: s_waitcnt expcnt(0)
8176 ; SI-NEXT: v_mov_b32_e32 v3, v1
8177 ; SI-NEXT: v_mov_b32_e32 v2, v0
8178 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
8179 ; SI-NEXT: s_waitcnt vmcnt(0)
8180 ; SI-NEXT: buffer_wbinvl1
8181 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
8182 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8183 ; SI-NEXT: v_mov_b32_e32 v1, v2
8184 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8185 ; SI-NEXT: s_cbranch_execnz .LBB131_1
8186 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8187 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
8188 ; SI-NEXT: s_mov_b32 s7, 0xf000
8189 ; SI-NEXT: s_mov_b32 s6, -1
8190 ; SI-NEXT: s_mov_b32 s4, s2
8191 ; SI-NEXT: s_mov_b32 s5, s3
8192 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
8195 ; VI-LABEL: atomic_min_i32_ret_addr64:
8196 ; VI: ; %bb.0: ; %entry
8197 ; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
8198 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
8199 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8200 ; VI-NEXT: s_ashr_i32 s7, s5, 31
8201 ; VI-NEXT: s_mov_b32 s6, s5
8202 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
8203 ; VI-NEXT: s_add_u32 s6, s0, s6
8204 ; VI-NEXT: s_addc_u32 s7, s1, s7
8205 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
8206 ; VI-NEXT: v_mov_b32_e32 v0, s6
8207 ; VI-NEXT: s_mov_b64 s[0:1], 0
8208 ; VI-NEXT: v_mov_b32_e32 v1, s7
8209 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8210 ; VI-NEXT: v_mov_b32_e32 v2, s5
8211 ; VI-NEXT: .LBB131_1: ; %atomicrmw.start
8212 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8213 ; VI-NEXT: v_mov_b32_e32 v3, v2
8214 ; VI-NEXT: v_min_i32_e32 v2, s4, v3
8215 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8216 ; VI-NEXT: s_waitcnt vmcnt(0)
8217 ; VI-NEXT: buffer_wbinvl1_vol
8218 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
8219 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8220 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8221 ; VI-NEXT: s_cbranch_execnz .LBB131_1
8222 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8223 ; VI-NEXT: s_or_b64 exec, exec, s[0:1]
8224 ; VI-NEXT: v_mov_b32_e32 v0, s2
8225 ; VI-NEXT: v_mov_b32_e32 v1, s3
8226 ; VI-NEXT: flat_store_dword v[0:1], v2
8229 ; GFX9-LABEL: atomic_min_i32_ret_addr64:
8230 ; GFX9: ; %bb.0: ; %entry
8231 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
8232 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
8233 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
8234 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8235 ; GFX9-NEXT: s_ashr_i32 s3, s1, 31
8236 ; GFX9-NEXT: s_mov_b32 s2, s1
8237 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
8238 ; GFX9-NEXT: s_add_u32 s2, s4, s2
8239 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
8240 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
8241 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
8242 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8243 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
8244 ; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start
8245 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8246 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
8247 ; GFX9-NEXT: v_min_i32_e32 v2, s0, v3
8248 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] glc
8249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8250 ; GFX9-NEXT: buffer_wbinvl1_vol
8251 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
8252 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8253 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
8254 ; GFX9-NEXT: s_cbranch_execnz .LBB131_1
8255 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8256 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
8257 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
8258 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
8259 ; GFX9-NEXT: s_endpgm
8261 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
8262 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
8263 store i32 %tmp0, ptr addrspace(1) %out2
; Function test: seq_cst signed `atomicrmw min` at %out plus 4 i32 elements
; (16 bytes), tagged with !amdgpu.no.remote.memory; the function returns void.
; Even with the metadata, every check block still expects a cmpswap retry loop
; looping on .LBB132_1. The 16-byte displacement shows up as offset:16 on the
; buffer_* and global_* memory instructions, while the flat_* variant adds 16 to
; the pointer up front with v_add_u32 / v_addc_u32.
8267 define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8268 ; SI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8270 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8271 ; SI-NEXT: s_mov_b32 s6, 0
8272 ; SI-NEXT: s_mov_b32 s7, 0xf000
8273 ; SI-NEXT: s_mov_b32 s4, s6
8274 ; SI-NEXT: s_mov_b32 s5, s6
8275 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
8276 ; SI-NEXT: s_mov_b64 s[8:9], 0
8277 ; SI-NEXT: .LBB132_1: ; %atomicrmw.start
8278 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8279 ; SI-NEXT: s_waitcnt vmcnt(0)
8280 ; SI-NEXT: v_min_i32_e32 v3, v4, v2
8281 ; SI-NEXT: s_waitcnt expcnt(0)
8282 ; SI-NEXT: v_mov_b32_e32 v6, v4
8283 ; SI-NEXT: v_mov_b32_e32 v5, v3
8284 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
8285 ; SI-NEXT: s_waitcnt vmcnt(0)
8286 ; SI-NEXT: buffer_wbinvl1
8287 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
8288 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
8289 ; SI-NEXT: v_mov_b32_e32 v4, v5
8290 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
8291 ; SI-NEXT: s_cbranch_execnz .LBB132_1
8292 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8293 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
8294 ; SI-NEXT: s_waitcnt expcnt(0)
8295 ; SI-NEXT: s_setpc_b64 s[30:31]
8297 ; VI-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8299 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8300 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8301 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8302 ; VI-NEXT: flat_load_dword v4, v[0:1]
8303 ; VI-NEXT: s_mov_b64 s[4:5], 0
8304 ; VI-NEXT: .LBB132_1: ; %atomicrmw.start
8305 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8306 ; VI-NEXT: s_waitcnt vmcnt(0)
8307 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
8308 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
8309 ; VI-NEXT: s_waitcnt vmcnt(0)
8310 ; VI-NEXT: buffer_wbinvl1_vol
8311 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
8312 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8313 ; VI-NEXT: v_mov_b32_e32 v4, v3
8314 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8315 ; VI-NEXT: s_cbranch_execnz .LBB132_1
8316 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8317 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
8318 ; VI-NEXT: s_setpc_b64 s[30:31]
8320 ; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
8322 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8323 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
8324 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
8325 ; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
8326 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8328 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
8329 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
8330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8331 ; GFX9-NEXT: buffer_wbinvl1_vol
8332 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
8333 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8334 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
8335 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
8336 ; GFX9-NEXT: s_cbranch_execnz .LBB132_1
8337 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8338 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
8339 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8340 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8341 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Function test: seq_cst signed `atomicrmw min` at %out plus 4 i32 elements
; (16 bytes), tagged with !amdgpu.no.remote.memory; the function is declared to
; return i32.
; Every check block expects a cmpswap retry loop looping on .LBB133_1, and the
; loop result ends up in v0 before s_setpc_b64 (via an explicit v_mov_b32 in two
; of the blocks; the flat_* variant already produces it in v0).
8345 define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8346 ; SI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8348 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8349 ; SI-NEXT: s_mov_b32 s6, 0
8350 ; SI-NEXT: s_mov_b32 s7, 0xf000
8351 ; SI-NEXT: s_mov_b32 s4, s6
8352 ; SI-NEXT: s_mov_b32 s5, s6
8353 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
8354 ; SI-NEXT: s_mov_b64 s[8:9], 0
8355 ; SI-NEXT: .LBB133_1: ; %atomicrmw.start
8356 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8357 ; SI-NEXT: s_waitcnt vmcnt(0)
8358 ; SI-NEXT: v_mov_b32_e32 v5, v3
8359 ; SI-NEXT: s_waitcnt expcnt(0)
8360 ; SI-NEXT: v_min_i32_e32 v4, v5, v2
8361 ; SI-NEXT: v_mov_b32_e32 v3, v4
8362 ; SI-NEXT: v_mov_b32_e32 v4, v5
8363 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
8364 ; SI-NEXT: s_waitcnt vmcnt(0)
8365 ; SI-NEXT: buffer_wbinvl1
8366 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
8367 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
8368 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
8369 ; SI-NEXT: s_cbranch_execnz .LBB133_1
8370 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8371 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
8372 ; SI-NEXT: v_mov_b32_e32 v0, v3
8373 ; SI-NEXT: s_waitcnt expcnt(0)
8374 ; SI-NEXT: s_setpc_b64 s[30:31]
8376 ; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8378 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8379 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
8380 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
8381 ; VI-NEXT: flat_load_dword v0, v[3:4]
8382 ; VI-NEXT: s_mov_b64 s[4:5], 0
8383 ; VI-NEXT: .LBB133_1: ; %atomicrmw.start
8384 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8385 ; VI-NEXT: s_waitcnt vmcnt(0)
8386 ; VI-NEXT: v_mov_b32_e32 v1, v0
8387 ; VI-NEXT: v_min_i32_e32 v0, v1, v2
8388 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
8389 ; VI-NEXT: s_waitcnt vmcnt(0)
8390 ; VI-NEXT: buffer_wbinvl1_vol
8391 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
8392 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8393 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8394 ; VI-NEXT: s_cbranch_execnz .LBB133_1
8395 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8396 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
8397 ; VI-NEXT: s_setpc_b64 s[30:31]
8399 ; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
8401 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8402 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
8403 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
8404 ; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
8405 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8407 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
8408 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
8409 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
8410 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8411 ; GFX9-NEXT: buffer_wbinvl1_vol
8412 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
8413 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8414 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
8415 ; GFX9-NEXT: s_cbranch_execnz .LBB133_1
8416 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8417 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
8418 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
8419 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8420 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8421 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8425 ; ---------------------------------------------------------------------
8426 ; atomicrmw uinc_wrap
8427 ; ---------------------------------------------------------------------
; Function test: seq_cst `atomicrmw uinc_wrap` of %in directly on %ptr; the
; function returns void.
; Unlike the cmpswap-loop tests, the checks expect one atomic-inc instruction
; per target (buffer_atomic_inc, flat_atomic_inc or global_atomic_inc), emitted
; without glc, followed by s_waitcnt vmcnt(0) and buffer_wbinvl1 /
; buffer_wbinvl1_vol.
8429 define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
8430 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret:
8432 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8433 ; SI-NEXT: s_mov_b32 s6, 0
8434 ; SI-NEXT: s_mov_b32 s7, 0xf000
8435 ; SI-NEXT: s_mov_b32 s4, s6
8436 ; SI-NEXT: s_mov_b32 s5, s6
8437 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
8438 ; SI-NEXT: s_waitcnt vmcnt(0)
8439 ; SI-NEXT: buffer_wbinvl1
8440 ; SI-NEXT: s_waitcnt expcnt(0)
8441 ; SI-NEXT: s_setpc_b64 s[30:31]
8443 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
8445 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8446 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8447 ; VI-NEXT: s_waitcnt vmcnt(0)
8448 ; VI-NEXT: buffer_wbinvl1_vol
8449 ; VI-NEXT: s_setpc_b64 s[30:31]
8451 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
8453 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8454 ; GFX9-NEXT: global_atomic_inc v[0:1], v2, off
8455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8456 ; GFX9-NEXT: buffer_wbinvl1_vol
8457 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8458 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Function test: seq_cst `atomicrmw uinc_wrap` at %out plus 4 i32 elements
; (16 bytes); the function returns void.
; The checks expect a single atomic-inc without glc. The displacement is encoded
; as offset:16 on the buffer_* and global_* forms, while the flat_* form first
; adds 16 to the 64-bit pointer with v_add_u32 / v_addc_u32.
8462 define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
8463 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8465 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8466 ; SI-NEXT: s_mov_b32 s6, 0
8467 ; SI-NEXT: s_mov_b32 s7, 0xf000
8468 ; SI-NEXT: s_mov_b32 s4, s6
8469 ; SI-NEXT: s_mov_b32 s5, s6
8470 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
8471 ; SI-NEXT: s_waitcnt vmcnt(0)
8472 ; SI-NEXT: buffer_wbinvl1
8473 ; SI-NEXT: s_waitcnt expcnt(0)
8474 ; SI-NEXT: s_setpc_b64 s[30:31]
8476 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8478 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8479 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8480 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8481 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8482 ; VI-NEXT: s_waitcnt vmcnt(0)
8483 ; VI-NEXT: buffer_wbinvl1_vol
8484 ; VI-NEXT: s_setpc_b64 s[30:31]
8486 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
8488 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8489 ; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16
8490 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8491 ; GFX9-NEXT: buffer_wbinvl1_vol
8492 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8493 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8494 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Function test: seq_cst `atomicrmw uinc_wrap` of %in directly on %ptr; the
; function is declared to return i32.
; The checks expect the glc form of a single atomic-inc with a destination
; register. The buffer_* variant writes the result to v2 and copies it to v0;
; the flat_* and global_* variants write v0 directly.
8498 define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
8499 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret:
8501 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8502 ; SI-NEXT: s_mov_b32 s6, 0
8503 ; SI-NEXT: s_mov_b32 s7, 0xf000
8504 ; SI-NEXT: s_mov_b32 s4, s6
8505 ; SI-NEXT: s_mov_b32 s5, s6
8506 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
8507 ; SI-NEXT: s_waitcnt vmcnt(0)
8508 ; SI-NEXT: buffer_wbinvl1
8509 ; SI-NEXT: v_mov_b32_e32 v0, v2
8510 ; SI-NEXT: s_waitcnt expcnt(0)
8511 ; SI-NEXT: s_setpc_b64 s[30:31]
8513 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
8515 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8516 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8517 ; VI-NEXT: s_waitcnt vmcnt(0)
8518 ; VI-NEXT: buffer_wbinvl1_vol
8519 ; VI-NEXT: s_setpc_b64 s[30:31]
8521 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
8523 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8524 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
8525 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8526 ; GFX9-NEXT: buffer_wbinvl1_vol
8527 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8528 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Function test: seq_cst `atomicrmw uinc_wrap` at %out plus 4 i32 elements
; (16 bytes); the function is declared to return i32.
; The checks expect the glc form of a single atomic-inc with the result landing
; in v0. The 16-byte displacement is encoded as offset:16 on the buffer_* and
; global_* forms and as an explicit v_add_u32 / v_addc_u32 before the flat_* one.
8532 define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
8533 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8535 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8536 ; SI-NEXT: s_mov_b32 s6, 0
8537 ; SI-NEXT: s_mov_b32 s7, 0xf000
8538 ; SI-NEXT: s_mov_b32 s4, s6
8539 ; SI-NEXT: s_mov_b32 s5, s6
8540 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8541 ; SI-NEXT: s_waitcnt vmcnt(0)
8542 ; SI-NEXT: buffer_wbinvl1
8543 ; SI-NEXT: v_mov_b32_e32 v0, v2
8544 ; SI-NEXT: s_waitcnt expcnt(0)
8545 ; SI-NEXT: s_setpc_b64 s[30:31]
8547 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8549 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8550 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8551 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8552 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8553 ; VI-NEXT: s_waitcnt vmcnt(0)
8554 ; VI-NEXT: buffer_wbinvl1_vol
8555 ; VI-NEXT: s_setpc_b64 s[30:31]
8557 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8559 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8560 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc
8561 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8562 ; GFX9-NEXT: buffer_wbinvl1_vol
8563 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8564 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8565 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx function test with both arguments marked inreg: seq_cst
; `atomicrmw uinc_wrap` of %in directly on %ptr, returning void.
; In the checks the pointer is read from s[4:5] and the operand from s6.
; The buffer_* variant overwrites s6/s7 to form the s[4:7] operand, so it keeps
; the incoming s6/s7 in lanes of v1 (v_writelane / v_readlane) and spills v1
; itself around the body under s_xor_saveexec_b64 -- presumably because those
; SGPRs must be preserved by the callee; confirm against the calling convention.
; The flat_* variant copies the scalars into VGPRs, and the global_* variant
; uses s[4:5] as the address operand together with a zeroed v0.
8569 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8570 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8572 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8573 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8574 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8575 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8576 ; SI-NEXT: s_waitcnt expcnt(0)
8577 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8578 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8579 ; SI-NEXT: s_mov_b32 s34, s6
8580 ; SI-NEXT: s_mov_b32 s7, 0xf000
8581 ; SI-NEXT: s_mov_b32 s6, -1
8582 ; SI-NEXT: v_mov_b32_e32 v0, s34
8583 ; SI-NEXT: s_waitcnt vmcnt(0)
8584 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0
8585 ; SI-NEXT: s_waitcnt vmcnt(0)
8586 ; SI-NEXT: buffer_wbinvl1
8587 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8588 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8589 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8590 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8591 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8592 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8593 ; SI-NEXT: s_setpc_b64 s[30:31]
8595 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8597 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8598 ; VI-NEXT: v_mov_b32_e32 v0, s4
8599 ; VI-NEXT: v_mov_b32_e32 v1, s5
8600 ; VI-NEXT: v_mov_b32_e32 v2, s6
8601 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8602 ; VI-NEXT: s_waitcnt vmcnt(0)
8603 ; VI-NEXT: buffer_wbinvl1_vol
8604 ; VI-NEXT: s_setpc_b64 s[30:31]
8606 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8608 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8609 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8610 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8611 ; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5]
8612 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8613 ; GFX9-NEXT: buffer_wbinvl1_vol
8614 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8615 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx function test with both arguments marked inreg: seq_cst
; `atomicrmw uinc_wrap` at %out plus 4 i32 elements (16 bytes), returning void.
; In the checks the pointer is read from s[4:5] and the operand from s6.
; The 16-byte displacement is encoded as offset:16 on the buffer_* and global_*
; forms; the flat_* form computes it in scalar registers first
; (s_add_u32 / s_addc_u32 into s[34:35]) and then copies it to VGPRs.
; The buffer_* variant also saves and restores s6/s7 through lanes of v1, as it
; overwrites them to form the s[4:7] operand.
8619 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8620 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8622 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8623 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8624 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8625 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8626 ; SI-NEXT: s_waitcnt expcnt(0)
8627 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8628 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8629 ; SI-NEXT: s_mov_b32 s34, s6
8630 ; SI-NEXT: s_mov_b32 s7, 0xf000
8631 ; SI-NEXT: s_mov_b32 s6, -1
8632 ; SI-NEXT: v_mov_b32_e32 v0, s34
8633 ; SI-NEXT: s_waitcnt vmcnt(0)
8634 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
8635 ; SI-NEXT: s_waitcnt vmcnt(0)
8636 ; SI-NEXT: buffer_wbinvl1
8637 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8638 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8639 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8640 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8641 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8642 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8643 ; SI-NEXT: s_setpc_b64 s[30:31]
8645 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8647 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8648 ; VI-NEXT: s_add_u32 s34, s4, 16
8649 ; VI-NEXT: s_addc_u32 s35, s5, 0
8650 ; VI-NEXT: v_mov_b32_e32 v0, s34
8651 ; VI-NEXT: v_mov_b32_e32 v1, s35
8652 ; VI-NEXT: v_mov_b32_e32 v2, s6
8653 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8654 ; VI-NEXT: s_waitcnt vmcnt(0)
8655 ; VI-NEXT: buffer_wbinvl1_vol
8656 ; VI-NEXT: s_setpc_b64 s[30:31]
8658 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8660 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8661 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8662 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8663 ; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16
8664 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8665 ; GFX9-NEXT: buffer_wbinvl1_vol
8666 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8667 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8668 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; amdgpu_gfx function test with both arguments marked inreg: seq_cst
; `atomicrmw uinc_wrap` of %in directly on %ptr; declared to return i32.
; In the checks the pointer is read from s[4:5] and the operand from s6, and
; each variant uses the glc form of a single atomic-inc whose result register
; is v0.
; The buffer_* variant also saves and restores s6/s7 through lanes of v1, as it
; overwrites them to form the s[4:7] operand.
8672 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8673 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8675 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8676 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8677 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8678 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8679 ; SI-NEXT: s_waitcnt expcnt(0)
8680 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8681 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8682 ; SI-NEXT: s_mov_b32 s34, s6
8683 ; SI-NEXT: s_mov_b32 s7, 0xf000
8684 ; SI-NEXT: s_mov_b32 s6, -1
8685 ; SI-NEXT: v_mov_b32_e32 v0, s34
8686 ; SI-NEXT: s_waitcnt vmcnt(0)
8687 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc
8688 ; SI-NEXT: s_waitcnt vmcnt(0)
8689 ; SI-NEXT: buffer_wbinvl1
8690 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8691 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8692 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8693 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8694 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8695 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8696 ; SI-NEXT: s_setpc_b64 s[30:31]
8698 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8700 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8701 ; VI-NEXT: v_mov_b32_e32 v0, s4
8702 ; VI-NEXT: v_mov_b32_e32 v1, s5
8703 ; VI-NEXT: v_mov_b32_e32 v2, s6
8704 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8705 ; VI-NEXT: s_waitcnt vmcnt(0)
8706 ; VI-NEXT: buffer_wbinvl1_vol
8707 ; VI-NEXT: s_setpc_b64 s[30:31]
8709 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8711 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8712 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8713 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8714 ; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc
8715 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8716 ; GFX9-NEXT: buffer_wbinvl1_vol
8717 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8718 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; amdgpu_gfx function test with both arguments marked inreg: seq_cst
; `atomicrmw uinc_wrap` at %out plus 4 i32 elements (16 bytes); declared to
; return i32.
; The checks expect the glc form of a single atomic-inc whose result register is
; v0. The displacement is encoded as offset:16 on the buffer_* and global_*
; forms and computed with s_add_u32 / s_addc_u32 for the flat_* form.
; The buffer_* variant also saves and restores s6/s7 through lanes of v1, as it
; overwrites them to form the s[4:7] operand.
8722 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8723 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8725 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8726 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8727 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8728 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8729 ; SI-NEXT: s_waitcnt expcnt(0)
8730 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8731 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8732 ; SI-NEXT: s_mov_b32 s34, s6
8733 ; SI-NEXT: s_mov_b32 s7, 0xf000
8734 ; SI-NEXT: s_mov_b32 s6, -1
8735 ; SI-NEXT: v_mov_b32_e32 v0, s34
8736 ; SI-NEXT: s_waitcnt vmcnt(0)
8737 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
8738 ; SI-NEXT: s_waitcnt vmcnt(0)
8739 ; SI-NEXT: buffer_wbinvl1
8740 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8741 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8742 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8743 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8744 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8745 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8746 ; SI-NEXT: s_setpc_b64 s[30:31]
8748 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8750 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8751 ; VI-NEXT: s_add_u32 s34, s4, 16
8752 ; VI-NEXT: s_addc_u32 s35, s5, 0
8753 ; VI-NEXT: v_mov_b32_e32 v0, s34
8754 ; VI-NEXT: v_mov_b32_e32 v1, s35
8755 ; VI-NEXT: v_mov_b32_e32 v2, s6
8756 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8757 ; VI-NEXT: s_waitcnt vmcnt(0)
8758 ; VI-NEXT: buffer_wbinvl1_vol
8759 ; VI-NEXT: s_setpc_b64 s[30:31]
8761 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8763 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8764 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8765 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8766 ; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
8767 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8768 ; GFX9-NEXT: buffer_wbinvl1_vol
8769 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8770 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8771 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Function test: seq_cst `atomicrmw uinc_wrap` at %out plus 4 i32 elements
; (16 bytes, gep index written as i64), tagged with !amdgpu.no.remote.memory;
; the function returns void.
; The expected instructions are the same as for the untagged
; global_atomic_uinc_wrap_i32_noret_offset test: a single atomic-inc without
; glc, with the 16-byte displacement either folded as offset:16 or added to the
; pointer beforehand for the flat_* form.
8775 define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8776 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8778 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8779 ; SI-NEXT: s_mov_b32 s6, 0
8780 ; SI-NEXT: s_mov_b32 s7, 0xf000
8781 ; SI-NEXT: s_mov_b32 s4, s6
8782 ; SI-NEXT: s_mov_b32 s5, s6
8783 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
8784 ; SI-NEXT: s_waitcnt vmcnt(0)
8785 ; SI-NEXT: buffer_wbinvl1
8786 ; SI-NEXT: s_waitcnt expcnt(0)
8787 ; SI-NEXT: s_setpc_b64 s[30:31]
8789 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8791 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8792 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8793 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8794 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8795 ; VI-NEXT: s_waitcnt vmcnt(0)
8796 ; VI-NEXT: buffer_wbinvl1_vol
8797 ; VI-NEXT: s_setpc_b64 s[30:31]
8799 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
8801 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8802 ; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16
8803 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8804 ; GFX9-NEXT: buffer_wbinvl1_vol
8805 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8806 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8807 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Function test: seq_cst `atomicrmw uinc_wrap` at %out plus 4 i32 elements
; (16 bytes, gep index written as i64), tagged with !amdgpu.no.remote.memory;
; the function is declared to return i32.
; The expected instructions are the same as for the untagged
; global_atomic_uinc_wrap_i32_ret_offset test: the glc form of a single
; atomic-inc with the result ending up in v0.
8811 define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
8812 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8814 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8815 ; SI-NEXT: s_mov_b32 s6, 0
8816 ; SI-NEXT: s_mov_b32 s7, 0xf000
8817 ; SI-NEXT: s_mov_b32 s4, s6
8818 ; SI-NEXT: s_mov_b32 s5, s6
8819 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8820 ; SI-NEXT: s_waitcnt vmcnt(0)
8821 ; SI-NEXT: buffer_wbinvl1
8822 ; SI-NEXT: v_mov_b32_e32 v0, v2
8823 ; SI-NEXT: s_waitcnt expcnt(0)
8824 ; SI-NEXT: s_setpc_b64 s[30:31]
8826 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8828 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8829 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8830 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8831 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8832 ; VI-NEXT: s_waitcnt vmcnt(0)
8833 ; VI-NEXT: buffer_wbinvl1_vol
8834 ; VI-NEXT: s_setpc_b64 s[30:31]
8836 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
8838 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8839 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc
8840 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8841 ; GFX9-NEXT: buffer_wbinvl1_vol
8842 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8843 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
8844 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
8848 ; ---------------------------------------------------------------------
8849 ; atomicrmw udec_wrap
8850 ; ---------------------------------------------------------------------
; Baseline udec_wrap test: a seq_cst atomicrmw directly on the incoming global
; (addrspace(1)) pointer, in a function that returns void. The expected
; instruction on every target is an atomic dec without the glc modifier and
; without any address offset. The assertions are autogenerated (see the NOTE on
; the first line of this file); regenerate them rather than editing by hand.
8852 define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
8853 ; SI-LABEL: global_atomic_udec_wrap_i32_noret:
8855 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8856 ; SI-NEXT: s_mov_b32 s6, 0
8857 ; SI-NEXT: s_mov_b32 s7, 0xf000
8858 ; SI-NEXT: s_mov_b32 s4, s6
8859 ; SI-NEXT: s_mov_b32 s5, s6
8860 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
8861 ; SI-NEXT: s_waitcnt vmcnt(0)
8862 ; SI-NEXT: buffer_wbinvl1
8863 ; SI-NEXT: s_waitcnt expcnt(0)
8864 ; SI-NEXT: s_setpc_b64 s[30:31]
8866 ; VI-LABEL: global_atomic_udec_wrap_i32_noret:
8868 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8869 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8870 ; VI-NEXT: s_waitcnt vmcnt(0)
8871 ; VI-NEXT: buffer_wbinvl1_vol
8872 ; VI-NEXT: s_setpc_b64 s[30:31]
8874 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
8876 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8877 ; GFX9-NEXT: global_atomic_dec v[0:1], v2, off
8878 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8879 ; GFX9-NEXT: buffer_wbinvl1_vol
8880 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8881 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Void-returning udec_wrap test with a constant address offset: the seq_cst
; atomicrmw targets the global (addrspace(1)) pointer advanced by four i32
; elements. The expected atomic dec has no glc modifier. The assertions are
; autogenerated (see the NOTE on the first line of this file); regenerate them
; rather than editing by hand.
8885 define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
8886 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8888 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8889 ; SI-NEXT: s_mov_b32 s6, 0
8890 ; SI-NEXT: s_mov_b32 s7, 0xf000
8891 ; SI-NEXT: s_mov_b32 s4, s6
8892 ; SI-NEXT: s_mov_b32 s5, s6
8893 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
8894 ; SI-NEXT: s_waitcnt vmcnt(0)
8895 ; SI-NEXT: buffer_wbinvl1
8896 ; SI-NEXT: s_waitcnt expcnt(0)
8897 ; SI-NEXT: s_setpc_b64 s[30:31]
8899 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8901 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8902 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8903 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8904 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8905 ; VI-NEXT: s_waitcnt vmcnt(0)
8906 ; VI-NEXT: buffer_wbinvl1_vol
8907 ; VI-NEXT: s_setpc_b64 s[30:31]
8909 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8911 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8912 ; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
8913 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8914 ; GFX9-NEXT: buffer_wbinvl1_vol
8915 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes. The SI and GFX9 checks fold this as offset:16, while the
; VI checks add 16 to the 64-bit address in v[0:1] before the flat atomic.
8916 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8917 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Returning udec_wrap test without an offset: the seq_cst atomicrmw on the
; incoming global (addrspace(1)) pointer binds its result to %result in an
; i32-returning function. Every target is expected to emit the atomic dec with
; the glc modifier; the SI checks additionally copy v2 into v0 afterwards. The
; assertions are autogenerated (see the NOTE on the first line of this file);
; regenerate them rather than editing by hand.
8921 define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
8922 ; SI-LABEL: global_atomic_udec_wrap_i32_ret:
8924 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8925 ; SI-NEXT: s_mov_b32 s6, 0
8926 ; SI-NEXT: s_mov_b32 s7, 0xf000
8927 ; SI-NEXT: s_mov_b32 s4, s6
8928 ; SI-NEXT: s_mov_b32 s5, s6
8929 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
8930 ; SI-NEXT: s_waitcnt vmcnt(0)
8931 ; SI-NEXT: buffer_wbinvl1
8932 ; SI-NEXT: v_mov_b32_e32 v0, v2
8933 ; SI-NEXT: s_waitcnt expcnt(0)
8934 ; SI-NEXT: s_setpc_b64 s[30:31]
8936 ; VI-LABEL: global_atomic_udec_wrap_i32_ret:
8938 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8939 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8940 ; VI-NEXT: s_waitcnt vmcnt(0)
8941 ; VI-NEXT: buffer_wbinvl1_vol
8942 ; VI-NEXT: s_setpc_b64 s[30:31]
8944 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
8946 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8947 ; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc
8948 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8949 ; GFX9-NEXT: buffer_wbinvl1_vol
8950 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8951 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning udec_wrap test with a constant address offset: the seq_cst
; atomicrmw targets the global (addrspace(1)) pointer advanced by four i32
; elements and binds its result to %result. Every target is expected to emit
; the atomic dec with the glc modifier. The assertions are autogenerated (see
; the NOTE on the first line of this file); regenerate them rather than
; editing by hand.
8955 define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
8956 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8958 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8959 ; SI-NEXT: s_mov_b32 s6, 0
8960 ; SI-NEXT: s_mov_b32 s7, 0xf000
8961 ; SI-NEXT: s_mov_b32 s4, s6
8962 ; SI-NEXT: s_mov_b32 s5, s6
8963 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8964 ; SI-NEXT: s_waitcnt vmcnt(0)
8965 ; SI-NEXT: buffer_wbinvl1
8966 ; SI-NEXT: v_mov_b32_e32 v0, v2
8967 ; SI-NEXT: s_waitcnt expcnt(0)
8968 ; SI-NEXT: s_setpc_b64 s[30:31]
8970 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8972 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8973 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8974 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8975 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8976 ; VI-NEXT: s_waitcnt vmcnt(0)
8977 ; VI-NEXT: buffer_wbinvl1_vol
8978 ; VI-NEXT: s_setpc_b64 s[30:31]
8980 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8982 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8983 ; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
8984 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8985 ; GFX9-NEXT: buffer_wbinvl1_vol
8986 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes. The SI and GFX9 checks fold this as offset:16, while the
; VI checks add 16 to the 64-bit address in v[0:1] before the flat atomic.
8987 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8988 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Void-returning udec_wrap test where both arguments are marked inreg under the
; amdgpu_gfx calling convention; the checks show the pointer arriving in s[4:5]
; and the operand in s6. Expected lowering per target, as asserted below:
;   - SI saves s6/s7 into lanes of v1 (itself spilled to the stack) before
;     overwriting them to form the s[4:7] operand of the buffer atomic, then
;     restores them.
;   - VI copies s4, s5 and s6 into VGPRs and issues a flat atomic.
;   - GFX9 keeps s[4:5] as the base of the global atomic and uses a zero VGPR
;     as the address register.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing by hand.
8992 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8993 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
8995 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8996 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8997 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8998 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8999 ; SI-NEXT: s_waitcnt expcnt(0)
9000 ; SI-NEXT: v_writelane_b32 v1, s6, 0
9001 ; SI-NEXT: v_writelane_b32 v1, s7, 1
9002 ; SI-NEXT: s_mov_b32 s34, s6
9003 ; SI-NEXT: s_mov_b32 s7, 0xf000
9004 ; SI-NEXT: s_mov_b32 s6, -1
9005 ; SI-NEXT: v_mov_b32_e32 v0, s34
9006 ; SI-NEXT: s_waitcnt vmcnt(0)
9007 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0
9008 ; SI-NEXT: s_waitcnt vmcnt(0)
9009 ; SI-NEXT: buffer_wbinvl1
9010 ; SI-NEXT: v_readlane_b32 s7, v1, 1
9011 ; SI-NEXT: v_readlane_b32 s6, v1, 0
9012 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9013 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9014 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9015 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9016 ; SI-NEXT: s_setpc_b64 s[30:31]
9018 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
9020 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9021 ; VI-NEXT: v_mov_b32_e32 v0, s4
9022 ; VI-NEXT: v_mov_b32_e32 v1, s5
9023 ; VI-NEXT: v_mov_b32_e32 v2, s6
9024 ; VI-NEXT: flat_atomic_dec v[0:1], v2
9025 ; VI-NEXT: s_waitcnt vmcnt(0)
9026 ; VI-NEXT: buffer_wbinvl1_vol
9027 ; VI-NEXT: s_setpc_b64 s[30:31]
9029 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
9031 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9032 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9033 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
9034 ; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5]
9035 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9036 ; GFX9-NEXT: buffer_wbinvl1_vol
9037 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9038 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Void-returning udec_wrap test combining inreg (amdgpu_gfx) arguments with a
; constant address offset of four i32 elements. The checks show the pointer
; arriving in s[4:5] and the operand in s6. Expected lowering per target, as
; asserted below:
;   - SI saves s6/s7 into lanes of v1 before overwriting them to form the
;     s[4:7] operand, and folds the offset into the buffer atomic.
;   - VI computes the address with a scalar add/addc into s[34:35], copies it
;     to VGPRs and issues a flat atomic.
;   - GFX9 keeps s[4:5] as the base and folds the offset into the global atomic.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing by hand.
9042 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
9043 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9045 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9046 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9047 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9048 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9049 ; SI-NEXT: s_waitcnt expcnt(0)
9050 ; SI-NEXT: v_writelane_b32 v1, s6, 0
9051 ; SI-NEXT: v_writelane_b32 v1, s7, 1
9052 ; SI-NEXT: s_mov_b32 s34, s6
9053 ; SI-NEXT: s_mov_b32 s7, 0xf000
9054 ; SI-NEXT: s_mov_b32 s6, -1
9055 ; SI-NEXT: v_mov_b32_e32 v0, s34
9056 ; SI-NEXT: s_waitcnt vmcnt(0)
9057 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
9058 ; SI-NEXT: s_waitcnt vmcnt(0)
9059 ; SI-NEXT: buffer_wbinvl1
9060 ; SI-NEXT: v_readlane_b32 s7, v1, 1
9061 ; SI-NEXT: v_readlane_b32 s6, v1, 0
9062 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9063 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9064 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9065 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9066 ; SI-NEXT: s_setpc_b64 s[30:31]
9068 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9070 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9071 ; VI-NEXT: s_add_u32 s34, s4, 16
9072 ; VI-NEXT: s_addc_u32 s35, s5, 0
9073 ; VI-NEXT: v_mov_b32_e32 v0, s34
9074 ; VI-NEXT: v_mov_b32_e32 v1, s35
9075 ; VI-NEXT: v_mov_b32_e32 v2, s6
9076 ; VI-NEXT: flat_atomic_dec v[0:1], v2
9077 ; VI-NEXT: s_waitcnt vmcnt(0)
9078 ; VI-NEXT: buffer_wbinvl1_vol
9079 ; VI-NEXT: s_setpc_b64 s[30:31]
9081 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
9083 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9084 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9085 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
9086 ; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16
9087 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9088 ; GFX9-NEXT: buffer_wbinvl1_vol
9089 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes, matching the offset:16 / "+ 16" seen in the checks above.
9090 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
9091 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Returning udec_wrap test where both arguments are marked inreg under the
; amdgpu_gfx calling convention; the checks show the pointer arriving in s[4:5]
; and the operand in s6. The atomicrmw result is bound to %result, and every
; target is expected to emit the atomic dec with the glc modifier, writing v0.
; As in the void-returning scalar variant, the SI checks save and restore s6/s7
; through lanes of v1 around the buffer atomic.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing by hand.
9095 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
9096 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9098 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9099 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9100 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9101 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9102 ; SI-NEXT: s_waitcnt expcnt(0)
9103 ; SI-NEXT: v_writelane_b32 v1, s6, 0
9104 ; SI-NEXT: v_writelane_b32 v1, s7, 1
9105 ; SI-NEXT: s_mov_b32 s34, s6
9106 ; SI-NEXT: s_mov_b32 s7, 0xf000
9107 ; SI-NEXT: s_mov_b32 s6, -1
9108 ; SI-NEXT: v_mov_b32_e32 v0, s34
9109 ; SI-NEXT: s_waitcnt vmcnt(0)
9110 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc
9111 ; SI-NEXT: s_waitcnt vmcnt(0)
9112 ; SI-NEXT: buffer_wbinvl1
9113 ; SI-NEXT: v_readlane_b32 s7, v1, 1
9114 ; SI-NEXT: v_readlane_b32 s6, v1, 0
9115 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9116 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9117 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9118 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9119 ; SI-NEXT: s_setpc_b64 s[30:31]
9121 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9123 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9124 ; VI-NEXT: v_mov_b32_e32 v0, s4
9125 ; VI-NEXT: v_mov_b32_e32 v1, s5
9126 ; VI-NEXT: v_mov_b32_e32 v2, s6
9127 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
9128 ; VI-NEXT: s_waitcnt vmcnt(0)
9129 ; VI-NEXT: buffer_wbinvl1_vol
9130 ; VI-NEXT: s_setpc_b64 s[30:31]
9132 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
9134 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9135 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9136 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
9137 ; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc
9138 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9139 ; GFX9-NEXT: buffer_wbinvl1_vol
9140 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9141 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning udec_wrap test combining inreg (amdgpu_gfx) arguments with a
; constant address offset of four i32 elements. The checks show the pointer
; arriving in s[4:5] and the operand in s6. The atomicrmw result is bound to
; %result, and every target is expected to emit the atomic dec with the glc
; modifier, writing v0. SI and GFX9 fold the offset into the instruction; VI
; computes the address with a scalar add/addc into s[34:35] first.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them rather than editing by hand.
9145 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
9146 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9148 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9149 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9150 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
9151 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9152 ; SI-NEXT: s_waitcnt expcnt(0)
9153 ; SI-NEXT: v_writelane_b32 v1, s6, 0
9154 ; SI-NEXT: v_writelane_b32 v1, s7, 1
9155 ; SI-NEXT: s_mov_b32 s34, s6
9156 ; SI-NEXT: s_mov_b32 s7, 0xf000
9157 ; SI-NEXT: s_mov_b32 s6, -1
9158 ; SI-NEXT: v_mov_b32_e32 v0, s34
9159 ; SI-NEXT: s_waitcnt vmcnt(0)
9160 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
9161 ; SI-NEXT: s_waitcnt vmcnt(0)
9162 ; SI-NEXT: buffer_wbinvl1
9163 ; SI-NEXT: v_readlane_b32 s7, v1, 1
9164 ; SI-NEXT: v_readlane_b32 s6, v1, 0
9165 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9166 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
9167 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9168 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9169 ; SI-NEXT: s_setpc_b64 s[30:31]
9171 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9173 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9174 ; VI-NEXT: s_add_u32 s34, s4, 16
9175 ; VI-NEXT: s_addc_u32 s35, s5, 0
9176 ; VI-NEXT: v_mov_b32_e32 v0, s34
9177 ; VI-NEXT: v_mov_b32_e32 v1, s35
9178 ; VI-NEXT: v_mov_b32_e32 v2, s6
9179 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
9180 ; VI-NEXT: s_waitcnt vmcnt(0)
9181 ; VI-NEXT: buffer_wbinvl1_vol
9182 ; VI-NEXT: s_setpc_b64 s[30:31]
9184 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
9186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9187 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9188 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
9189 ; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
9190 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9191 ; GFX9-NEXT: buffer_wbinvl1_vol
9192 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes, matching the offset:16 / "+ 16" seen in the checks above.
9193 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
9194 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
; Void-returning udec_wrap offset test with !amdgpu.no.remote.memory attached
; to the atomicrmw. The expected instruction sequences below are the same as
; those asserted for global_atomic_udec_wrap_i32_noret_offset, i.e. the
; metadata is not expected to change the selected code here. The assertions are
; autogenerated (see the NOTE on the first line of this file); regenerate them
; rather than editing by hand.
9198 define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
9199 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9201 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9202 ; SI-NEXT: s_mov_b32 s6, 0
9203 ; SI-NEXT: s_mov_b32 s7, 0xf000
9204 ; SI-NEXT: s_mov_b32 s4, s6
9205 ; SI-NEXT: s_mov_b32 s5, s6
9206 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
9207 ; SI-NEXT: s_waitcnt vmcnt(0)
9208 ; SI-NEXT: buffer_wbinvl1
9209 ; SI-NEXT: s_waitcnt expcnt(0)
9210 ; SI-NEXT: s_setpc_b64 s[30:31]
9212 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9214 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9215 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
9216 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9217 ; VI-NEXT: flat_atomic_dec v[0:1], v2
9218 ; VI-NEXT: s_waitcnt vmcnt(0)
9219 ; VI-NEXT: buffer_wbinvl1_vol
9220 ; VI-NEXT: s_setpc_b64 s[30:31]
9222 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
9224 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9225 ; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
9226 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9227 ; GFX9-NEXT: buffer_wbinvl1_vol
9228 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes. The SI and GFX9 checks fold this as offset:16, while the
; VI checks add 16 to the 64-bit address in v[0:1] before the flat atomic.
9229 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
9230 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning udec_wrap offset test with !amdgpu.no.remote.memory attached to the
; atomicrmw, whose result is bound to %result. The expected instruction
; sequences below are the same as those asserted for
; global_atomic_udec_wrap_i32_ret_offset: one atomic dec with the glc modifier
; per target. The assertions are autogenerated (see the NOTE on the first line
; of this file); regenerate them rather than editing by hand.
9234 define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i32 %in) {
9235 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9237 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9238 ; SI-NEXT: s_mov_b32 s6, 0
9239 ; SI-NEXT: s_mov_b32 s7, 0xf000
9240 ; SI-NEXT: s_mov_b32 s4, s6
9241 ; SI-NEXT: s_mov_b32 s5, s6
9242 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
9243 ; SI-NEXT: s_waitcnt vmcnt(0)
9244 ; SI-NEXT: buffer_wbinvl1
9245 ; SI-NEXT: v_mov_b32_e32 v0, v2
9246 ; SI-NEXT: s_waitcnt expcnt(0)
9247 ; SI-NEXT: s_setpc_b64 s[30:31]
9249 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9251 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9252 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
9253 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9254 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
9255 ; VI-NEXT: s_waitcnt vmcnt(0)
9256 ; VI-NEXT: buffer_wbinvl1_vol
9257 ; VI-NEXT: s_setpc_b64 s[30:31]
9259 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
9261 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9262 ; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
9263 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9264 ; GFX9-NEXT: buffer_wbinvl1_vol
9265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; 4 x i32 = 16 bytes. The SI and GFX9 checks fold this as offset:16, while the
; VI checks add 16 to the 64-bit address in v[0:1] before the flat atomic.
9266 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
9267 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0