1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
10 define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
11 ; SI-LABEL: global_atomic_xchg_i32_noret:
13 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; SI-NEXT: s_mov_b32 s6, 0
15 ; SI-NEXT: s_mov_b32 s7, 0xf000
16 ; SI-NEXT: s_mov_b32 s4, s6
17 ; SI-NEXT: s_mov_b32 s5, s6
18 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
20 ; SI-NEXT: s_waitcnt vmcnt(0)
21 ; SI-NEXT: buffer_wbinvl1
22 ; SI-NEXT: s_waitcnt expcnt(0)
23 ; SI-NEXT: s_setpc_b64 s[30:31]
25 ; VI-LABEL: global_atomic_xchg_i32_noret:
27 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; VI-NEXT: flat_atomic_swap v[0:1], v2
29 ; VI-NEXT: s_waitcnt vmcnt(0)
30 ; VI-NEXT: buffer_wbinvl1_vol
31 ; VI-NEXT: s_setpc_b64 s[30:31]
33 ; GFX9-LABEL: global_atomic_xchg_i32_noret:
35 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off
37 ; GFX9-NEXT: s_waitcnt vmcnt(0)
38 ; GFX9-NEXT: buffer_wbinvl1_vol
39 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
44 define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
45 ; SI-LABEL: global_atomic_xchg_i32_noret_offset:
47 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; SI-NEXT: s_mov_b32 s6, 0
49 ; SI-NEXT: s_mov_b32 s7, 0xf000
50 ; SI-NEXT: s_mov_b32 s4, s6
51 ; SI-NEXT: s_mov_b32 s5, s6
52 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
53 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
54 ; SI-NEXT: s_waitcnt vmcnt(0)
55 ; SI-NEXT: buffer_wbinvl1
56 ; SI-NEXT: s_waitcnt expcnt(0)
57 ; SI-NEXT: s_setpc_b64 s[30:31]
59 ; VI-LABEL: global_atomic_xchg_i32_noret_offset:
61 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
63 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
64 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
65 ; VI-NEXT: flat_atomic_swap v[0:1], v2
66 ; VI-NEXT: s_waitcnt vmcnt(0)
67 ; VI-NEXT: buffer_wbinvl1_vol
68 ; VI-NEXT: s_setpc_b64 s[30:31]
70 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset:
72 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
74 ; GFX9-NEXT: s_waitcnt vmcnt(0)
75 ; GFX9-NEXT: buffer_wbinvl1_vol
76 ; GFX9-NEXT: s_setpc_b64 s[30:31]
77 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
78 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
82 define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
83 ; SI-LABEL: global_atomic_xchg_i32_ret:
85 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86 ; SI-NEXT: s_mov_b32 s6, 0
87 ; SI-NEXT: s_mov_b32 s7, 0xf000
88 ; SI-NEXT: s_mov_b32 s4, s6
89 ; SI-NEXT: s_mov_b32 s5, s6
90 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
92 ; SI-NEXT: s_waitcnt vmcnt(0)
93 ; SI-NEXT: buffer_wbinvl1
94 ; SI-NEXT: v_mov_b32_e32 v0, v2
95 ; SI-NEXT: s_waitcnt expcnt(0)
96 ; SI-NEXT: s_setpc_b64 s[30:31]
98 ; VI-LABEL: global_atomic_xchg_i32_ret:
100 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
102 ; VI-NEXT: s_waitcnt vmcnt(0)
103 ; VI-NEXT: buffer_wbinvl1_vol
104 ; VI-NEXT: s_setpc_b64 s[30:31]
106 ; GFX9-LABEL: global_atomic_xchg_i32_ret:
108 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
110 ; GFX9-NEXT: s_waitcnt vmcnt(0)
111 ; GFX9-NEXT: buffer_wbinvl1_vol
112 ; GFX9-NEXT: s_setpc_b64 s[30:31]
113 %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
117 define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
118 ; SI-LABEL: global_atomic_xchg_i32_ret_offset:
120 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; SI-NEXT: s_mov_b32 s6, 0
122 ; SI-NEXT: s_mov_b32 s7, 0xf000
123 ; SI-NEXT: s_mov_b32 s4, s6
124 ; SI-NEXT: s_mov_b32 s5, s6
125 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
126 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
127 ; SI-NEXT: s_waitcnt vmcnt(0)
128 ; SI-NEXT: buffer_wbinvl1
129 ; SI-NEXT: v_mov_b32_e32 v0, v2
130 ; SI-NEXT: s_waitcnt expcnt(0)
131 ; SI-NEXT: s_setpc_b64 s[30:31]
133 ; VI-LABEL: global_atomic_xchg_i32_ret_offset:
135 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
137 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
138 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
139 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
140 ; VI-NEXT: s_waitcnt vmcnt(0)
141 ; VI-NEXT: buffer_wbinvl1_vol
142 ; VI-NEXT: s_setpc_b64 s[30:31]
144 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset:
146 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
148 ; GFX9-NEXT: s_waitcnt vmcnt(0)
149 ; GFX9-NEXT: buffer_wbinvl1_vol
150 ; GFX9-NEXT: s_setpc_b64 s[30:31]
151 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
152 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
156 define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
157 ; SI-LABEL: global_atomic_xchg_i32_noret_scalar:
159 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
161 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
162 ; SI-NEXT: s_mov_b64 exec, s[34:35]
163 ; SI-NEXT: s_waitcnt expcnt(0)
164 ; SI-NEXT: v_writelane_b32 v0, s6, 0
165 ; SI-NEXT: v_writelane_b32 v0, s7, 1
166 ; SI-NEXT: s_mov_b32 s34, s6
167 ; SI-NEXT: s_mov_b32 s7, 0xf000
168 ; SI-NEXT: s_mov_b32 s6, -1
169 ; SI-NEXT: v_mov_b32_e32 v1, s34
170 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171 ; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0
172 ; SI-NEXT: s_waitcnt vmcnt(0)
173 ; SI-NEXT: buffer_wbinvl1
174 ; SI-NEXT: v_readlane_b32 s7, v0, 1
175 ; SI-NEXT: v_readlane_b32 s6, v0, 0
176 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
177 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
178 ; SI-NEXT: s_mov_b64 exec, s[34:35]
179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
180 ; SI-NEXT: s_setpc_b64 s[30:31]
182 ; VI-LABEL: global_atomic_xchg_i32_noret_scalar:
184 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185 ; VI-NEXT: v_mov_b32_e32 v0, s4
186 ; VI-NEXT: v_mov_b32_e32 v1, s5
187 ; VI-NEXT: v_mov_b32_e32 v2, s6
188 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
189 ; VI-NEXT: flat_atomic_swap v[0:1], v2
190 ; VI-NEXT: s_waitcnt vmcnt(0)
191 ; VI-NEXT: buffer_wbinvl1_vol
192 ; VI-NEXT: s_setpc_b64 s[30:31]
194 ; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar:
196 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
198 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
199 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
200 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5]
201 ; GFX9-NEXT: s_waitcnt vmcnt(0)
202 ; GFX9-NEXT: buffer_wbinvl1_vol
203 ; GFX9-NEXT: s_setpc_b64 s[30:31]
204 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
208 define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
209 ; SI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
211 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
213 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
214 ; SI-NEXT: s_mov_b64 exec, s[34:35]
215 ; SI-NEXT: s_waitcnt expcnt(0)
216 ; SI-NEXT: v_writelane_b32 v0, s6, 0
217 ; SI-NEXT: v_writelane_b32 v0, s7, 1
218 ; SI-NEXT: s_mov_b32 s34, s6
219 ; SI-NEXT: s_mov_b32 s7, 0xf000
220 ; SI-NEXT: s_mov_b32 s6, -1
221 ; SI-NEXT: v_mov_b32_e32 v1, s34
222 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
223 ; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16
224 ; SI-NEXT: s_waitcnt vmcnt(0)
225 ; SI-NEXT: buffer_wbinvl1
226 ; SI-NEXT: v_readlane_b32 s7, v0, 1
227 ; SI-NEXT: v_readlane_b32 s6, v0, 0
228 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
229 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
230 ; SI-NEXT: s_mov_b64 exec, s[34:35]
231 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
232 ; SI-NEXT: s_setpc_b64 s[30:31]
234 ; VI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
236 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237 ; VI-NEXT: s_add_u32 s34, s4, 16
238 ; VI-NEXT: s_addc_u32 s35, s5, 0
239 ; VI-NEXT: v_mov_b32_e32 v0, s34
240 ; VI-NEXT: v_mov_b32_e32 v1, s35
241 ; VI-NEXT: v_mov_b32_e32 v2, s6
242 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
243 ; VI-NEXT: flat_atomic_swap v[0:1], v2
244 ; VI-NEXT: s_waitcnt vmcnt(0)
245 ; VI-NEXT: buffer_wbinvl1_vol
246 ; VI-NEXT: s_setpc_b64 s[30:31]
248 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
250 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
252 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
253 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
254 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16
255 ; GFX9-NEXT: s_waitcnt vmcnt(0)
256 ; GFX9-NEXT: buffer_wbinvl1_vol
257 ; GFX9-NEXT: s_setpc_b64 s[30:31]
258 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
259 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
263 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
264 ; SI-LABEL: global_atomic_xchg_i32_ret_scalar:
266 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
268 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
269 ; SI-NEXT: s_mov_b64 exec, s[34:35]
270 ; SI-NEXT: s_waitcnt expcnt(0)
271 ; SI-NEXT: v_writelane_b32 v1, s6, 0
272 ; SI-NEXT: v_writelane_b32 v1, s7, 1
273 ; SI-NEXT: s_mov_b32 s34, s6
274 ; SI-NEXT: s_mov_b32 s7, 0xf000
275 ; SI-NEXT: s_mov_b32 s6, -1
276 ; SI-NEXT: v_mov_b32_e32 v0, s34
277 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
278 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
279 ; SI-NEXT: s_waitcnt vmcnt(0)
280 ; SI-NEXT: buffer_wbinvl1
281 ; SI-NEXT: v_readlane_b32 s7, v1, 1
282 ; SI-NEXT: v_readlane_b32 s6, v1, 0
283 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
284 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
285 ; SI-NEXT: s_mov_b64 exec, s[34:35]
286 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
287 ; SI-NEXT: s_setpc_b64 s[30:31]
289 ; VI-LABEL: global_atomic_xchg_i32_ret_scalar:
291 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; VI-NEXT: v_mov_b32_e32 v0, s4
293 ; VI-NEXT: v_mov_b32_e32 v1, s5
294 ; VI-NEXT: v_mov_b32_e32 v2, s6
295 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
296 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
297 ; VI-NEXT: s_waitcnt vmcnt(0)
298 ; VI-NEXT: buffer_wbinvl1_vol
299 ; VI-NEXT: s_setpc_b64 s[30:31]
301 ; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar:
303 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
305 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
306 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
307 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: buffer_wbinvl1_vol
310 ; GFX9-NEXT: s_setpc_b64 s[30:31]
311 %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
315 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
316 ; SI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
318 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
320 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
321 ; SI-NEXT: s_mov_b64 exec, s[34:35]
322 ; SI-NEXT: s_waitcnt expcnt(0)
323 ; SI-NEXT: v_writelane_b32 v1, s6, 0
324 ; SI-NEXT: v_writelane_b32 v1, s7, 1
325 ; SI-NEXT: s_mov_b32 s34, s6
326 ; SI-NEXT: s_mov_b32 s7, 0xf000
327 ; SI-NEXT: s_mov_b32 s6, -1
328 ; SI-NEXT: v_mov_b32_e32 v0, s34
329 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
330 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
331 ; SI-NEXT: s_waitcnt vmcnt(0)
332 ; SI-NEXT: buffer_wbinvl1
333 ; SI-NEXT: v_readlane_b32 s7, v1, 1
334 ; SI-NEXT: v_readlane_b32 s6, v1, 0
335 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
336 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
337 ; SI-NEXT: s_mov_b64 exec, s[34:35]
338 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
339 ; SI-NEXT: s_setpc_b64 s[30:31]
341 ; VI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
343 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; VI-NEXT: s_add_u32 s34, s4, 16
345 ; VI-NEXT: s_addc_u32 s35, s5, 0
346 ; VI-NEXT: v_mov_b32_e32 v0, s34
347 ; VI-NEXT: v_mov_b32_e32 v1, s35
348 ; VI-NEXT: v_mov_b32_e32 v2, s6
349 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
350 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
351 ; VI-NEXT: s_waitcnt vmcnt(0)
352 ; VI-NEXT: buffer_wbinvl1_vol
353 ; VI-NEXT: s_setpc_b64 s[30:31]
355 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
357 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
359 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
360 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
361 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
362 ; GFX9-NEXT: s_waitcnt vmcnt(0)
363 ; GFX9-NEXT: buffer_wbinvl1_vol
364 ; GFX9-NEXT: s_setpc_b64 s[30:31]
365 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
366 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
370 ; ---------------------------------------------------------------------
372 ; ---------------------------------------------------------------------
374 define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
375 ; GCN1-LABEL: global_atomic_xchg_f32_noret:
377 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378 ; GCN1-NEXT: global_load_dword v3, v[0:1]
379 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
380 ; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start
381 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
382 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
383 ; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
384 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
385 ; GCN1-NEXT: buffer_wbinvl1_vol
386 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
387 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
388 ; GCN1-NEXT: v_mov_b32_e32 v3, v4
389 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
390 ; GCN1-NEXT: s_cbranch_execnz .LBB0_1
391 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
392 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
393 ; GCN1-NEXT: s_setpc_b64 s[30:31]
395 ; GCN2-LABEL: global_atomic_xchg_f32_noret:
397 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
398 ; GCN2-NEXT: global_load_dword v3, v[0:1]
399 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
400 ; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start
401 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
402 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
403 ; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
404 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
405 ; GCN2-NEXT: buffer_wbinvl1_vol
406 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
407 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
408 ; GCN2-NEXT: v_mov_b32_e32 v3, v4
409 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
410 ; GCN2-NEXT: s_cbranch_execnz .LBB0_1
411 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
412 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
413 ; GCN2-NEXT: s_setpc_b64 s[30:31]
415 ; GCN3-LABEL: global_atomic_xchg_f32_noret:
417 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418 ; GCN3-NEXT: global_load_dword v3, v[0:1]
419 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
420 ; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start
421 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
422 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
423 ; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
424 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
425 ; GCN3-NEXT: buffer_wbinvl1_vol
426 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
427 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
428 ; GCN3-NEXT: v_mov_b32_e32 v3, v4
429 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
430 ; GCN3-NEXT: s_cbranch_execnz .LBB0_1
431 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
432 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
433 ; GCN3-NEXT: s_setpc_b64 s[30:31]
434 ; SI-LABEL: global_atomic_xchg_f32_noret:
436 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437 ; SI-NEXT: s_mov_b32 s6, 0
438 ; SI-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NEXT: s_mov_b32 s4, s6
440 ; SI-NEXT: s_mov_b32 s5, s6
441 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
442 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64
443 ; SI-NEXT: s_waitcnt vmcnt(0)
444 ; SI-NEXT: buffer_wbinvl1
445 ; SI-NEXT: s_waitcnt expcnt(0)
446 ; SI-NEXT: s_setpc_b64 s[30:31]
448 ; VI-LABEL: global_atomic_xchg_f32_noret:
450 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451 ; VI-NEXT: flat_atomic_swap v[0:1], v2
452 ; VI-NEXT: s_waitcnt vmcnt(0)
453 ; VI-NEXT: buffer_wbinvl1_vol
454 ; VI-NEXT: s_setpc_b64 s[30:31]
456 ; GFX9-LABEL: global_atomic_xchg_f32_noret:
458 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
459 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off
460 ; GFX9-NEXT: s_waitcnt vmcnt(0)
461 ; GFX9-NEXT: buffer_wbinvl1_vol
462 ; GFX9-NEXT: s_setpc_b64 s[30:31]
463 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
467 define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
468 ; GCN1-LABEL: global_atomic_xchg_f32_noret_offset:
470 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
471 ; GCN1-NEXT: v_add_f32_e32 v0, vcc, 16, v0
472 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
473 ; GCN1-NEXT: global_load_dword v3, v[0:1]
474 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
475 ; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start
476 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
477 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
478 ; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
479 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
480 ; GCN1-NEXT: buffer_wbinvl1_vol
481 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
482 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
483 ; GCN1-NEXT: v_mov_b32_e32 v3, v4
484 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
485 ; GCN1-NEXT: s_cbranch_execnz .LBB1_1
486 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
487 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
488 ; GCN1-NEXT: s_setpc_b64 s[30:31]
490 ; GCN2-LABEL: global_atomic_xchg_f32_noret_offset:
492 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
494 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
495 ; GCN2-NEXT: global_load_dword v3, v[0:1]
496 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
497 ; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start
498 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
499 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
500 ; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
501 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
502 ; GCN2-NEXT: buffer_wbinvl1_vol
503 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
504 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
505 ; GCN2-NEXT: v_mov_b32_e32 v3, v4
506 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
507 ; GCN2-NEXT: s_cbranch_execnz .LBB1_1
508 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
509 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
510 ; GCN2-NEXT: s_setpc_b64 s[30:31]
512 ; GCN3-LABEL: global_atomic_xchg_f32_noret_offset:
514 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515 ; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16
516 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
517 ; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start
518 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
519 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
520 ; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
521 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
522 ; GCN3-NEXT: buffer_wbinvl1_vol
523 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
524 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
525 ; GCN3-NEXT: v_mov_b32_e32 v3, v4
526 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
527 ; GCN3-NEXT: s_cbranch_execnz .LBB1_1
528 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
529 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
530 ; GCN3-NEXT: s_setpc_b64 s[30:31]
531 ; SI-LABEL: global_atomic_xchg_f32_noret_offset:
533 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; SI-NEXT: s_mov_b32 s6, 0
535 ; SI-NEXT: s_mov_b32 s7, 0xf000
536 ; SI-NEXT: s_mov_b32 s4, s6
537 ; SI-NEXT: s_mov_b32 s5, s6
538 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
539 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16
540 ; SI-NEXT: s_waitcnt vmcnt(0)
541 ; SI-NEXT: buffer_wbinvl1
542 ; SI-NEXT: s_waitcnt expcnt(0)
543 ; SI-NEXT: s_setpc_b64 s[30:31]
545 ; VI-LABEL: global_atomic_xchg_f32_noret_offset:
547 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
549 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
551 ; VI-NEXT: flat_atomic_swap v[0:1], v2
552 ; VI-NEXT: s_waitcnt vmcnt(0)
553 ; VI-NEXT: buffer_wbinvl1_vol
554 ; VI-NEXT: s_setpc_b64 s[30:31]
556 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset:
558 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559 ; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16
560 ; GFX9-NEXT: s_waitcnt vmcnt(0)
561 ; GFX9-NEXT: buffer_wbinvl1_vol
562 ; GFX9-NEXT: s_setpc_b64 s[30:31]
563 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
564 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
568 define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
569 ; GCN1-LABEL: global_atomic_xchg_f32_ret:
571 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
572 ; GCN1-NEXT: global_load_dword v4, v[0:1]
573 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
574 ; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start
575 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
576 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
577 ; GCN1-NEXT: v_mov_b32_e32 v3, v4
578 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
579 ; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
580 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
581 ; GCN1-NEXT: buffer_wbinvl1_vol
582 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
583 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
584 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
585 ; GCN1-NEXT: s_cbranch_execnz .LBB2_1
586 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
587 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
588 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
589 ; GCN1-NEXT: s_setpc_b64 s[30:31]
591 ; GCN2-LABEL: global_atomic_xchg_f32_ret:
593 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594 ; GCN2-NEXT: global_load_dword v4, v[0:1]
595 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
596 ; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start
597 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
598 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
599 ; GCN2-NEXT: v_mov_b32_e32 v3, v4
600 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
601 ; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
602 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
603 ; GCN2-NEXT: buffer_wbinvl1_vol
604 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
605 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
606 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
607 ; GCN2-NEXT: s_cbranch_execnz .LBB2_1
608 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
609 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
610 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
611 ; GCN2-NEXT: s_setpc_b64 s[30:31]
613 ; GCN3-LABEL: global_atomic_xchg_f32_ret:
615 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616 ; GCN3-NEXT: global_load_dword v4, v[0:1]
617 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
618 ; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start
619 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
620 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
621 ; GCN3-NEXT: v_mov_b32_e32 v3, v4
622 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
623 ; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
624 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
625 ; GCN3-NEXT: buffer_wbinvl1_vol
626 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
627 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
628 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
629 ; GCN3-NEXT: s_cbranch_execnz .LBB2_1
630 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
631 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
632 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
633 ; GCN3-NEXT: s_setpc_b64 s[30:31]
634 ; SI-LABEL: global_atomic_xchg_f32_ret:
636 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
637 ; SI-NEXT: s_mov_b32 s6, 0
638 ; SI-NEXT: s_mov_b32 s7, 0xf000
639 ; SI-NEXT: s_mov_b32 s4, s6
640 ; SI-NEXT: s_mov_b32 s5, s6
641 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
642 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
643 ; SI-NEXT: s_waitcnt vmcnt(0)
644 ; SI-NEXT: buffer_wbinvl1
645 ; SI-NEXT: v_mov_b32_e32 v0, v2
646 ; SI-NEXT: s_waitcnt expcnt(0)
647 ; SI-NEXT: s_setpc_b64 s[30:31]
649 ; VI-LABEL: global_atomic_xchg_f32_ret:
651 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
653 ; VI-NEXT: s_waitcnt vmcnt(0)
654 ; VI-NEXT: buffer_wbinvl1_vol
655 ; VI-NEXT: s_setpc_b64 s[30:31]
657 ; GFX9-LABEL: global_atomic_xchg_f32_ret:
659 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
660 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
661 ; GFX9-NEXT: s_waitcnt vmcnt(0)
662 ; GFX9-NEXT: buffer_wbinvl1_vol
663 ; GFX9-NEXT: s_setpc_b64 s[30:31]
664 %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
668 define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
669 ; GCN1-LABEL: global_atomic_xchg_f32_ret_offset:
671 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672 ; GCN1-NEXT: v_add_f32_e32 v4, vcc, 16, v0
673 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
674 ; GCN1-NEXT: global_load_dword v0, v[4:5]
675 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
676 ; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start
677 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
678 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
679 ; GCN1-NEXT: v_mov_b32_e32 v3, v0
680 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
681 ; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
682 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
683 ; GCN1-NEXT: buffer_wbinvl1_vol
684 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
685 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
686 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
687 ; GCN1-NEXT: s_cbranch_execnz .LBB3_1
688 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
689 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
690 ; GCN1-NEXT: s_setpc_b64 s[30:31]
692 ; GCN2-LABEL: global_atomic_xchg_f32_ret_offset:
694 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
695 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0
696 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
697 ; GCN2-NEXT: global_load_dword v0, v[4:5]
698 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
699 ; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start
700 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
701 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702 ; GCN2-NEXT: v_mov_b32_e32 v3, v0
703 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
704 ; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
705 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
706 ; GCN2-NEXT: buffer_wbinvl1_vol
707 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
708 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
709 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
710 ; GCN2-NEXT: s_cbranch_execnz .LBB3_1
711 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
712 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
713 ; GCN2-NEXT: s_setpc_b64 s[30:31]
715 ; GCN3-LABEL: global_atomic_xchg_f32_ret_offset:
717 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718 ; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16
719 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
720 ; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start
721 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
722 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
723 ; GCN3-NEXT: v_mov_b32_e32 v3, v4
724 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
725 ; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
726 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
727 ; GCN3-NEXT: buffer_wbinvl1_vol
728 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
729 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
730 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
731 ; GCN3-NEXT: s_cbranch_execnz .LBB3_1
732 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
733 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
734 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
735 ; GCN3-NEXT: s_setpc_b64 s[30:31]
736 ; SI-LABEL: global_atomic_xchg_f32_ret_offset:
738 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
739 ; SI-NEXT: s_mov_b32 s6, 0
740 ; SI-NEXT: s_mov_b32 s7, 0xf000
741 ; SI-NEXT: s_mov_b32 s4, s6
742 ; SI-NEXT: s_mov_b32 s5, s6
743 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
744 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
745 ; SI-NEXT: s_waitcnt vmcnt(0)
746 ; SI-NEXT: buffer_wbinvl1
747 ; SI-NEXT: v_mov_b32_e32 v0, v2
748 ; SI-NEXT: s_waitcnt expcnt(0)
749 ; SI-NEXT: s_setpc_b64 s[30:31]
751 ; VI-LABEL: global_atomic_xchg_f32_ret_offset:
753 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
755 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
756 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
757 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
758 ; VI-NEXT: s_waitcnt vmcnt(0)
759 ; VI-NEXT: buffer_wbinvl1_vol
760 ; VI-NEXT: s_setpc_b64 s[30:31]
762 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset:
764 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765 ; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc
766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
767 ; GFX9-NEXT: buffer_wbinvl1_vol
768 ; GFX9-NEXT: s_setpc_b64 s[30:31]
769 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
770 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
774 define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
775 ; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar:
777 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
779 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
780 ; GCN1-NEXT: global_load_dword v1, v[0:1]
781 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
782 ; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start
783 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
784 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
785 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
786 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
787 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
788 ; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
789 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
790 ; GCN1-NEXT: buffer_wbinvl1_vol
791 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
792 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
793 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
794 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
795 ; GCN1-NEXT: s_cbranch_execnz .LBB4_1
796 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
797 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
798 ; GCN1-NEXT: s_setpc_b64 s[30:31]
800 ; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar:
802 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
804 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
805 ; GCN2-NEXT: global_load_dword v1, v[0:1]
806 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
807 ; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start
808 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
809 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
810 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
811 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
812 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
813 ; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
814 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
815 ; GCN2-NEXT: buffer_wbinvl1_vol
816 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
817 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
818 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
819 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
820 ; GCN2-NEXT: s_cbranch_execnz .LBB4_1
821 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
822 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
823 ; GCN2-NEXT: s_setpc_b64 s[30:31]
825 ; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar:
827 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
828 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
829 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
830 ; GCN3-NEXT: global_load_dword v1, v[0:1]
831 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
832 ; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start
833 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
834 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
835 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
836 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
837 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
838 ; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
839 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
840 ; GCN3-NEXT: buffer_wbinvl1_vol
841 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
842 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
843 ; GCN3-NEXT: v_mov_b32_e32 v1, v0
844 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
845 ; GCN3-NEXT: s_cbranch_execnz .LBB4_1
846 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
847 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
848 ; GCN3-NEXT: s_setpc_b64 s[30:31]
849 ; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
851 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
852 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
853 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
854 ; SI-NEXT: s_mov_b64 exec, s[34:35]
855 ; SI-NEXT: s_waitcnt expcnt(0)
856 ; SI-NEXT: v_writelane_b32 v0, s6, 0
857 ; SI-NEXT: v_writelane_b32 v0, s7, 1
858 ; SI-NEXT: s_mov_b32 s34, s6
859 ; SI-NEXT: s_mov_b32 s7, 0xf000
860 ; SI-NEXT: s_mov_b32 s6, -1
861 ; SI-NEXT: v_mov_b32_e32 v1, s34
862 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
863 ; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0
864 ; SI-NEXT: s_waitcnt vmcnt(0)
865 ; SI-NEXT: buffer_wbinvl1
866 ; SI-NEXT: v_readlane_b32 s7, v0, 1
867 ; SI-NEXT: v_readlane_b32 s6, v0, 0
868 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
869 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
870 ; SI-NEXT: s_mov_b64 exec, s[34:35]
871 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
872 ; SI-NEXT: s_setpc_b64 s[30:31]
874 ; VI-LABEL: global_atomic_xchg_f32_noret_scalar:
876 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877 ; VI-NEXT: v_mov_b32_e32 v0, s4
878 ; VI-NEXT: v_mov_b32_e32 v1, s5
879 ; VI-NEXT: v_mov_b32_e32 v2, s6
880 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
881 ; VI-NEXT: flat_atomic_swap v[0:1], v2
882 ; VI-NEXT: s_waitcnt vmcnt(0)
883 ; VI-NEXT: buffer_wbinvl1_vol
884 ; VI-NEXT: s_setpc_b64 s[30:31]
886 ; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar:
888 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
890 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
891 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
892 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5]
893 ; GFX9-NEXT: s_waitcnt vmcnt(0)
894 ; GFX9-NEXT: buffer_wbinvl1_vol
895 ; GFX9-NEXT: s_setpc_b64 s[30:31]
896 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
900 define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
901 ; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
903 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
904 ; GCN1-NEXT: s_add_u32 s34, s4, 16
905 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
906 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
907 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
908 ; GCN1-NEXT: global_load_dword v1, v[0:1]
909 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
910 ; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start
911 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
912 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
913 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
914 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
915 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
916 ; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
917 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
918 ; GCN1-NEXT: buffer_wbinvl1_vol
919 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
920 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
921 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
922 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
923 ; GCN1-NEXT: s_cbranch_execnz .LBB5_1
924 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
925 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
926 ; GCN1-NEXT: s_setpc_b64 s[30:31]
928 ; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
930 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
931 ; GCN2-NEXT: s_add_u32 s34, s4, 16
932 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
933 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
934 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
935 ; GCN2-NEXT: global_load_dword v1, v[0:1]
936 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
937 ; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start
938 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
939 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
940 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
941 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
942 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
943 ; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
944 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
945 ; GCN2-NEXT: buffer_wbinvl1_vol
946 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
947 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
948 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
949 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
950 ; GCN2-NEXT: s_cbranch_execnz .LBB5_1
951 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
952 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
953 ; GCN2-NEXT: s_setpc_b64 s[30:31]
955 ; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
957 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
958 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
959 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
960 ; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16
961 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
962 ; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start
963 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
964 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
965 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
966 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
967 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
968 ; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
969 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
970 ; GCN3-NEXT: buffer_wbinvl1_vol
971 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
972 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
973 ; GCN3-NEXT: v_mov_b32_e32 v1, v0
974 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
975 ; GCN3-NEXT: s_cbranch_execnz .LBB5_1
976 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
977 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
978 ; GCN3-NEXT: s_setpc_b64 s[30:31]
979 ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
981 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
982 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
983 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
984 ; SI-NEXT: s_mov_b64 exec, s[34:35]
985 ; SI-NEXT: s_waitcnt expcnt(0)
986 ; SI-NEXT: v_writelane_b32 v0, s6, 0
987 ; SI-NEXT: v_writelane_b32 v0, s7, 1
988 ; SI-NEXT: s_mov_b32 s34, s6
989 ; SI-NEXT: s_mov_b32 s7, 0xf000
990 ; SI-NEXT: s_mov_b32 s6, -1
991 ; SI-NEXT: v_mov_b32_e32 v1, s34
992 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
993 ; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16
994 ; SI-NEXT: s_waitcnt vmcnt(0)
995 ; SI-NEXT: buffer_wbinvl1
996 ; SI-NEXT: v_readlane_b32 s7, v0, 1
997 ; SI-NEXT: v_readlane_b32 s6, v0, 0
998 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
999 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1000 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1001 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1002 ; SI-NEXT: s_setpc_b64 s[30:31]
1004 ; VI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
1006 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1007 ; VI-NEXT: s_add_u32 s34, s4, 16
1008 ; VI-NEXT: s_addc_u32 s35, s5, 0
1009 ; VI-NEXT: v_mov_b32_e32 v0, s34
1010 ; VI-NEXT: v_mov_b32_e32 v1, s35
1011 ; VI-NEXT: v_mov_b32_e32 v2, s6
1012 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1013 ; VI-NEXT: flat_atomic_swap v[0:1], v2
1014 ; VI-NEXT: s_waitcnt vmcnt(0)
1015 ; VI-NEXT: buffer_wbinvl1_vol
1016 ; VI-NEXT: s_setpc_b64 s[30:31]
1018 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
1020 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1021 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1022 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1023 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1024 ; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16
1025 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1026 ; GFX9-NEXT: buffer_wbinvl1_vol
1027 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1028 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
1029 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
1033 define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
1034 ; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar:
1036 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1038 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1039 ; GCN1-NEXT: global_load_dword v0, v[0:1]
1040 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
1041 ; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start
1042 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1043 ; GCN1-NEXT: v_mov_b32_e32 v3, s4
1044 ; GCN1-NEXT: v_mov_b32_e32 v1, s6
1045 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1046 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
1047 ; GCN1-NEXT: v_mov_b32_e32 v4, s5
1048 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1049 ; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
1050 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1051 ; GCN1-NEXT: buffer_wbinvl1_vol
1052 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1053 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
1054 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
1055 ; GCN1-NEXT: s_cbranch_execnz .LBB6_1
1056 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1057 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
1058 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1060 ; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar:
1062 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1063 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1064 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1065 ; GCN2-NEXT: global_load_dword v0, v[0:1]
1066 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
1067 ; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start
1068 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1069 ; GCN2-NEXT: v_mov_b32_e32 v3, s4
1070 ; GCN2-NEXT: v_mov_b32_e32 v1, s6
1071 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1072 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
1073 ; GCN2-NEXT: v_mov_b32_e32 v4, s5
1074 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1075 ; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
1076 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1077 ; GCN2-NEXT: buffer_wbinvl1_vol
1078 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1079 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
1080 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
1081 ; GCN2-NEXT: s_cbranch_execnz .LBB6_1
1082 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1083 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
1084 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1086 ; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar:
1088 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1089 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1090 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1091 ; GCN3-NEXT: global_load_dword v0, v[0:1]
1092 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
1093 ; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start
1094 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1095 ; GCN3-NEXT: v_mov_b32_e32 v3, s4
1096 ; GCN3-NEXT: v_mov_b32_e32 v1, s6
1097 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1098 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
1099 ; GCN3-NEXT: v_mov_b32_e32 v4, s5
1100 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1101 ; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
1102 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1103 ; GCN3-NEXT: buffer_wbinvl1_vol
1104 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1105 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
1106 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
1107 ; GCN3-NEXT: s_cbranch_execnz .LBB6_1
1108 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1109 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
1110 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1111 ; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
1113 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1114 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1115 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1116 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1117 ; SI-NEXT: s_waitcnt expcnt(0)
1118 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1119 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1120 ; SI-NEXT: s_mov_b32 s34, s6
1121 ; SI-NEXT: s_mov_b32 s7, 0xf000
1122 ; SI-NEXT: s_mov_b32 s6, -1
1123 ; SI-NEXT: v_mov_b32_e32 v0, s34
1124 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1125 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
1126 ; SI-NEXT: s_waitcnt vmcnt(0)
1127 ; SI-NEXT: buffer_wbinvl1
1128 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1129 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1130 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1131 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1132 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1133 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1134 ; SI-NEXT: s_setpc_b64 s[30:31]
1136 ; VI-LABEL: global_atomic_xchg_f32_ret_scalar:
1138 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1139 ; VI-NEXT: v_mov_b32_e32 v0, s4
1140 ; VI-NEXT: v_mov_b32_e32 v1, s5
1141 ; VI-NEXT: v_mov_b32_e32 v2, s6
1142 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1143 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
1144 ; VI-NEXT: s_waitcnt vmcnt(0)
1145 ; VI-NEXT: buffer_wbinvl1_vol
1146 ; VI-NEXT: s_setpc_b64 s[30:31]
1148 ; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar:
1150 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1152 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1153 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1154 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc
1155 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1156 ; GFX9-NEXT: buffer_wbinvl1_vol
1157 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1158 %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
1162 define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
1163 ; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1165 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166 ; GCN1-NEXT: s_add_u32 s34, s4, 16
1167 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1168 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1169 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1170 ; GCN1-NEXT: global_load_dword v0, v[0:1]
1171 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
1172 ; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start
1173 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1174 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
1175 ; GCN1-NEXT: v_mov_b32_e32 v1, s6
1176 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1177 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
1178 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
1179 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1180 ; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
1181 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1182 ; GCN1-NEXT: buffer_wbinvl1_vol
1183 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1184 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
1185 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
1186 ; GCN1-NEXT: s_cbranch_execnz .LBB7_1
1187 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1188 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
1189 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1191 ; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1193 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1194 ; GCN2-NEXT: s_add_u32 s34, s4, 16
1195 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1196 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1197 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1198 ; GCN2-NEXT: global_load_dword v0, v[0:1]
1199 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
1200 ; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start
1201 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1202 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
1203 ; GCN2-NEXT: v_mov_b32_e32 v1, s6
1204 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1205 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
1206 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
1207 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1208 ; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
1209 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1210 ; GCN2-NEXT: buffer_wbinvl1_vol
1211 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1212 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
1213 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
1214 ; GCN2-NEXT: s_cbranch_execnz .LBB7_1
1215 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1216 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
1217 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1219 ; GCN3-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1221 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1222 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1223 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1224 ; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16
1225 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
1226 ; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start
1227 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1228 ; GCN3-NEXT: v_mov_b32_e32 v3, s4
1229 ; GCN3-NEXT: v_mov_b32_e32 v1, s6
1230 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1231 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
1232 ; GCN3-NEXT: v_mov_b32_e32 v4, s5
1233 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1234 ; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc
1235 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1236 ; GCN3-NEXT: buffer_wbinvl1_vol
1237 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
1238 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
1239 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
1240 ; GCN3-NEXT: s_cbranch_execnz .LBB7_1
1241 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1242 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
1243 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1244 ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1246 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1247 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1248 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1249 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1250 ; SI-NEXT: s_waitcnt expcnt(0)
1251 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1252 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1253 ; SI-NEXT: s_mov_b32 s34, s6
1254 ; SI-NEXT: s_mov_b32 s7, 0xf000
1255 ; SI-NEXT: s_mov_b32 s6, -1
1256 ; SI-NEXT: v_mov_b32_e32 v0, s34
1257 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1258 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
1259 ; SI-NEXT: s_waitcnt vmcnt(0)
1260 ; SI-NEXT: buffer_wbinvl1
1261 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1262 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1263 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1265 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1266 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1267 ; SI-NEXT: s_setpc_b64 s[30:31]
1269 ; VI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1271 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1272 ; VI-NEXT: s_add_u32 s34, s4, 16
1273 ; VI-NEXT: s_addc_u32 s35, s5, 0
1274 ; VI-NEXT: v_mov_b32_e32 v0, s34
1275 ; VI-NEXT: v_mov_b32_e32 v1, s35
1276 ; VI-NEXT: v_mov_b32_e32 v2, s6
1277 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1278 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
1279 ; VI-NEXT: s_waitcnt vmcnt(0)
1280 ; VI-NEXT: buffer_wbinvl1_vol
1281 ; VI-NEXT: s_setpc_b64 s[30:31]
1283 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
1285 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1286 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1287 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1288 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1289 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
1290 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1291 ; GFX9-NEXT: buffer_wbinvl1_vol
1292 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1293 %gep = getelementptr float, ptr addrspace(1) %out, i32 4
1294 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
1298 ; ---------------------------------------------------------------------
1300 ; ---------------------------------------------------------------------
1302 define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1303 ; SI-LABEL: global_atomic_add_i32_noret:
1305 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306 ; SI-NEXT: s_mov_b32 s6, 0
1307 ; SI-NEXT: s_mov_b32 s7, 0xf000
1308 ; SI-NEXT: s_mov_b32 s4, s6
1309 ; SI-NEXT: s_mov_b32 s5, s6
1310 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1311 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
1312 ; SI-NEXT: s_waitcnt vmcnt(0)
1313 ; SI-NEXT: buffer_wbinvl1
1314 ; SI-NEXT: s_waitcnt expcnt(0)
1315 ; SI-NEXT: s_setpc_b64 s[30:31]
1317 ; VI-LABEL: global_atomic_add_i32_noret:
1319 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1320 ; VI-NEXT: flat_atomic_add v[0:1], v2
1321 ; VI-NEXT: s_waitcnt vmcnt(0)
1322 ; VI-NEXT: buffer_wbinvl1_vol
1323 ; VI-NEXT: s_setpc_b64 s[30:31]
1325 ; GFX9-LABEL: global_atomic_add_i32_noret:
1327 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328 ; GFX9-NEXT: global_atomic_add v[0:1], v2, off
1329 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1330 ; GFX9-NEXT: buffer_wbinvl1_vol
1331 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1332 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1336 define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1337 ; SI-LABEL: global_atomic_add_i32_noret_offset:
1339 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1340 ; SI-NEXT: s_mov_b32 s6, 0
1341 ; SI-NEXT: s_mov_b32 s7, 0xf000
1342 ; SI-NEXT: s_mov_b32 s4, s6
1343 ; SI-NEXT: s_mov_b32 s5, s6
1344 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1345 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16
1346 ; SI-NEXT: s_waitcnt vmcnt(0)
1347 ; SI-NEXT: buffer_wbinvl1
1348 ; SI-NEXT: s_waitcnt expcnt(0)
1349 ; SI-NEXT: s_setpc_b64 s[30:31]
1351 ; VI-LABEL: global_atomic_add_i32_noret_offset:
1353 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1354 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1355 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1356 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1357 ; VI-NEXT: flat_atomic_add v[0:1], v2
1358 ; VI-NEXT: s_waitcnt vmcnt(0)
1359 ; VI-NEXT: buffer_wbinvl1_vol
1360 ; VI-NEXT: s_setpc_b64 s[30:31]
1362 ; GFX9-LABEL: global_atomic_add_i32_noret_offset:
1364 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365 ; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16
1366 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX9-NEXT: buffer_wbinvl1_vol
1368 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1369 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1370 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1374 define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1375 ; SI-LABEL: global_atomic_add_i32_ret:
1377 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1378 ; SI-NEXT: s_mov_b32 s6, 0
1379 ; SI-NEXT: s_mov_b32 s7, 0xf000
1380 ; SI-NEXT: s_mov_b32 s4, s6
1381 ; SI-NEXT: s_mov_b32 s5, s6
1382 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1383 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
1384 ; SI-NEXT: s_waitcnt vmcnt(0)
1385 ; SI-NEXT: buffer_wbinvl1
1386 ; SI-NEXT: v_mov_b32_e32 v0, v2
1387 ; SI-NEXT: s_waitcnt expcnt(0)
1388 ; SI-NEXT: s_setpc_b64 s[30:31]
1390 ; VI-LABEL: global_atomic_add_i32_ret:
1392 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1394 ; VI-NEXT: s_waitcnt vmcnt(0)
1395 ; VI-NEXT: buffer_wbinvl1_vol
1396 ; VI-NEXT: s_setpc_b64 s[30:31]
1398 ; GFX9-LABEL: global_atomic_add_i32_ret:
1400 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1401 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
1402 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1403 ; GFX9-NEXT: buffer_wbinvl1_vol
1404 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1405 %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1409 define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1410 ; SI-LABEL: global_atomic_add_i32_ret_offset:
1412 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1413 ; SI-NEXT: s_mov_b32 s6, 0
1414 ; SI-NEXT: s_mov_b32 s7, 0xf000
1415 ; SI-NEXT: s_mov_b32 s4, s6
1416 ; SI-NEXT: s_mov_b32 s5, s6
1417 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1418 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1419 ; SI-NEXT: s_waitcnt vmcnt(0)
1420 ; SI-NEXT: buffer_wbinvl1
1421 ; SI-NEXT: v_mov_b32_e32 v0, v2
1422 ; SI-NEXT: s_waitcnt expcnt(0)
1423 ; SI-NEXT: s_setpc_b64 s[30:31]
1425 ; VI-LABEL: global_atomic_add_i32_ret_offset:
1427 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1428 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1429 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1430 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1431 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1432 ; VI-NEXT: s_waitcnt vmcnt(0)
1433 ; VI-NEXT: buffer_wbinvl1_vol
1434 ; VI-NEXT: s_setpc_b64 s[30:31]
1436 ; GFX9-LABEL: global_atomic_add_i32_ret_offset:
1438 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1439 ; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc
1440 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1441 ; GFX9-NEXT: buffer_wbinvl1_vol
1442 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1443 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1444 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1448 define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1449 ; SI-LABEL: global_atomic_add_i32_noret_scalar:
1451 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1453 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1454 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1455 ; SI-NEXT: s_waitcnt expcnt(0)
1456 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1457 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1458 ; SI-NEXT: s_mov_b32 s34, s6
1459 ; SI-NEXT: s_mov_b32 s7, 0xf000
1460 ; SI-NEXT: s_mov_b32 s6, -1
1461 ; SI-NEXT: v_mov_b32_e32 v1, s34
1462 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1463 ; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0
1464 ; SI-NEXT: s_waitcnt vmcnt(0)
1465 ; SI-NEXT: buffer_wbinvl1
1466 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1467 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1468 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1469 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1470 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1471 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1472 ; SI-NEXT: s_setpc_b64 s[30:31]
1474 ; VI-LABEL: global_atomic_add_i32_noret_scalar:
1476 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1477 ; VI-NEXT: v_mov_b32_e32 v0, s4
1478 ; VI-NEXT: v_mov_b32_e32 v1, s5
1479 ; VI-NEXT: v_mov_b32_e32 v2, s6
1480 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1481 ; VI-NEXT: flat_atomic_add v[0:1], v2
1482 ; VI-NEXT: s_waitcnt vmcnt(0)
1483 ; VI-NEXT: buffer_wbinvl1_vol
1484 ; VI-NEXT: s_setpc_b64 s[30:31]
1486 ; GFX9-LABEL: global_atomic_add_i32_noret_scalar:
1488 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1489 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1490 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1491 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1492 ; GFX9-NEXT: global_atomic_add v0, v1, s[4:5]
1493 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1494 ; GFX9-NEXT: buffer_wbinvl1_vol
1495 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1496 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1500 define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1501 ; SI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1503 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1505 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1506 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1507 ; SI-NEXT: s_waitcnt expcnt(0)
1508 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1509 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1510 ; SI-NEXT: s_mov_b32 s34, s6
1511 ; SI-NEXT: s_mov_b32 s7, 0xf000
1512 ; SI-NEXT: s_mov_b32 s6, -1
1513 ; SI-NEXT: v_mov_b32_e32 v1, s34
1514 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1515 ; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 offset:16
1516 ; SI-NEXT: s_waitcnt vmcnt(0)
1517 ; SI-NEXT: buffer_wbinvl1
1518 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1519 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1520 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1521 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1522 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1523 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1524 ; SI-NEXT: s_setpc_b64 s[30:31]
1526 ; VI-LABEL: global_atomic_add_i32_noret_offset_scalar:
1528 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1529 ; VI-NEXT: s_add_u32 s34, s4, 16
1530 ; VI-NEXT: s_addc_u32 s35, s5, 0
1531 ; VI-NEXT: v_mov_b32_e32 v0, s34
1532 ; VI-NEXT: v_mov_b32_e32 v1, s35
1533 ; VI-NEXT: v_mov_b32_e32 v2, s6
1534 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1535 ; VI-NEXT: flat_atomic_add v[0:1], v2
1536 ; VI-NEXT: s_waitcnt vmcnt(0)
1537 ; VI-NEXT: buffer_wbinvl1_vol
1538 ; VI-NEXT: s_setpc_b64 s[30:31]
1540 ; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar:
1542 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1543 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1544 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1545 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1546 ; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] offset:16
1547 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1548 ; GFX9-NEXT: buffer_wbinvl1_vol
1549 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1550 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1551 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1555 define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1556 ; SI-LABEL: global_atomic_add_i32_ret_scalar:
1558 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1559 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1560 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1561 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1562 ; SI-NEXT: s_waitcnt expcnt(0)
1563 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1564 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1565 ; SI-NEXT: s_mov_b32 s34, s6
1566 ; SI-NEXT: s_mov_b32 s7, 0xf000
1567 ; SI-NEXT: s_mov_b32 s6, -1
1568 ; SI-NEXT: v_mov_b32_e32 v0, s34
1569 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1570 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
1571 ; SI-NEXT: s_waitcnt vmcnt(0)
1572 ; SI-NEXT: buffer_wbinvl1
1573 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1574 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1575 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1576 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1577 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1578 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1579 ; SI-NEXT: s_setpc_b64 s[30:31]
1581 ; VI-LABEL: global_atomic_add_i32_ret_scalar:
1583 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1584 ; VI-NEXT: v_mov_b32_e32 v0, s4
1585 ; VI-NEXT: v_mov_b32_e32 v1, s5
1586 ; VI-NEXT: v_mov_b32_e32 v2, s6
1587 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1588 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1589 ; VI-NEXT: s_waitcnt vmcnt(0)
1590 ; VI-NEXT: buffer_wbinvl1_vol
1591 ; VI-NEXT: s_setpc_b64 s[30:31]
1593 ; GFX9-LABEL: global_atomic_add_i32_ret_scalar:
1595 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1596 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1597 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1598 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1599 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] glc
1600 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1601 ; GFX9-NEXT: buffer_wbinvl1_vol
1602 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1603 %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
1607 define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1608 ; SI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1610 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1611 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1612 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1613 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1614 ; SI-NEXT: s_waitcnt expcnt(0)
1615 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1616 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1617 ; SI-NEXT: s_mov_b32 s34, s6
1618 ; SI-NEXT: s_mov_b32 s7, 0xf000
1619 ; SI-NEXT: s_mov_b32 s6, -1
1620 ; SI-NEXT: v_mov_b32_e32 v0, s34
1621 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1622 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc
1623 ; SI-NEXT: s_waitcnt vmcnt(0)
1624 ; SI-NEXT: buffer_wbinvl1
1625 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1626 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1627 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1628 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1629 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1630 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1631 ; SI-NEXT: s_setpc_b64 s[30:31]
1633 ; VI-LABEL: global_atomic_add_i32_ret_offset_scalar:
1635 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1636 ; VI-NEXT: s_add_u32 s34, s4, 16
1637 ; VI-NEXT: s_addc_u32 s35, s5, 0
1638 ; VI-NEXT: v_mov_b32_e32 v0, s34
1639 ; VI-NEXT: v_mov_b32_e32 v1, s35
1640 ; VI-NEXT: v_mov_b32_e32 v2, s6
1641 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1642 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1643 ; VI-NEXT: s_waitcnt vmcnt(0)
1644 ; VI-NEXT: buffer_wbinvl1_vol
1645 ; VI-NEXT: s_setpc_b64 s[30:31]
1647 ; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar:
1649 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1650 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1651 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1652 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1653 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] offset:16 glc
1654 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1655 ; GFX9-NEXT: buffer_wbinvl1_vol
1656 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1657 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1658 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
1662 ; ---------------------------------------------------------------------
1664 ; ---------------------------------------------------------------------
1666 define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
1667 ; SI-LABEL: global_atomic_sub_i32_noret:
1669 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1670 ; SI-NEXT: s_mov_b32 s6, 0
1671 ; SI-NEXT: s_mov_b32 s7, 0xf000
1672 ; SI-NEXT: s_mov_b32 s4, s6
1673 ; SI-NEXT: s_mov_b32 s5, s6
1674 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1675 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
1676 ; SI-NEXT: s_waitcnt vmcnt(0)
1677 ; SI-NEXT: buffer_wbinvl1
1678 ; SI-NEXT: s_waitcnt expcnt(0)
1679 ; SI-NEXT: s_setpc_b64 s[30:31]
1681 ; VI-LABEL: global_atomic_sub_i32_noret:
1683 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1684 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1685 ; VI-NEXT: s_waitcnt vmcnt(0)
1686 ; VI-NEXT: buffer_wbinvl1_vol
1687 ; VI-NEXT: s_setpc_b64 s[30:31]
1689 ; GFX9-LABEL: global_atomic_sub_i32_noret:
1691 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1692 ; GFX9-NEXT: global_atomic_sub v[0:1], v2, off
1693 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1694 ; GFX9-NEXT: buffer_wbinvl1_vol
1695 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1696 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1700 define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
1701 ; SI-LABEL: global_atomic_sub_i32_noret_offset:
1703 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1704 ; SI-NEXT: s_mov_b32 s6, 0
1705 ; SI-NEXT: s_mov_b32 s7, 0xf000
1706 ; SI-NEXT: s_mov_b32 s4, s6
1707 ; SI-NEXT: s_mov_b32 s5, s6
1708 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1709 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
1710 ; SI-NEXT: s_waitcnt vmcnt(0)
1711 ; SI-NEXT: buffer_wbinvl1
1712 ; SI-NEXT: s_waitcnt expcnt(0)
1713 ; SI-NEXT: s_setpc_b64 s[30:31]
1715 ; VI-LABEL: global_atomic_sub_i32_noret_offset:
1717 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1718 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1719 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1720 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1721 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1722 ; VI-NEXT: s_waitcnt vmcnt(0)
1723 ; VI-NEXT: buffer_wbinvl1_vol
1724 ; VI-NEXT: s_setpc_b64 s[30:31]
1726 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
1728 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1729 ; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16
1730 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1731 ; GFX9-NEXT: buffer_wbinvl1_vol
1732 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1733 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1734 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1738 define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
1739 ; SI-LABEL: global_atomic_sub_i32_ret:
1741 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1742 ; SI-NEXT: s_mov_b32 s6, 0
1743 ; SI-NEXT: s_mov_b32 s7, 0xf000
1744 ; SI-NEXT: s_mov_b32 s4, s6
1745 ; SI-NEXT: s_mov_b32 s5, s6
1746 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1747 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
1748 ; SI-NEXT: s_waitcnt vmcnt(0)
1749 ; SI-NEXT: buffer_wbinvl1
1750 ; SI-NEXT: v_mov_b32_e32 v0, v2
1751 ; SI-NEXT: s_waitcnt expcnt(0)
1752 ; SI-NEXT: s_setpc_b64 s[30:31]
1754 ; VI-LABEL: global_atomic_sub_i32_ret:
1756 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1758 ; VI-NEXT: s_waitcnt vmcnt(0)
1759 ; VI-NEXT: buffer_wbinvl1_vol
1760 ; VI-NEXT: s_setpc_b64 s[30:31]
1762 ; GFX9-LABEL: global_atomic_sub_i32_ret:
1764 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765 ; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc
1766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1767 ; GFX9-NEXT: buffer_wbinvl1_vol
1768 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1769 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1773 define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
1774 ; SI-LABEL: global_atomic_sub_i32_ret_offset:
1776 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1777 ; SI-NEXT: s_mov_b32 s6, 0
1778 ; SI-NEXT: s_mov_b32 s7, 0xf000
1779 ; SI-NEXT: s_mov_b32 s4, s6
1780 ; SI-NEXT: s_mov_b32 s5, s6
1781 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1782 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1783 ; SI-NEXT: s_waitcnt vmcnt(0)
1784 ; SI-NEXT: buffer_wbinvl1
1785 ; SI-NEXT: v_mov_b32_e32 v0, v2
1786 ; SI-NEXT: s_waitcnt expcnt(0)
1787 ; SI-NEXT: s_setpc_b64 s[30:31]
1789 ; VI-LABEL: global_atomic_sub_i32_ret_offset:
1791 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1792 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1793 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1794 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1795 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1796 ; VI-NEXT: s_waitcnt vmcnt(0)
1797 ; VI-NEXT: buffer_wbinvl1_vol
1798 ; VI-NEXT: s_setpc_b64 s[30:31]
1800 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset:
1802 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1803 ; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc
1804 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1805 ; GFX9-NEXT: buffer_wbinvl1_vol
1806 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1807 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1808 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1812 define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1813 ; SI-LABEL: global_atomic_sub_i32_noret_scalar:
1815 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1816 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1817 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1818 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1819 ; SI-NEXT: s_waitcnt expcnt(0)
1820 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1821 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1822 ; SI-NEXT: s_mov_b32 s34, s6
1823 ; SI-NEXT: s_mov_b32 s7, 0xf000
1824 ; SI-NEXT: s_mov_b32 s6, -1
1825 ; SI-NEXT: v_mov_b32_e32 v1, s34
1826 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1827 ; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0
1828 ; SI-NEXT: s_waitcnt vmcnt(0)
1829 ; SI-NEXT: buffer_wbinvl1
1830 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1831 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1832 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1833 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1834 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1835 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1836 ; SI-NEXT: s_setpc_b64 s[30:31]
1838 ; VI-LABEL: global_atomic_sub_i32_noret_scalar:
1840 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1841 ; VI-NEXT: v_mov_b32_e32 v0, s4
1842 ; VI-NEXT: v_mov_b32_e32 v1, s5
1843 ; VI-NEXT: v_mov_b32_e32 v2, s6
1844 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1845 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1846 ; VI-NEXT: s_waitcnt vmcnt(0)
1847 ; VI-NEXT: buffer_wbinvl1_vol
1848 ; VI-NEXT: s_setpc_b64 s[30:31]
1850 ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar:
1852 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1853 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1854 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1855 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1856 ; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5]
1857 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1858 ; GFX9-NEXT: buffer_wbinvl1_vol
1859 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1860 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1864 define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1865 ; SI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1867 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1868 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1869 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1870 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1871 ; SI-NEXT: s_waitcnt expcnt(0)
1872 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1873 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1874 ; SI-NEXT: s_mov_b32 s34, s6
1875 ; SI-NEXT: s_mov_b32 s7, 0xf000
1876 ; SI-NEXT: s_mov_b32 s6, -1
1877 ; SI-NEXT: v_mov_b32_e32 v1, s34
1878 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1879 ; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 offset:16
1880 ; SI-NEXT: s_waitcnt vmcnt(0)
1881 ; SI-NEXT: buffer_wbinvl1
1882 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1883 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1884 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1885 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1886 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1887 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1888 ; SI-NEXT: s_setpc_b64 s[30:31]
1890 ; VI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1892 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1893 ; VI-NEXT: s_add_u32 s34, s4, 16
1894 ; VI-NEXT: s_addc_u32 s35, s5, 0
1895 ; VI-NEXT: v_mov_b32_e32 v0, s34
1896 ; VI-NEXT: v_mov_b32_e32 v1, s35
1897 ; VI-NEXT: v_mov_b32_e32 v2, s6
1898 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1899 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1900 ; VI-NEXT: s_waitcnt vmcnt(0)
1901 ; VI-NEXT: buffer_wbinvl1_vol
1902 ; VI-NEXT: s_setpc_b64 s[30:31]
1904 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar:
1906 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1907 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1908 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1909 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1910 ; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16
1911 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1912 ; GFX9-NEXT: buffer_wbinvl1_vol
1913 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1914 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
1915 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
1919 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
1920 ; SI-LABEL: global_atomic_sub_i32_ret_scalar:
1922 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1923 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1924 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1925 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1926 ; SI-NEXT: s_waitcnt expcnt(0)
1927 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1928 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1929 ; SI-NEXT: s_mov_b32 s34, s6
1930 ; SI-NEXT: s_mov_b32 s7, 0xf000
1931 ; SI-NEXT: s_mov_b32 s6, -1
1932 ; SI-NEXT: v_mov_b32_e32 v0, s34
1933 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1934 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
1935 ; SI-NEXT: s_waitcnt vmcnt(0)
1936 ; SI-NEXT: buffer_wbinvl1
1937 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1938 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1939 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1940 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1941 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1942 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1943 ; SI-NEXT: s_setpc_b64 s[30:31]
1945 ; VI-LABEL: global_atomic_sub_i32_ret_scalar:
1947 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1948 ; VI-NEXT: v_mov_b32_e32 v0, s4
1949 ; VI-NEXT: v_mov_b32_e32 v1, s5
1950 ; VI-NEXT: v_mov_b32_e32 v2, s6
1951 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1952 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1953 ; VI-NEXT: s_waitcnt vmcnt(0)
1954 ; VI-NEXT: buffer_wbinvl1_vol
1955 ; VI-NEXT: s_setpc_b64 s[30:31]
1957 ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar:
1959 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1960 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1961 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1962 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1963 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc
1964 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1965 ; GFX9-NEXT: buffer_wbinvl1_vol
1966 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1967 %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
1971 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
1972 ; SI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1974 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1975 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1976 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
1977 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1978 ; SI-NEXT: s_waitcnt expcnt(0)
1979 ; SI-NEXT: v_writelane_b32 v1, s6, 0
1980 ; SI-NEXT: v_writelane_b32 v1, s7, 1
1981 ; SI-NEXT: s_mov_b32 s34, s6
1982 ; SI-NEXT: s_mov_b32 s7, 0xf000
1983 ; SI-NEXT: s_mov_b32 s6, -1
1984 ; SI-NEXT: v_mov_b32_e32 v0, s34
1985 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1986 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
1987 ; SI-NEXT: s_waitcnt vmcnt(0)
1988 ; SI-NEXT: buffer_wbinvl1
1989 ; SI-NEXT: v_readlane_b32 s7, v1, 1
1990 ; SI-NEXT: v_readlane_b32 s6, v1, 0
1991 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1992 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
1993 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1994 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1995 ; SI-NEXT: s_setpc_b64 s[30:31]
1997 ; VI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
1999 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2000 ; VI-NEXT: s_add_u32 s34, s4, 16
2001 ; VI-NEXT: s_addc_u32 s35, s5, 0
2002 ; VI-NEXT: v_mov_b32_e32 v0, s34
2003 ; VI-NEXT: v_mov_b32_e32 v1, s35
2004 ; VI-NEXT: v_mov_b32_e32 v2, s6
2005 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2006 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
2007 ; VI-NEXT: s_waitcnt vmcnt(0)
2008 ; VI-NEXT: buffer_wbinvl1_vol
2009 ; VI-NEXT: s_setpc_b64 s[30:31]
2011 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar:
2013 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2014 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2015 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2016 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2017 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc
2018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2019 ; GFX9-NEXT: buffer_wbinvl1_vol
2020 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2021 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2022 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
2026 ; ---------------------------------------------------------------------
2028 ; ---------------------------------------------------------------------
2030 define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
2031 ; SI-LABEL: global_atomic_and_i32_noret:
2033 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2034 ; SI-NEXT: s_mov_b32 s6, 0
2035 ; SI-NEXT: s_mov_b32 s7, 0xf000
2036 ; SI-NEXT: s_mov_b32 s4, s6
2037 ; SI-NEXT: s_mov_b32 s5, s6
2038 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2039 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
2040 ; SI-NEXT: s_waitcnt vmcnt(0)
2041 ; SI-NEXT: buffer_wbinvl1
2042 ; SI-NEXT: s_waitcnt expcnt(0)
2043 ; SI-NEXT: s_setpc_b64 s[30:31]
2045 ; VI-LABEL: global_atomic_and_i32_noret:
2047 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2048 ; VI-NEXT: flat_atomic_and v[0:1], v2
2049 ; VI-NEXT: s_waitcnt vmcnt(0)
2050 ; VI-NEXT: buffer_wbinvl1_vol
2051 ; VI-NEXT: s_setpc_b64 s[30:31]
2053 ; GFX9-LABEL: global_atomic_and_i32_noret:
2055 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2056 ; GFX9-NEXT: global_atomic_and v[0:1], v2, off
2057 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2058 ; GFX9-NEXT: buffer_wbinvl1_vol
2059 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2060 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
2064 define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
2065 ; SI-LABEL: global_atomic_and_i32_noret_offset:
2067 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2068 ; SI-NEXT: s_mov_b32 s6, 0
2069 ; SI-NEXT: s_mov_b32 s7, 0xf000
2070 ; SI-NEXT: s_mov_b32 s4, s6
2071 ; SI-NEXT: s_mov_b32 s5, s6
2072 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2073 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
2074 ; SI-NEXT: s_waitcnt vmcnt(0)
2075 ; SI-NEXT: buffer_wbinvl1
2076 ; SI-NEXT: s_waitcnt expcnt(0)
2077 ; SI-NEXT: s_setpc_b64 s[30:31]
2079 ; VI-LABEL: global_atomic_and_i32_noret_offset:
2081 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2082 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2083 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2084 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2085 ; VI-NEXT: flat_atomic_and v[0:1], v2
2086 ; VI-NEXT: s_waitcnt vmcnt(0)
2087 ; VI-NEXT: buffer_wbinvl1_vol
2088 ; VI-NEXT: s_setpc_b64 s[30:31]
2090 ; GFX9-LABEL: global_atomic_and_i32_noret_offset:
2092 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2093 ; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16
2094 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2095 ; GFX9-NEXT: buffer_wbinvl1_vol
2096 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2097 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2098 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
2102 define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
2103 ; SI-LABEL: global_atomic_and_i32_ret:
2105 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2106 ; SI-NEXT: s_mov_b32 s6, 0
2107 ; SI-NEXT: s_mov_b32 s7, 0xf000
2108 ; SI-NEXT: s_mov_b32 s4, s6
2109 ; SI-NEXT: s_mov_b32 s5, s6
2110 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2111 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
2112 ; SI-NEXT: s_waitcnt vmcnt(0)
2113 ; SI-NEXT: buffer_wbinvl1
2114 ; SI-NEXT: v_mov_b32_e32 v0, v2
2115 ; SI-NEXT: s_waitcnt expcnt(0)
2116 ; SI-NEXT: s_setpc_b64 s[30:31]
2118 ; VI-LABEL: global_atomic_and_i32_ret:
2120 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2121 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2122 ; VI-NEXT: s_waitcnt vmcnt(0)
2123 ; VI-NEXT: buffer_wbinvl1_vol
2124 ; VI-NEXT: s_setpc_b64 s[30:31]
2126 ; GFX9-LABEL: global_atomic_and_i32_ret:
2128 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2129 ; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc
2130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2131 ; GFX9-NEXT: buffer_wbinvl1_vol
2132 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2133 %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
2137 define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
2138 ; SI-LABEL: global_atomic_and_i32_ret_offset:
2140 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2141 ; SI-NEXT: s_mov_b32 s6, 0
2142 ; SI-NEXT: s_mov_b32 s7, 0xf000
2143 ; SI-NEXT: s_mov_b32 s4, s6
2144 ; SI-NEXT: s_mov_b32 s5, s6
2145 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2146 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2147 ; SI-NEXT: s_waitcnt vmcnt(0)
2148 ; SI-NEXT: buffer_wbinvl1
2149 ; SI-NEXT: v_mov_b32_e32 v0, v2
2150 ; SI-NEXT: s_waitcnt expcnt(0)
2151 ; SI-NEXT: s_setpc_b64 s[30:31]
2153 ; VI-LABEL: global_atomic_and_i32_ret_offset:
2155 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2156 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2157 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2158 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2159 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2160 ; VI-NEXT: s_waitcnt vmcnt(0)
2161 ; VI-NEXT: buffer_wbinvl1_vol
2162 ; VI-NEXT: s_setpc_b64 s[30:31]
2164 ; GFX9-LABEL: global_atomic_and_i32_ret_offset:
2166 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2167 ; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc
2168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2169 ; GFX9-NEXT: buffer_wbinvl1_vol
2170 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2171 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2172 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
2176 define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2177 ; SI-LABEL: global_atomic_and_i32_noret_scalar:
2179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2180 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2181 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2182 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2183 ; SI-NEXT: s_waitcnt expcnt(0)
2184 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2185 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2186 ; SI-NEXT: s_mov_b32 s34, s6
2187 ; SI-NEXT: s_mov_b32 s7, 0xf000
2188 ; SI-NEXT: s_mov_b32 s6, -1
2189 ; SI-NEXT: v_mov_b32_e32 v1, s34
2190 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2191 ; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0
2192 ; SI-NEXT: s_waitcnt vmcnt(0)
2193 ; SI-NEXT: buffer_wbinvl1
2194 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2195 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2196 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2197 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2198 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2199 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2200 ; SI-NEXT: s_setpc_b64 s[30:31]
2202 ; VI-LABEL: global_atomic_and_i32_noret_scalar:
2204 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2205 ; VI-NEXT: v_mov_b32_e32 v0, s4
2206 ; VI-NEXT: v_mov_b32_e32 v1, s5
2207 ; VI-NEXT: v_mov_b32_e32 v2, s6
2208 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2209 ; VI-NEXT: flat_atomic_and v[0:1], v2
2210 ; VI-NEXT: s_waitcnt vmcnt(0)
2211 ; VI-NEXT: buffer_wbinvl1_vol
2212 ; VI-NEXT: s_setpc_b64 s[30:31]
2214 ; GFX9-LABEL: global_atomic_and_i32_noret_scalar:
2216 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2217 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2218 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2219 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2220 ; GFX9-NEXT: global_atomic_and v0, v1, s[4:5]
2221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2222 ; GFX9-NEXT: buffer_wbinvl1_vol
2223 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2224 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
2228 define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2229 ; SI-LABEL: global_atomic_and_i32_noret_offset_scalar:
2231 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2232 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2233 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2234 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2235 ; SI-NEXT: s_waitcnt expcnt(0)
2236 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2237 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2238 ; SI-NEXT: s_mov_b32 s34, s6
2239 ; SI-NEXT: s_mov_b32 s7, 0xf000
2240 ; SI-NEXT: s_mov_b32 s6, -1
2241 ; SI-NEXT: v_mov_b32_e32 v1, s34
2242 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2243 ; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 offset:16
2244 ; SI-NEXT: s_waitcnt vmcnt(0)
2245 ; SI-NEXT: buffer_wbinvl1
2246 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2247 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2248 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2249 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2250 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2251 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2252 ; SI-NEXT: s_setpc_b64 s[30:31]
2254 ; VI-LABEL: global_atomic_and_i32_noret_offset_scalar:
2256 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2257 ; VI-NEXT: s_add_u32 s34, s4, 16
2258 ; VI-NEXT: s_addc_u32 s35, s5, 0
2259 ; VI-NEXT: v_mov_b32_e32 v0, s34
2260 ; VI-NEXT: v_mov_b32_e32 v1, s35
2261 ; VI-NEXT: v_mov_b32_e32 v2, s6
2262 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2263 ; VI-NEXT: flat_atomic_and v[0:1], v2
2264 ; VI-NEXT: s_waitcnt vmcnt(0)
2265 ; VI-NEXT: buffer_wbinvl1_vol
2266 ; VI-NEXT: s_setpc_b64 s[30:31]
2268 ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar:
2270 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2271 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2272 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2273 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2274 ; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16
2275 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2276 ; GFX9-NEXT: buffer_wbinvl1_vol
2277 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2278 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2279 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
2283 define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2284 ; SI-LABEL: global_atomic_and_i32_ret_scalar:
2286 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2287 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2288 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
2289 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2290 ; SI-NEXT: s_waitcnt expcnt(0)
2291 ; SI-NEXT: v_writelane_b32 v1, s6, 0
2292 ; SI-NEXT: v_writelane_b32 v1, s7, 1
2293 ; SI-NEXT: s_mov_b32 s34, s6
2294 ; SI-NEXT: s_mov_b32 s7, 0xf000
2295 ; SI-NEXT: s_mov_b32 s6, -1
2296 ; SI-NEXT: v_mov_b32_e32 v0, s34
2297 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2298 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc
2299 ; SI-NEXT: s_waitcnt vmcnt(0)
2300 ; SI-NEXT: buffer_wbinvl1
2301 ; SI-NEXT: v_readlane_b32 s7, v1, 1
2302 ; SI-NEXT: v_readlane_b32 s6, v1, 0
2303 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2304 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2305 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2306 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2307 ; SI-NEXT: s_setpc_b64 s[30:31]
2309 ; VI-LABEL: global_atomic_and_i32_ret_scalar:
2311 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2312 ; VI-NEXT: v_mov_b32_e32 v0, s4
2313 ; VI-NEXT: v_mov_b32_e32 v1, s5
2314 ; VI-NEXT: v_mov_b32_e32 v2, s6
2315 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2316 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2317 ; VI-NEXT: s_waitcnt vmcnt(0)
2318 ; VI-NEXT: buffer_wbinvl1_vol
2319 ; VI-NEXT: s_setpc_b64 s[30:31]
2321 ; GFX9-LABEL: global_atomic_and_i32_ret_scalar:
2323 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2324 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2325 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2326 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2327 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc
2328 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2329 ; GFX9-NEXT: buffer_wbinvl1_vol
2330 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2331 %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
2335 define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2336 ; SI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2338 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2339 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2340 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
2341 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2342 ; SI-NEXT: s_waitcnt expcnt(0)
2343 ; SI-NEXT: v_writelane_b32 v1, s6, 0
2344 ; SI-NEXT: v_writelane_b32 v1, s7, 1
2345 ; SI-NEXT: s_mov_b32 s34, s6
2346 ; SI-NEXT: s_mov_b32 s7, 0xf000
2347 ; SI-NEXT: s_mov_b32 s6, -1
2348 ; SI-NEXT: v_mov_b32_e32 v0, s34
2349 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2350 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
2351 ; SI-NEXT: s_waitcnt vmcnt(0)
2352 ; SI-NEXT: buffer_wbinvl1
2353 ; SI-NEXT: v_readlane_b32 s7, v1, 1
2354 ; SI-NEXT: v_readlane_b32 s6, v1, 0
2355 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2356 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2357 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2358 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2359 ; SI-NEXT: s_setpc_b64 s[30:31]
2361 ; VI-LABEL: global_atomic_and_i32_ret_offset_scalar:
2363 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2364 ; VI-NEXT: s_add_u32 s34, s4, 16
2365 ; VI-NEXT: s_addc_u32 s35, s5, 0
2366 ; VI-NEXT: v_mov_b32_e32 v0, s34
2367 ; VI-NEXT: v_mov_b32_e32 v1, s35
2368 ; VI-NEXT: v_mov_b32_e32 v2, s6
2369 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2370 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
2371 ; VI-NEXT: s_waitcnt vmcnt(0)
2372 ; VI-NEXT: buffer_wbinvl1_vol
2373 ; VI-NEXT: s_setpc_b64 s[30:31]
2375 ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar:
2377 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2379 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2380 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2381 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc
2382 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2383 ; GFX9-NEXT: buffer_wbinvl1_vol
2384 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2385 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2386 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
2390 ; ---------------------------------------------------------------------
2392 ; ---------------------------------------------------------------------
2394 define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
2395 ; SI-LABEL: global_atomic_nand_i32_noret:
2397 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2398 ; SI-NEXT: s_mov_b32 s6, 0
2399 ; SI-NEXT: s_mov_b32 s7, 0xf000
2400 ; SI-NEXT: s_mov_b32 s4, s6
2401 ; SI-NEXT: s_mov_b32 s5, s6
2402 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
2403 ; SI-NEXT: s_mov_b64 s[8:9], 0
2404 ; SI-NEXT: .LBB40_1: ; %atomicrmw.start
2405 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2406 ; SI-NEXT: s_waitcnt vmcnt(0)
2407 ; SI-NEXT: v_and_b32_e32 v3, v4, v2
2408 ; SI-NEXT: v_not_b32_e32 v3, v3
2409 ; SI-NEXT: s_waitcnt expcnt(0)
2410 ; SI-NEXT: v_mov_b32_e32 v6, v4
2411 ; SI-NEXT: v_mov_b32_e32 v5, v3
2412 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2413 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
2414 ; SI-NEXT: s_waitcnt vmcnt(0)
2415 ; SI-NEXT: buffer_wbinvl1
2416 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
2417 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2418 ; SI-NEXT: v_mov_b32_e32 v4, v5
2419 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2420 ; SI-NEXT: s_cbranch_execnz .LBB40_1
2421 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2422 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2423 ; SI-NEXT: s_waitcnt expcnt(0)
2424 ; SI-NEXT: s_setpc_b64 s[30:31]
2426 ; VI-LABEL: global_atomic_nand_i32_noret:
2428 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2429 ; VI-NEXT: flat_load_dword v4, v[0:1]
2430 ; VI-NEXT: s_mov_b64 s[4:5], 0
2431 ; VI-NEXT: .LBB40_1: ; %atomicrmw.start
2432 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2433 ; VI-NEXT: s_waitcnt vmcnt(0)
2434 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2435 ; VI-NEXT: v_not_b32_e32 v3, v3
2436 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2437 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2438 ; VI-NEXT: s_waitcnt vmcnt(0)
2439 ; VI-NEXT: buffer_wbinvl1_vol
2440 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2441 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2442 ; VI-NEXT: v_mov_b32_e32 v4, v3
2443 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2444 ; VI-NEXT: s_cbranch_execnz .LBB40_1
2445 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2446 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2447 ; VI-NEXT: s_setpc_b64 s[30:31]
2449 ; GFX9-LABEL: global_atomic_nand_i32_noret:
2451 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2453 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2454 ; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
2455 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2456 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2457 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2458 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2459 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2460 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2461 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2462 ; GFX9-NEXT: buffer_wbinvl1_vol
2463 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2464 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2465 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2466 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2467 ; GFX9-NEXT: s_cbranch_execnz .LBB40_1
2468 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2469 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2470 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2471 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2475 define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
2476 ; SI-LABEL: global_atomic_nand_i32_noret_offset:
2478 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479 ; SI-NEXT: s_mov_b32 s6, 0
2480 ; SI-NEXT: s_mov_b32 s7, 0xf000
2481 ; SI-NEXT: s_mov_b32 s4, s6
2482 ; SI-NEXT: s_mov_b32 s5, s6
2483 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
2484 ; SI-NEXT: s_mov_b64 s[8:9], 0
2485 ; SI-NEXT: .LBB41_1: ; %atomicrmw.start
2486 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2487 ; SI-NEXT: s_waitcnt vmcnt(0)
2488 ; SI-NEXT: v_and_b32_e32 v3, v4, v2
2489 ; SI-NEXT: v_not_b32_e32 v3, v3
2490 ; SI-NEXT: s_waitcnt expcnt(0)
2491 ; SI-NEXT: v_mov_b32_e32 v6, v4
2492 ; SI-NEXT: v_mov_b32_e32 v5, v3
2493 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2494 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
2495 ; SI-NEXT: s_waitcnt vmcnt(0)
2496 ; SI-NEXT: buffer_wbinvl1
2497 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
2498 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2499 ; SI-NEXT: v_mov_b32_e32 v4, v5
2500 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2501 ; SI-NEXT: s_cbranch_execnz .LBB41_1
2502 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2503 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2504 ; SI-NEXT: s_waitcnt expcnt(0)
2505 ; SI-NEXT: s_setpc_b64 s[30:31]
2507 ; VI-LABEL: global_atomic_nand_i32_noret_offset:
2509 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2510 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2511 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2512 ; VI-NEXT: flat_load_dword v4, v[0:1]
2513 ; VI-NEXT: s_mov_b64 s[4:5], 0
2514 ; VI-NEXT: .LBB41_1: ; %atomicrmw.start
2515 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2516 ; VI-NEXT: s_waitcnt vmcnt(0)
2517 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2518 ; VI-NEXT: v_not_b32_e32 v3, v3
2519 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2520 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2521 ; VI-NEXT: s_waitcnt vmcnt(0)
2522 ; VI-NEXT: buffer_wbinvl1_vol
2523 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2524 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2525 ; VI-NEXT: v_mov_b32_e32 v4, v3
2526 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2527 ; VI-NEXT: s_cbranch_execnz .LBB41_1
2528 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2529 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2530 ; VI-NEXT: s_setpc_b64 s[30:31]
2532 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset:
2534 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2535 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
2536 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2537 ; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
2538 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2539 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2540 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2541 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2542 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2543 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2544 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2545 ; GFX9-NEXT: buffer_wbinvl1_vol
2546 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2547 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2548 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2549 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2550 ; GFX9-NEXT: s_cbranch_execnz .LBB41_1
2551 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2552 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2553 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2554 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2555 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2559 define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
2560 ; SI-LABEL: global_atomic_nand_i32_ret:
2562 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2563 ; SI-NEXT: s_mov_b32 s6, 0
2564 ; SI-NEXT: s_mov_b32 s7, 0xf000
2565 ; SI-NEXT: s_mov_b32 s4, s6
2566 ; SI-NEXT: s_mov_b32 s5, s6
2567 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
2568 ; SI-NEXT: s_mov_b64 s[8:9], 0
2569 ; SI-NEXT: .LBB42_1: ; %atomicrmw.start
2570 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2571 ; SI-NEXT: s_waitcnt vmcnt(0)
2572 ; SI-NEXT: v_mov_b32_e32 v5, v3
2573 ; SI-NEXT: s_waitcnt expcnt(0)
2574 ; SI-NEXT: v_and_b32_e32 v3, v5, v2
2575 ; SI-NEXT: v_not_b32_e32 v4, v3
2576 ; SI-NEXT: v_mov_b32_e32 v3, v4
2577 ; SI-NEXT: v_mov_b32_e32 v4, v5
2578 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2579 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
2580 ; SI-NEXT: s_waitcnt vmcnt(0)
2581 ; SI-NEXT: buffer_wbinvl1
2582 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
2583 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2584 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2585 ; SI-NEXT: s_cbranch_execnz .LBB42_1
2586 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2587 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2588 ; SI-NEXT: v_mov_b32_e32 v0, v3
2589 ; SI-NEXT: s_waitcnt expcnt(0)
2590 ; SI-NEXT: s_setpc_b64 s[30:31]
2592 ; VI-LABEL: global_atomic_nand_i32_ret:
2594 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2595 ; VI-NEXT: flat_load_dword v3, v[0:1]
2596 ; VI-NEXT: s_mov_b64 s[4:5], 0
2597 ; VI-NEXT: .LBB42_1: ; %atomicrmw.start
2598 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2599 ; VI-NEXT: s_waitcnt vmcnt(0)
2600 ; VI-NEXT: v_mov_b32_e32 v4, v3
2601 ; VI-NEXT: v_and_b32_e32 v3, v4, v2
2602 ; VI-NEXT: v_not_b32_e32 v3, v3
2603 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2604 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2605 ; VI-NEXT: s_waitcnt vmcnt(0)
2606 ; VI-NEXT: buffer_wbinvl1_vol
2607 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2608 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2609 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2610 ; VI-NEXT: s_cbranch_execnz .LBB42_1
2611 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2612 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2613 ; VI-NEXT: v_mov_b32_e32 v0, v3
2614 ; VI-NEXT: s_setpc_b64 s[30:31]
2616 ; GFX9-LABEL: global_atomic_nand_i32_ret:
2618 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2619 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
2620 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2621 ; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
2622 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2623 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2624 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2625 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2626 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2627 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2628 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
2629 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2630 ; GFX9-NEXT: buffer_wbinvl1_vol
2631 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2632 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2633 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2634 ; GFX9-NEXT: s_cbranch_execnz .LBB42_1
2635 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2636 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2637 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2638 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2639 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2643 define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
2644 ; SI-LABEL: global_atomic_nand_i32_ret_offset:
2646 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2647 ; SI-NEXT: s_mov_b32 s6, 0
2648 ; SI-NEXT: s_mov_b32 s7, 0xf000
2649 ; SI-NEXT: s_mov_b32 s4, s6
2650 ; SI-NEXT: s_mov_b32 s5, s6
2651 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
2652 ; SI-NEXT: s_mov_b64 s[8:9], 0
2653 ; SI-NEXT: .LBB43_1: ; %atomicrmw.start
2654 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2655 ; SI-NEXT: s_waitcnt vmcnt(0)
2656 ; SI-NEXT: v_mov_b32_e32 v5, v3
2657 ; SI-NEXT: s_waitcnt expcnt(0)
2658 ; SI-NEXT: v_and_b32_e32 v3, v5, v2
2659 ; SI-NEXT: v_not_b32_e32 v4, v3
2660 ; SI-NEXT: v_mov_b32_e32 v3, v4
2661 ; SI-NEXT: v_mov_b32_e32 v4, v5
2662 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2663 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
2664 ; SI-NEXT: s_waitcnt vmcnt(0)
2665 ; SI-NEXT: buffer_wbinvl1
2666 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
2667 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2668 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2669 ; SI-NEXT: s_cbranch_execnz .LBB43_1
2670 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2671 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2672 ; SI-NEXT: v_mov_b32_e32 v0, v3
2673 ; SI-NEXT: s_waitcnt expcnt(0)
2674 ; SI-NEXT: s_setpc_b64 s[30:31]
2676 ; VI-LABEL: global_atomic_nand_i32_ret_offset:
2678 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
2680 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2681 ; VI-NEXT: flat_load_dword v0, v[3:4]
2682 ; VI-NEXT: s_mov_b64 s[4:5], 0
2683 ; VI-NEXT: .LBB43_1: ; %atomicrmw.start
2684 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2685 ; VI-NEXT: s_waitcnt vmcnt(0)
2686 ; VI-NEXT: v_mov_b32_e32 v1, v0
2687 ; VI-NEXT: v_and_b32_e32 v0, v1, v2
2688 ; VI-NEXT: v_not_b32_e32 v0, v0
2689 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2690 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2691 ; VI-NEXT: s_waitcnt vmcnt(0)
2692 ; VI-NEXT: buffer_wbinvl1_vol
2693 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2694 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2695 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2696 ; VI-NEXT: s_cbranch_execnz .LBB43_1
2697 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2698 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2699 ; VI-NEXT: s_setpc_b64 s[30:31]
2701 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset:
2703 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2704 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
2705 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2706 ; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
2707 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2709 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
2710 ; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
2711 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2712 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2713 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
2714 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2715 ; GFX9-NEXT: buffer_wbinvl1_vol
2716 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2717 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2718 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2719 ; GFX9-NEXT: s_cbranch_execnz .LBB43_1
2720 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2721 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2722 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2723 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2724 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2725 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2729 define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2730 ; SI-LABEL: global_atomic_nand_i32_noret_scalar:
2732 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2733 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2734 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2735 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2736 ; SI-NEXT: s_waitcnt expcnt(0)
2737 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2738 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2739 ; SI-NEXT: s_mov_b32 s34, s6
2740 ; SI-NEXT: s_mov_b32 s7, 0xf000
2741 ; SI-NEXT: s_mov_b32 s6, -1
2742 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
2743 ; SI-NEXT: s_mov_b64 s[36:37], 0
2744 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start
2745 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2746 ; SI-NEXT: s_waitcnt vmcnt(0)
2747 ; SI-NEXT: v_and_b32_e32 v1, s34, v2
2748 ; SI-NEXT: v_not_b32_e32 v1, v1
2749 ; SI-NEXT: s_waitcnt expcnt(0)
2750 ; SI-NEXT: v_mov_b32_e32 v4, v2
2751 ; SI-NEXT: v_mov_b32_e32 v3, v1
2752 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2753 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
2754 ; SI-NEXT: s_waitcnt vmcnt(0)
2755 ; SI-NEXT: buffer_wbinvl1
2756 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
2757 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2758 ; SI-NEXT: v_mov_b32_e32 v2, v3
2759 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2760 ; SI-NEXT: s_cbranch_execnz .LBB44_1
2761 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2762 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2763 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2764 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2765 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2766 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2767 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2768 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2769 ; SI-NEXT: s_setpc_b64 s[30:31]
2771 ; VI-LABEL: global_atomic_nand_i32_noret_scalar:
2773 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2774 ; VI-NEXT: v_mov_b32_e32 v0, s4
2775 ; VI-NEXT: v_mov_b32_e32 v1, s5
2776 ; VI-NEXT: flat_load_dword v1, v[0:1]
2777 ; VI-NEXT: s_mov_b64 s[34:35], 0
2778 ; VI-NEXT: .LBB44_1: ; %atomicrmw.start
2779 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2780 ; VI-NEXT: s_waitcnt vmcnt(0)
2781 ; VI-NEXT: v_and_b32_e32 v0, s6, v1
2782 ; VI-NEXT: v_mov_b32_e32 v2, s4
2783 ; VI-NEXT: v_mov_b32_e32 v3, s5
2784 ; VI-NEXT: v_not_b32_e32 v0, v0
2785 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2786 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
2787 ; VI-NEXT: s_waitcnt vmcnt(0)
2788 ; VI-NEXT: buffer_wbinvl1_vol
2789 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2790 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2791 ; VI-NEXT: v_mov_b32_e32 v1, v0
2792 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2793 ; VI-NEXT: s_cbranch_execnz .LBB44_1
2794 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2795 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2796 ; VI-NEXT: s_setpc_b64 s[30:31]
2798 ; GFX9-LABEL: global_atomic_nand_i32_noret_scalar:
2800 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2801 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2802 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
2803 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2804 ; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
2805 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2806 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2807 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
2808 ; GFX9-NEXT: v_not_b32_e32 v0, v0
2809 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2810 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
2811 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2812 ; GFX9-NEXT: buffer_wbinvl1_vol
2813 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2814 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2815 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2816 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2817 ; GFX9-NEXT: s_cbranch_execnz .LBB44_1
2818 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2819 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2820 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2821 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
2825 define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
2826 ; SI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2828 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2829 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2830 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2831 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2832 ; SI-NEXT: s_waitcnt expcnt(0)
2833 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2834 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2835 ; SI-NEXT: s_mov_b32 s34, s6
2836 ; SI-NEXT: s_mov_b32 s7, 0xf000
2837 ; SI-NEXT: s_mov_b32 s6, -1
2838 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
2839 ; SI-NEXT: s_mov_b64 s[36:37], 0
2840 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start
2841 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2842 ; SI-NEXT: s_waitcnt vmcnt(0)
2843 ; SI-NEXT: v_and_b32_e32 v1, s34, v2
2844 ; SI-NEXT: v_not_b32_e32 v1, v1
2845 ; SI-NEXT: s_waitcnt expcnt(0)
2846 ; SI-NEXT: v_mov_b32_e32 v4, v2
2847 ; SI-NEXT: v_mov_b32_e32 v3, v1
2848 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2849 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
2850 ; SI-NEXT: s_waitcnt vmcnt(0)
2851 ; SI-NEXT: buffer_wbinvl1
2852 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
2853 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2854 ; SI-NEXT: v_mov_b32_e32 v2, v3
2855 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2856 ; SI-NEXT: s_cbranch_execnz .LBB45_1
2857 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2858 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2859 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2860 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2861 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2862 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2863 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2864 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2865 ; SI-NEXT: s_setpc_b64 s[30:31]
2867 ; VI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2869 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870 ; VI-NEXT: s_add_u32 s34, s4, 16
2871 ; VI-NEXT: s_addc_u32 s35, s5, 0
2872 ; VI-NEXT: v_mov_b32_e32 v0, s34
2873 ; VI-NEXT: v_mov_b32_e32 v1, s35
2874 ; VI-NEXT: flat_load_dword v1, v[0:1]
2875 ; VI-NEXT: s_mov_b64 s[36:37], 0
2876 ; VI-NEXT: .LBB45_1: ; %atomicrmw.start
2877 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2878 ; VI-NEXT: s_waitcnt vmcnt(0)
2879 ; VI-NEXT: v_and_b32_e32 v0, s6, v1
2880 ; VI-NEXT: v_mov_b32_e32 v2, s34
2881 ; VI-NEXT: v_mov_b32_e32 v3, s35
2882 ; VI-NEXT: v_not_b32_e32 v0, v0
2883 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2884 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
2885 ; VI-NEXT: s_waitcnt vmcnt(0)
2886 ; VI-NEXT: buffer_wbinvl1_vol
2887 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2888 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2889 ; VI-NEXT: v_mov_b32_e32 v1, v0
2890 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2891 ; VI-NEXT: s_cbranch_execnz .LBB45_1
2892 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2893 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
2894 ; VI-NEXT: s_setpc_b64 s[30:31]
2896 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar:
2898 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2899 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2900 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
2901 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2902 ; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
2903 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2904 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2905 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
2906 ; GFX9-NEXT: v_not_b32_e32 v0, v0
2907 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2908 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
2909 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2910 ; GFX9-NEXT: buffer_wbinvl1_vol
2911 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2912 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2913 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2914 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2915 ; GFX9-NEXT: s_cbranch_execnz .LBB45_1
2916 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2917 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2918 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2919 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
2920 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
2924 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
2925 ; SI-LABEL: global_atomic_nand_i32_ret_scalar:
2927 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2928 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2929 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
2930 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2931 ; SI-NEXT: s_waitcnt expcnt(0)
2932 ; SI-NEXT: v_writelane_b32 v1, s6, 0
2933 ; SI-NEXT: v_writelane_b32 v1, s7, 1
2934 ; SI-NEXT: s_mov_b32 s34, s6
2935 ; SI-NEXT: s_mov_b32 s7, 0xf000
2936 ; SI-NEXT: s_mov_b32 s6, -1
2937 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
2938 ; SI-NEXT: s_mov_b64 s[36:37], 0
2939 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start
2940 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2941 ; SI-NEXT: s_waitcnt vmcnt(0)
2942 ; SI-NEXT: v_mov_b32_e32 v4, v2
2943 ; SI-NEXT: v_and_b32_e32 v0, s34, v4
2944 ; SI-NEXT: s_waitcnt expcnt(0)
2945 ; SI-NEXT: v_not_b32_e32 v3, v0
2946 ; SI-NEXT: v_mov_b32_e32 v2, v3
2947 ; SI-NEXT: v_mov_b32_e32 v3, v4
2948 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2949 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
2950 ; SI-NEXT: s_waitcnt vmcnt(0)
2951 ; SI-NEXT: buffer_wbinvl1
2952 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
2953 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2954 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2955 ; SI-NEXT: s_cbranch_execnz .LBB46_1
2956 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2957 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2958 ; SI-NEXT: v_mov_b32_e32 v0, v2
2959 ; SI-NEXT: v_readlane_b32 s7, v1, 1
2960 ; SI-NEXT: v_readlane_b32 s6, v1, 0
2961 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2962 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
2963 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2964 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2965 ; SI-NEXT: s_setpc_b64 s[30:31]
2967 ; VI-LABEL: global_atomic_nand_i32_ret_scalar:
2969 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2970 ; VI-NEXT: v_mov_b32_e32 v0, s4
2971 ; VI-NEXT: v_mov_b32_e32 v1, s5
2972 ; VI-NEXT: flat_load_dword v0, v[0:1]
2973 ; VI-NEXT: s_mov_b64 s[34:35], 0
2974 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start
2975 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2976 ; VI-NEXT: s_waitcnt vmcnt(0)
2977 ; VI-NEXT: v_mov_b32_e32 v1, v0
2978 ; VI-NEXT: v_mov_b32_e32 v2, s4
2979 ; VI-NEXT: v_and_b32_e32 v0, s6, v1
2980 ; VI-NEXT: v_mov_b32_e32 v3, s5
2981 ; VI-NEXT: v_not_b32_e32 v0, v0
2982 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2983 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
2984 ; VI-NEXT: s_waitcnt vmcnt(0)
2985 ; VI-NEXT: buffer_wbinvl1_vol
2986 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2987 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2988 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2989 ; VI-NEXT: s_cbranch_execnz .LBB46_1
2990 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2991 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2992 ; VI-NEXT: s_setpc_b64 s[30:31]
2994 ; GFX9-LABEL: global_atomic_nand_i32_ret_scalar:
2996 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
2998 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
2999 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
3000 ; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
3001 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3002 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3003 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
3004 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v3
3005 ; GFX9-NEXT: v_not_b32_e32 v2, v0
3006 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3007 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
3008 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3009 ; GFX9-NEXT: buffer_wbinvl1_vol
3010 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
3011 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3012 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
3013 ; GFX9-NEXT: s_cbranch_execnz .LBB46_1
3014 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3015 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
3016 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3017 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 %in seq_cst
3021 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3022 ; SI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
3024 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3025 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3026 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3027 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3028 ; SI-NEXT: s_waitcnt expcnt(0)
3029 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3030 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3031 ; SI-NEXT: s_mov_b32 s34, s6
3032 ; SI-NEXT: s_mov_b32 s7, 0xf000
3033 ; SI-NEXT: s_mov_b32 s6, -1
3034 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
3035 ; SI-NEXT: s_mov_b64 s[36:37], 0
3036 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start
3037 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3038 ; SI-NEXT: s_waitcnt vmcnt(0)
3039 ; SI-NEXT: v_mov_b32_e32 v4, v2
3040 ; SI-NEXT: v_and_b32_e32 v0, s34, v4
3041 ; SI-NEXT: s_waitcnt expcnt(0)
3042 ; SI-NEXT: v_not_b32_e32 v3, v0
3043 ; SI-NEXT: v_mov_b32_e32 v2, v3
3044 ; SI-NEXT: v_mov_b32_e32 v3, v4
3045 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3046 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
3047 ; SI-NEXT: s_waitcnt vmcnt(0)
3048 ; SI-NEXT: buffer_wbinvl1
3049 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
3050 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3051 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3052 ; SI-NEXT: s_cbranch_execnz .LBB47_1
3053 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3054 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
3055 ; SI-NEXT: v_mov_b32_e32 v0, v2
3056 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3057 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3058 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3059 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3060 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3061 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3062 ; SI-NEXT: s_setpc_b64 s[30:31]
3064 ; VI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
3066 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3067 ; VI-NEXT: s_add_u32 s34, s4, 16
3068 ; VI-NEXT: s_addc_u32 s35, s5, 0
3069 ; VI-NEXT: v_mov_b32_e32 v0, s34
3070 ; VI-NEXT: v_mov_b32_e32 v1, s35
3071 ; VI-NEXT: flat_load_dword v0, v[0:1]
3072 ; VI-NEXT: s_mov_b64 s[36:37], 0
3073 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start
3074 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3075 ; VI-NEXT: s_waitcnt vmcnt(0)
3076 ; VI-NEXT: v_mov_b32_e32 v1, v0
3077 ; VI-NEXT: v_mov_b32_e32 v2, s34
3078 ; VI-NEXT: v_and_b32_e32 v0, s6, v1
3079 ; VI-NEXT: v_mov_b32_e32 v3, s35
3080 ; VI-NEXT: v_not_b32_e32 v0, v0
3081 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3082 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
3083 ; VI-NEXT: s_waitcnt vmcnt(0)
3084 ; VI-NEXT: buffer_wbinvl1_vol
3085 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3086 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3087 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3088 ; VI-NEXT: s_cbranch_execnz .LBB47_1
3089 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3090 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
3091 ; VI-NEXT: s_setpc_b64 s[30:31]
3093 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar:
3095 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3096 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
3097 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
3098 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
3099 ; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
3100 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3101 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3102 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
3103 ; GFX9-NEXT: v_and_b32_e32 v0, s6, v3
3104 ; GFX9-NEXT: v_not_b32_e32 v2, v0
3105 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3106 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
3107 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3108 ; GFX9-NEXT: buffer_wbinvl1_vol
3109 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
3110 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3111 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
3112 ; GFX9-NEXT: s_cbranch_execnz .LBB47_1
3113 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3114 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
3115 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3116 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3117 %result = atomicrmw nand ptr addrspace(1) %gep, i32 %in seq_cst
3121 ; ---------------------------------------------------------------------
3123 ; ---------------------------------------------------------------------
3125 define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3126 ; SI-LABEL: global_atomic_or_i32_noret:
3128 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3129 ; SI-NEXT: s_mov_b32 s6, 0
3130 ; SI-NEXT: s_mov_b32 s7, 0xf000
3131 ; SI-NEXT: s_mov_b32 s4, s6
3132 ; SI-NEXT: s_mov_b32 s5, s6
3133 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3134 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
3135 ; SI-NEXT: s_waitcnt vmcnt(0)
3136 ; SI-NEXT: buffer_wbinvl1
3137 ; SI-NEXT: s_waitcnt expcnt(0)
3138 ; SI-NEXT: s_setpc_b64 s[30:31]
3140 ; VI-LABEL: global_atomic_or_i32_noret:
3142 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3143 ; VI-NEXT: flat_atomic_or v[0:1], v2
3144 ; VI-NEXT: s_waitcnt vmcnt(0)
3145 ; VI-NEXT: buffer_wbinvl1_vol
3146 ; VI-NEXT: s_setpc_b64 s[30:31]
3148 ; GFX9-LABEL: global_atomic_or_i32_noret:
3150 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3151 ; GFX9-NEXT: global_atomic_or v[0:1], v2, off
3152 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3153 ; GFX9-NEXT: buffer_wbinvl1_vol
3154 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3155 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3159 define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3160 ; SI-LABEL: global_atomic_or_i32_noret_offset:
3162 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3163 ; SI-NEXT: s_mov_b32 s6, 0
3164 ; SI-NEXT: s_mov_b32 s7, 0xf000
3165 ; SI-NEXT: s_mov_b32 s4, s6
3166 ; SI-NEXT: s_mov_b32 s5, s6
3167 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3168 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
3169 ; SI-NEXT: s_waitcnt vmcnt(0)
3170 ; SI-NEXT: buffer_wbinvl1
3171 ; SI-NEXT: s_waitcnt expcnt(0)
3172 ; SI-NEXT: s_setpc_b64 s[30:31]
3174 ; VI-LABEL: global_atomic_or_i32_noret_offset:
3176 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3177 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3178 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3180 ; VI-NEXT: flat_atomic_or v[0:1], v2
3181 ; VI-NEXT: s_waitcnt vmcnt(0)
3182 ; VI-NEXT: buffer_wbinvl1_vol
3183 ; VI-NEXT: s_setpc_b64 s[30:31]
3185 ; GFX9-LABEL: global_atomic_or_i32_noret_offset:
3187 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3188 ; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16
3189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3190 ; GFX9-NEXT: buffer_wbinvl1_vol
3191 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3192 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3193 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3197 define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3198 ; SI-LABEL: global_atomic_or_i32_ret:
3200 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3201 ; SI-NEXT: s_mov_b32 s6, 0
3202 ; SI-NEXT: s_mov_b32 s7, 0xf000
3203 ; SI-NEXT: s_mov_b32 s4, s6
3204 ; SI-NEXT: s_mov_b32 s5, s6
3205 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3206 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
3207 ; SI-NEXT: s_waitcnt vmcnt(0)
3208 ; SI-NEXT: buffer_wbinvl1
3209 ; SI-NEXT: v_mov_b32_e32 v0, v2
3210 ; SI-NEXT: s_waitcnt expcnt(0)
3211 ; SI-NEXT: s_setpc_b64 s[30:31]
3213 ; VI-LABEL: global_atomic_or_i32_ret:
3215 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3216 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3217 ; VI-NEXT: s_waitcnt vmcnt(0)
3218 ; VI-NEXT: buffer_wbinvl1_vol
3219 ; VI-NEXT: s_setpc_b64 s[30:31]
3221 ; GFX9-LABEL: global_atomic_or_i32_ret:
3223 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3224 ; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc
3225 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3226 ; GFX9-NEXT: buffer_wbinvl1_vol
3227 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3228 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3232 define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3233 ; SI-LABEL: global_atomic_or_i32_ret_offset:
3235 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3236 ; SI-NEXT: s_mov_b32 s6, 0
3237 ; SI-NEXT: s_mov_b32 s7, 0xf000
3238 ; SI-NEXT: s_mov_b32 s4, s6
3239 ; SI-NEXT: s_mov_b32 s5, s6
3240 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3241 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3242 ; SI-NEXT: s_waitcnt vmcnt(0)
3243 ; SI-NEXT: buffer_wbinvl1
3244 ; SI-NEXT: v_mov_b32_e32 v0, v2
3245 ; SI-NEXT: s_waitcnt expcnt(0)
3246 ; SI-NEXT: s_setpc_b64 s[30:31]
3248 ; VI-LABEL: global_atomic_or_i32_ret_offset:
3250 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3251 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3252 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3253 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3254 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3255 ; VI-NEXT: s_waitcnt vmcnt(0)
3256 ; VI-NEXT: buffer_wbinvl1_vol
3257 ; VI-NEXT: s_setpc_b64 s[30:31]
3259 ; GFX9-LABEL: global_atomic_or_i32_ret_offset:
3261 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3262 ; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc
3263 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3264 ; GFX9-NEXT: buffer_wbinvl1_vol
3265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3266 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3267 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3271 define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3272 ; SI-LABEL: global_atomic_or_i32_noret_scalar:
3274 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3275 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3276 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3277 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3278 ; SI-NEXT: s_waitcnt expcnt(0)
3279 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3280 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3281 ; SI-NEXT: s_mov_b32 s34, s6
3282 ; SI-NEXT: s_mov_b32 s7, 0xf000
3283 ; SI-NEXT: s_mov_b32 s6, -1
3284 ; SI-NEXT: v_mov_b32_e32 v1, s34
3285 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3286 ; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0
3287 ; SI-NEXT: s_waitcnt vmcnt(0)
3288 ; SI-NEXT: buffer_wbinvl1
3289 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3290 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3291 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3292 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3293 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3294 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3295 ; SI-NEXT: s_setpc_b64 s[30:31]
3297 ; VI-LABEL: global_atomic_or_i32_noret_scalar:
3299 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3300 ; VI-NEXT: v_mov_b32_e32 v0, s4
3301 ; VI-NEXT: v_mov_b32_e32 v1, s5
3302 ; VI-NEXT: v_mov_b32_e32 v2, s6
3303 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3304 ; VI-NEXT: flat_atomic_or v[0:1], v2
3305 ; VI-NEXT: s_waitcnt vmcnt(0)
3306 ; VI-NEXT: buffer_wbinvl1_vol
3307 ; VI-NEXT: s_setpc_b64 s[30:31]
3309 ; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
3311 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3312 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3313 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3314 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3315 ; GFX9-NEXT: global_atomic_or v0, v1, s[4:5]
3316 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3317 ; GFX9-NEXT: buffer_wbinvl1_vol
3318 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3319 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3323 define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3324 ; SI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3326 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3327 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3328 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3329 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3330 ; SI-NEXT: s_waitcnt expcnt(0)
3331 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3332 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3333 ; SI-NEXT: s_mov_b32 s34, s6
3334 ; SI-NEXT: s_mov_b32 s7, 0xf000
3335 ; SI-NEXT: s_mov_b32 s6, -1
3336 ; SI-NEXT: v_mov_b32_e32 v1, s34
3337 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3338 ; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 offset:16
3339 ; SI-NEXT: s_waitcnt vmcnt(0)
3340 ; SI-NEXT: buffer_wbinvl1
3341 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3342 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3343 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3344 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3345 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3346 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3347 ; SI-NEXT: s_setpc_b64 s[30:31]
3349 ; VI-LABEL: global_atomic_or_i32_noret_offset_scalar:
3351 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3352 ; VI-NEXT: s_add_u32 s34, s4, 16
3353 ; VI-NEXT: s_addc_u32 s35, s5, 0
3354 ; VI-NEXT: v_mov_b32_e32 v0, s34
3355 ; VI-NEXT: v_mov_b32_e32 v1, s35
3356 ; VI-NEXT: v_mov_b32_e32 v2, s6
3357 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3358 ; VI-NEXT: flat_atomic_or v[0:1], v2
3359 ; VI-NEXT: s_waitcnt vmcnt(0)
3360 ; VI-NEXT: buffer_wbinvl1_vol
3361 ; VI-NEXT: s_setpc_b64 s[30:31]
3363 ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
3365 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3366 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3367 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3368 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3369 ; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16
3370 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3371 ; GFX9-NEXT: buffer_wbinvl1_vol
3372 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3373 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3374 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3378 define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3379 ; SI-LABEL: global_atomic_or_i32_ret_scalar:
3381 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3382 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3383 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3384 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3385 ; SI-NEXT: s_waitcnt expcnt(0)
3386 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3387 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3388 ; SI-NEXT: s_mov_b32 s34, s6
3389 ; SI-NEXT: s_mov_b32 s7, 0xf000
3390 ; SI-NEXT: s_mov_b32 s6, -1
3391 ; SI-NEXT: v_mov_b32_e32 v0, s34
3392 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3393 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc
3394 ; SI-NEXT: s_waitcnt vmcnt(0)
3395 ; SI-NEXT: buffer_wbinvl1
3396 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3397 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3398 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3399 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3400 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3401 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3402 ; SI-NEXT: s_setpc_b64 s[30:31]
3404 ; VI-LABEL: global_atomic_or_i32_ret_scalar:
3406 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3407 ; VI-NEXT: v_mov_b32_e32 v0, s4
3408 ; VI-NEXT: v_mov_b32_e32 v1, s5
3409 ; VI-NEXT: v_mov_b32_e32 v2, s6
3410 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3411 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3412 ; VI-NEXT: s_waitcnt vmcnt(0)
3413 ; VI-NEXT: buffer_wbinvl1_vol
3414 ; VI-NEXT: s_setpc_b64 s[30:31]
3416 ; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
3418 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3419 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3420 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3421 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3422 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc
3423 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3424 ; GFX9-NEXT: buffer_wbinvl1_vol
3425 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3426 %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
3430 define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3431 ; SI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3433 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3434 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3435 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3436 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3437 ; SI-NEXT: s_waitcnt expcnt(0)
3438 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3439 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3440 ; SI-NEXT: s_mov_b32 s34, s6
3441 ; SI-NEXT: s_mov_b32 s7, 0xf000
3442 ; SI-NEXT: s_mov_b32 s6, -1
3443 ; SI-NEXT: v_mov_b32_e32 v0, s34
3444 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3445 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
3446 ; SI-NEXT: s_waitcnt vmcnt(0)
3447 ; SI-NEXT: buffer_wbinvl1
3448 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3449 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3450 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3451 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3452 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3453 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3454 ; SI-NEXT: s_setpc_b64 s[30:31]
3456 ; VI-LABEL: global_atomic_or_i32_ret_offset_scalar:
3458 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3459 ; VI-NEXT: s_add_u32 s34, s4, 16
3460 ; VI-NEXT: s_addc_u32 s35, s5, 0
3461 ; VI-NEXT: v_mov_b32_e32 v0, s34
3462 ; VI-NEXT: v_mov_b32_e32 v1, s35
3463 ; VI-NEXT: v_mov_b32_e32 v2, s6
3464 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3465 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3466 ; VI-NEXT: s_waitcnt vmcnt(0)
3467 ; VI-NEXT: buffer_wbinvl1_vol
3468 ; VI-NEXT: s_setpc_b64 s[30:31]
3470 ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
3472 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3473 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3474 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3475 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3476 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
3477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3478 ; GFX9-NEXT: buffer_wbinvl1_vol
3479 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3480 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3481 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
3485 ; ---------------------------------------------------------------------
3487 ; ---------------------------------------------------------------------
3489 define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3490 ; SI-LABEL: global_atomic_xor_i32_noret:
3492 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3493 ; SI-NEXT: s_mov_b32 s6, 0
3494 ; SI-NEXT: s_mov_b32 s7, 0xf000
3495 ; SI-NEXT: s_mov_b32 s4, s6
3496 ; SI-NEXT: s_mov_b32 s5, s6
3497 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3498 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
3499 ; SI-NEXT: s_waitcnt vmcnt(0)
3500 ; SI-NEXT: buffer_wbinvl1
3501 ; SI-NEXT: s_waitcnt expcnt(0)
3502 ; SI-NEXT: s_setpc_b64 s[30:31]
3504 ; VI-LABEL: global_atomic_xor_i32_noret:
3506 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3507 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3508 ; VI-NEXT: s_waitcnt vmcnt(0)
3509 ; VI-NEXT: buffer_wbinvl1_vol
3510 ; VI-NEXT: s_setpc_b64 s[30:31]
3512 ; GFX9-LABEL: global_atomic_xor_i32_noret:
3514 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3515 ; GFX9-NEXT: global_atomic_xor v[0:1], v2, off
3516 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3517 ; GFX9-NEXT: buffer_wbinvl1_vol
3518 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3519 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3523 define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3524 ; SI-LABEL: global_atomic_xor_i32_noret_offset:
3526 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3527 ; SI-NEXT: s_mov_b32 s6, 0
3528 ; SI-NEXT: s_mov_b32 s7, 0xf000
3529 ; SI-NEXT: s_mov_b32 s4, s6
3530 ; SI-NEXT: s_mov_b32 s5, s6
3531 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3532 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
3533 ; SI-NEXT: s_waitcnt vmcnt(0)
3534 ; SI-NEXT: buffer_wbinvl1
3535 ; SI-NEXT: s_waitcnt expcnt(0)
3536 ; SI-NEXT: s_setpc_b64 s[30:31]
3538 ; VI-LABEL: global_atomic_xor_i32_noret_offset:
3540 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3541 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3542 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3543 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3544 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3545 ; VI-NEXT: s_waitcnt vmcnt(0)
3546 ; VI-NEXT: buffer_wbinvl1_vol
3547 ; VI-NEXT: s_setpc_b64 s[30:31]
3549 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
3551 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3552 ; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16
3553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3554 ; GFX9-NEXT: buffer_wbinvl1_vol
3555 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3556 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3557 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3561 define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
3562 ; SI-LABEL: global_atomic_xor_i32_ret:
3564 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3565 ; SI-NEXT: s_mov_b32 s6, 0
3566 ; SI-NEXT: s_mov_b32 s7, 0xf000
3567 ; SI-NEXT: s_mov_b32 s4, s6
3568 ; SI-NEXT: s_mov_b32 s5, s6
3569 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3570 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
3571 ; SI-NEXT: s_waitcnt vmcnt(0)
3572 ; SI-NEXT: buffer_wbinvl1
3573 ; SI-NEXT: v_mov_b32_e32 v0, v2
3574 ; SI-NEXT: s_waitcnt expcnt(0)
3575 ; SI-NEXT: s_setpc_b64 s[30:31]
3577 ; VI-LABEL: global_atomic_xor_i32_ret:
3579 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3581 ; VI-NEXT: s_waitcnt vmcnt(0)
3582 ; VI-NEXT: buffer_wbinvl1_vol
3583 ; VI-NEXT: s_setpc_b64 s[30:31]
3585 ; GFX9-LABEL: global_atomic_xor_i32_ret:
3587 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3588 ; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
3589 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3590 ; GFX9-NEXT: buffer_wbinvl1_vol
3591 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3592 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3596 define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
3597 ; SI-LABEL: global_atomic_xor_i32_ret_offset:
3599 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3600 ; SI-NEXT: s_mov_b32 s6, 0
3601 ; SI-NEXT: s_mov_b32 s7, 0xf000
3602 ; SI-NEXT: s_mov_b32 s4, s6
3603 ; SI-NEXT: s_mov_b32 s5, s6
3604 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3605 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3606 ; SI-NEXT: s_waitcnt vmcnt(0)
3607 ; SI-NEXT: buffer_wbinvl1
3608 ; SI-NEXT: v_mov_b32_e32 v0, v2
3609 ; SI-NEXT: s_waitcnt expcnt(0)
3610 ; SI-NEXT: s_setpc_b64 s[30:31]
3612 ; VI-LABEL: global_atomic_xor_i32_ret_offset:
3614 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3615 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3616 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3617 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3618 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3619 ; VI-NEXT: s_waitcnt vmcnt(0)
3620 ; VI-NEXT: buffer_wbinvl1_vol
3621 ; VI-NEXT: s_setpc_b64 s[30:31]
3623 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
3625 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3626 ; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc
3627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3628 ; GFX9-NEXT: buffer_wbinvl1_vol
3629 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3630 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3631 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3635 define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3636 ; SI-LABEL: global_atomic_xor_i32_noret_scalar:
3638 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3639 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3640 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3641 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3642 ; SI-NEXT: s_waitcnt expcnt(0)
3643 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3644 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3645 ; SI-NEXT: s_mov_b32 s34, s6
3646 ; SI-NEXT: s_mov_b32 s7, 0xf000
3647 ; SI-NEXT: s_mov_b32 s6, -1
3648 ; SI-NEXT: v_mov_b32_e32 v1, s34
3649 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3650 ; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0
3651 ; SI-NEXT: s_waitcnt vmcnt(0)
3652 ; SI-NEXT: buffer_wbinvl1
3653 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3654 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3655 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3656 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3657 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3658 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3659 ; SI-NEXT: s_setpc_b64 s[30:31]
3661 ; VI-LABEL: global_atomic_xor_i32_noret_scalar:
3663 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3664 ; VI-NEXT: v_mov_b32_e32 v0, s4
3665 ; VI-NEXT: v_mov_b32_e32 v1, s5
3666 ; VI-NEXT: v_mov_b32_e32 v2, s6
3667 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3668 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3669 ; VI-NEXT: s_waitcnt vmcnt(0)
3670 ; VI-NEXT: buffer_wbinvl1_vol
3671 ; VI-NEXT: s_setpc_b64 s[30:31]
3673 ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
3675 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3676 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3677 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3678 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3679 ; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5]
3680 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3681 ; GFX9-NEXT: buffer_wbinvl1_vol
3682 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3683 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3687 define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3688 ; SI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3690 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3691 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3692 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3693 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3694 ; SI-NEXT: s_waitcnt expcnt(0)
3695 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3696 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3697 ; SI-NEXT: s_mov_b32 s34, s6
3698 ; SI-NEXT: s_mov_b32 s7, 0xf000
3699 ; SI-NEXT: s_mov_b32 s6, -1
3700 ; SI-NEXT: v_mov_b32_e32 v1, s34
3701 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3702 ; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 offset:16
3703 ; SI-NEXT: s_waitcnt vmcnt(0)
3704 ; SI-NEXT: buffer_wbinvl1
3705 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3706 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3707 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3708 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3709 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3710 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3711 ; SI-NEXT: s_setpc_b64 s[30:31]
3713 ; VI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3715 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3716 ; VI-NEXT: s_add_u32 s34, s4, 16
3717 ; VI-NEXT: s_addc_u32 s35, s5, 0
3718 ; VI-NEXT: v_mov_b32_e32 v0, s34
3719 ; VI-NEXT: v_mov_b32_e32 v1, s35
3720 ; VI-NEXT: v_mov_b32_e32 v2, s6
3721 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3722 ; VI-NEXT: flat_atomic_xor v[0:1], v2
3723 ; VI-NEXT: s_waitcnt vmcnt(0)
3724 ; VI-NEXT: buffer_wbinvl1_vol
3725 ; VI-NEXT: s_setpc_b64 s[30:31]
3727 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
3729 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3730 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3731 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3732 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3733 ; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16
3734 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3735 ; GFX9-NEXT: buffer_wbinvl1_vol
3736 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3737 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3738 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3742 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
3743 ; SI-LABEL: global_atomic_xor_i32_ret_scalar:
3745 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3746 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3747 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3748 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3749 ; SI-NEXT: s_waitcnt expcnt(0)
3750 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3751 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3752 ; SI-NEXT: s_mov_b32 s34, s6
3753 ; SI-NEXT: s_mov_b32 s7, 0xf000
3754 ; SI-NEXT: s_mov_b32 s6, -1
3755 ; SI-NEXT: v_mov_b32_e32 v0, s34
3756 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3757 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc
3758 ; SI-NEXT: s_waitcnt vmcnt(0)
3759 ; SI-NEXT: buffer_wbinvl1
3760 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3761 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3762 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3763 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3764 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3765 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3766 ; SI-NEXT: s_setpc_b64 s[30:31]
3768 ; VI-LABEL: global_atomic_xor_i32_ret_scalar:
3770 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3771 ; VI-NEXT: v_mov_b32_e32 v0, s4
3772 ; VI-NEXT: v_mov_b32_e32 v1, s5
3773 ; VI-NEXT: v_mov_b32_e32 v2, s6
3774 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3775 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3776 ; VI-NEXT: s_waitcnt vmcnt(0)
3777 ; VI-NEXT: buffer_wbinvl1_vol
3778 ; VI-NEXT: s_setpc_b64 s[30:31]
3780 ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
3782 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3783 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3784 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3785 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3786 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc
3787 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3788 ; GFX9-NEXT: buffer_wbinvl1_vol
3789 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3790 %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
3794 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
3795 ; SI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3797 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3798 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3799 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
3800 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3801 ; SI-NEXT: s_waitcnt expcnt(0)
3802 ; SI-NEXT: v_writelane_b32 v1, s6, 0
3803 ; SI-NEXT: v_writelane_b32 v1, s7, 1
3804 ; SI-NEXT: s_mov_b32 s34, s6
3805 ; SI-NEXT: s_mov_b32 s7, 0xf000
3806 ; SI-NEXT: s_mov_b32 s6, -1
3807 ; SI-NEXT: v_mov_b32_e32 v0, s34
3808 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3809 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
3810 ; SI-NEXT: s_waitcnt vmcnt(0)
3811 ; SI-NEXT: buffer_wbinvl1
3812 ; SI-NEXT: v_readlane_b32 s7, v1, 1
3813 ; SI-NEXT: v_readlane_b32 s6, v1, 0
3814 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3815 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
3816 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3817 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3818 ; SI-NEXT: s_setpc_b64 s[30:31]
3820 ; VI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3822 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3823 ; VI-NEXT: s_add_u32 s34, s4, 16
3824 ; VI-NEXT: s_addc_u32 s35, s5, 0
3825 ; VI-NEXT: v_mov_b32_e32 v0, s34
3826 ; VI-NEXT: v_mov_b32_e32 v1, s35
3827 ; VI-NEXT: v_mov_b32_e32 v2, s6
3828 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3829 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3830 ; VI-NEXT: s_waitcnt vmcnt(0)
3831 ; VI-NEXT: buffer_wbinvl1_vol
3832 ; VI-NEXT: s_setpc_b64 s[30:31]
3834 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
3836 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3837 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3838 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3839 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3840 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
3841 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3842 ; GFX9-NEXT: buffer_wbinvl1_vol
3843 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3844 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
3845 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
3849 ; ---------------------------------------------------------------------
3851 ; ---------------------------------------------------------------------
3853 define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
3854 ; SI-LABEL: global_atomic_max_i32_noret:
3856 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3857 ; SI-NEXT: s_mov_b32 s6, 0
3858 ; SI-NEXT: s_mov_b32 s7, 0xf000
3859 ; SI-NEXT: s_mov_b32 s4, s6
3860 ; SI-NEXT: s_mov_b32 s5, s6
3861 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
3862 ; SI-NEXT: s_mov_b64 s[8:9], 0
3863 ; SI-NEXT: .LBB64_1: ; %atomicrmw.start
3864 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3865 ; SI-NEXT: s_waitcnt vmcnt(0)
3866 ; SI-NEXT: v_max_i32_e32 v3, v4, v2
3867 ; SI-NEXT: s_waitcnt expcnt(0)
3868 ; SI-NEXT: v_mov_b32_e32 v6, v4
3869 ; SI-NEXT: v_mov_b32_e32 v5, v3
3870 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3871 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
3872 ; SI-NEXT: s_waitcnt vmcnt(0)
3873 ; SI-NEXT: buffer_wbinvl1
3874 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
3875 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
3876 ; SI-NEXT: v_mov_b32_e32 v4, v5
3877 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
3878 ; SI-NEXT: s_cbranch_execnz .LBB64_1
3879 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3880 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
3881 ; SI-NEXT: s_waitcnt expcnt(0)
3882 ; SI-NEXT: s_setpc_b64 s[30:31]
3884 ; VI-LABEL: global_atomic_max_i32_noret:
3886 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3887 ; VI-NEXT: flat_load_dword v4, v[0:1]
3888 ; VI-NEXT: s_mov_b64 s[4:5], 0
3889 ; VI-NEXT: .LBB64_1: ; %atomicrmw.start
3890 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3891 ; VI-NEXT: s_waitcnt vmcnt(0)
3892 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
3893 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3894 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3895 ; VI-NEXT: s_waitcnt vmcnt(0)
3896 ; VI-NEXT: buffer_wbinvl1_vol
3897 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3898 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3899 ; VI-NEXT: v_mov_b32_e32 v4, v3
3900 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
3901 ; VI-NEXT: s_cbranch_execnz .LBB64_1
3902 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3903 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
3904 ; VI-NEXT: s_setpc_b64 s[30:31]
3906 ; GFX9-LABEL: global_atomic_max_i32_noret:
3908 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3909 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
3910 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
3911 ; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
3912 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3913 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3914 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
3915 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3916 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
3917 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3918 ; GFX9-NEXT: buffer_wbinvl1_vol
3919 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3920 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3921 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
3922 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
3923 ; GFX9-NEXT: s_cbranch_execnz .LBB64_1
3924 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3925 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
3926 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3927 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
3931 define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
3932 ; SI-LABEL: global_atomic_max_i32_noret_offset:
3934 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3935 ; SI-NEXT: s_mov_b32 s6, 0
3936 ; SI-NEXT: s_mov_b32 s7, 0xf000
3937 ; SI-NEXT: s_mov_b32 s4, s6
3938 ; SI-NEXT: s_mov_b32 s5, s6
3939 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
3940 ; SI-NEXT: s_mov_b64 s[8:9], 0
3941 ; SI-NEXT: .LBB65_1: ; %atomicrmw.start
3942 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3943 ; SI-NEXT: s_waitcnt vmcnt(0)
3944 ; SI-NEXT: v_max_i32_e32 v3, v4, v2
3945 ; SI-NEXT: s_waitcnt expcnt(0)
3946 ; SI-NEXT: v_mov_b32_e32 v6, v4
3947 ; SI-NEXT: v_mov_b32_e32 v5, v3
3948 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3949 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
3950 ; SI-NEXT: s_waitcnt vmcnt(0)
3951 ; SI-NEXT: buffer_wbinvl1
3952 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
3953 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
3954 ; SI-NEXT: v_mov_b32_e32 v4, v5
3955 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
3956 ; SI-NEXT: s_cbranch_execnz .LBB65_1
3957 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3958 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
3959 ; SI-NEXT: s_waitcnt expcnt(0)
3960 ; SI-NEXT: s_setpc_b64 s[30:31]
3962 ; VI-LABEL: global_atomic_max_i32_noret_offset:
3964 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3965 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3966 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3967 ; VI-NEXT: flat_load_dword v4, v[0:1]
3968 ; VI-NEXT: s_mov_b64 s[4:5], 0
3969 ; VI-NEXT: .LBB65_1: ; %atomicrmw.start
3970 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3971 ; VI-NEXT: s_waitcnt vmcnt(0)
3972 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
3973 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3974 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3975 ; VI-NEXT: s_waitcnt vmcnt(0)
3976 ; VI-NEXT: buffer_wbinvl1_vol
3977 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3978 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3979 ; VI-NEXT: v_mov_b32_e32 v4, v3
3980 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
3981 ; VI-NEXT: s_cbranch_execnz .LBB65_1
3982 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3983 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
3984 ; VI-NEXT: s_setpc_b64 s[30:31]
3986 ; GFX9-LABEL: global_atomic_max_i32_noret_offset:
3988 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3989 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
3990 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
3991 ; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
3992 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3993 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3994 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
3995 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3996 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
3997 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3998 ; GFX9-NEXT: buffer_wbinvl1_vol
3999 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4000 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4001 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4002 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4003 ; GFX9-NEXT: s_cbranch_execnz .LBB65_1
4004 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4005 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4006 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4007 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4008 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4012 define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
4013 ; SI-LABEL: global_atomic_max_i32_ret:
4015 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4016 ; SI-NEXT: s_mov_b32 s6, 0
4017 ; SI-NEXT: s_mov_b32 s7, 0xf000
4018 ; SI-NEXT: s_mov_b32 s4, s6
4019 ; SI-NEXT: s_mov_b32 s5, s6
4020 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
4021 ; SI-NEXT: s_mov_b64 s[8:9], 0
4022 ; SI-NEXT: .LBB66_1: ; %atomicrmw.start
4023 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4024 ; SI-NEXT: s_waitcnt vmcnt(0)
4025 ; SI-NEXT: v_mov_b32_e32 v5, v3
4026 ; SI-NEXT: s_waitcnt expcnt(0)
4027 ; SI-NEXT: v_max_i32_e32 v4, v5, v2
4028 ; SI-NEXT: v_mov_b32_e32 v3, v4
4029 ; SI-NEXT: v_mov_b32_e32 v4, v5
4030 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4031 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
4032 ; SI-NEXT: s_waitcnt vmcnt(0)
4033 ; SI-NEXT: buffer_wbinvl1
4034 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
4035 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4036 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4037 ; SI-NEXT: s_cbranch_execnz .LBB66_1
4038 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4039 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4040 ; SI-NEXT: v_mov_b32_e32 v0, v3
4041 ; SI-NEXT: s_waitcnt expcnt(0)
4042 ; SI-NEXT: s_setpc_b64 s[30:31]
4044 ; VI-LABEL: global_atomic_max_i32_ret:
4046 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4047 ; VI-NEXT: flat_load_dword v3, v[0:1]
4048 ; VI-NEXT: s_mov_b64 s[4:5], 0
4049 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start
4050 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4051 ; VI-NEXT: s_waitcnt vmcnt(0)
4052 ; VI-NEXT: v_mov_b32_e32 v4, v3
4053 ; VI-NEXT: v_max_i32_e32 v3, v4, v2
4054 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4055 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4056 ; VI-NEXT: s_waitcnt vmcnt(0)
4057 ; VI-NEXT: buffer_wbinvl1_vol
4058 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4059 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4060 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4061 ; VI-NEXT: s_cbranch_execnz .LBB66_1
4062 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4063 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4064 ; VI-NEXT: v_mov_b32_e32 v0, v3
4065 ; VI-NEXT: s_setpc_b64 s[30:31]
4067 ; GFX9-LABEL: global_atomic_max_i32_ret:
4069 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4070 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
4071 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4072 ; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
4073 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4074 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4075 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4076 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4077 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4078 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
4079 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4080 ; GFX9-NEXT: buffer_wbinvl1_vol
4081 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4082 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4083 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4084 ; GFX9-NEXT: s_cbranch_execnz .LBB66_1
4085 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4086 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4087 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
4088 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4089 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4093 define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
4094 ; SI-LABEL: global_atomic_max_i32_ret_offset:
4096 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4097 ; SI-NEXT: s_mov_b32 s6, 0
4098 ; SI-NEXT: s_mov_b32 s7, 0xf000
4099 ; SI-NEXT: s_mov_b32 s4, s6
4100 ; SI-NEXT: s_mov_b32 s5, s6
4101 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
4102 ; SI-NEXT: s_mov_b64 s[8:9], 0
4103 ; SI-NEXT: .LBB67_1: ; %atomicrmw.start
4104 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4105 ; SI-NEXT: s_waitcnt vmcnt(0)
4106 ; SI-NEXT: v_mov_b32_e32 v5, v3
4107 ; SI-NEXT: s_waitcnt expcnt(0)
4108 ; SI-NEXT: v_max_i32_e32 v4, v5, v2
4109 ; SI-NEXT: v_mov_b32_e32 v3, v4
4110 ; SI-NEXT: v_mov_b32_e32 v4, v5
4111 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4112 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
4113 ; SI-NEXT: s_waitcnt vmcnt(0)
4114 ; SI-NEXT: buffer_wbinvl1
4115 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
4116 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4117 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4118 ; SI-NEXT: s_cbranch_execnz .LBB67_1
4119 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4120 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4121 ; SI-NEXT: v_mov_b32_e32 v0, v3
4122 ; SI-NEXT: s_waitcnt expcnt(0)
4123 ; SI-NEXT: s_setpc_b64 s[30:31]
4125 ; VI-LABEL: global_atomic_max_i32_ret_offset:
4127 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4128 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
4129 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4130 ; VI-NEXT: flat_load_dword v0, v[3:4]
4131 ; VI-NEXT: s_mov_b64 s[4:5], 0
4132 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start
4133 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4134 ; VI-NEXT: s_waitcnt vmcnt(0)
4135 ; VI-NEXT: v_mov_b32_e32 v1, v0
4136 ; VI-NEXT: v_max_i32_e32 v0, v1, v2
4137 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4138 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4139 ; VI-NEXT: s_waitcnt vmcnt(0)
4140 ; VI-NEXT: buffer_wbinvl1_vol
4141 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4142 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4143 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4144 ; VI-NEXT: s_cbranch_execnz .LBB67_1
4145 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4146 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4147 ; VI-NEXT: s_setpc_b64 s[30:31]
4149 ; GFX9-LABEL: global_atomic_max_i32_ret_offset:
4151 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4152 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
4153 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4154 ; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
4155 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4156 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4157 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
4158 ; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
4159 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4160 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
4161 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4162 ; GFX9-NEXT: buffer_wbinvl1_vol
4163 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4164 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4165 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4166 ; GFX9-NEXT: s_cbranch_execnz .LBB67_1
4167 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4168 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4169 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
4170 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4171 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4172 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4176 define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4177 ; SI-LABEL: global_atomic_max_i32_noret_scalar:
4179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4180 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4181 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
4182 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4183 ; SI-NEXT: s_waitcnt expcnt(0)
4184 ; SI-NEXT: v_writelane_b32 v0, s6, 0
4185 ; SI-NEXT: v_writelane_b32 v0, s7, 1
4186 ; SI-NEXT: s_mov_b32 s34, s6
4187 ; SI-NEXT: s_mov_b32 s7, 0xf000
4188 ; SI-NEXT: s_mov_b32 s6, -1
4189 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
4190 ; SI-NEXT: s_mov_b64 s[36:37], 0
4191 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start
4192 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4193 ; SI-NEXT: s_waitcnt vmcnt(0)
4194 ; SI-NEXT: v_max_i32_e32 v1, s34, v2
4195 ; SI-NEXT: s_waitcnt expcnt(0)
4196 ; SI-NEXT: v_mov_b32_e32 v4, v2
4197 ; SI-NEXT: v_mov_b32_e32 v3, v1
4198 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4199 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
4200 ; SI-NEXT: s_waitcnt vmcnt(0)
4201 ; SI-NEXT: buffer_wbinvl1
4202 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
4203 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4204 ; SI-NEXT: v_mov_b32_e32 v2, v3
4205 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4206 ; SI-NEXT: s_cbranch_execnz .LBB68_1
4207 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4208 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4209 ; SI-NEXT: v_readlane_b32 s7, v0, 1
4210 ; SI-NEXT: v_readlane_b32 s6, v0, 0
4211 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4212 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
4213 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4214 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4215 ; SI-NEXT: s_setpc_b64 s[30:31]
4217 ; VI-LABEL: global_atomic_max_i32_noret_scalar:
4219 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4220 ; VI-NEXT: v_mov_b32_e32 v0, s4
4221 ; VI-NEXT: v_mov_b32_e32 v1, s5
4222 ; VI-NEXT: flat_load_dword v1, v[0:1]
4223 ; VI-NEXT: s_mov_b64 s[34:35], 0
4224 ; VI-NEXT: .LBB68_1: ; %atomicrmw.start
4225 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4226 ; VI-NEXT: v_mov_b32_e32 v2, s4
4227 ; VI-NEXT: s_waitcnt vmcnt(0)
4228 ; VI-NEXT: v_max_i32_e32 v0, s6, v1
4229 ; VI-NEXT: v_mov_b32_e32 v3, s5
4230 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4231 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4232 ; VI-NEXT: s_waitcnt vmcnt(0)
4233 ; VI-NEXT: buffer_wbinvl1_vol
4234 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4235 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4236 ; VI-NEXT: v_mov_b32_e32 v1, v0
4237 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4238 ; VI-NEXT: s_cbranch_execnz .LBB68_1
4239 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4240 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4241 ; VI-NEXT: s_setpc_b64 s[30:31]
4243 ; GFX9-LABEL: global_atomic_max_i32_noret_scalar:
4245 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4246 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4247 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
4248 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4249 ; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start
4250 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4251 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4252 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4253 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4254 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
4255 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4256 ; GFX9-NEXT: buffer_wbinvl1_vol
4257 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4258 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4259 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4260 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4261 ; GFX9-NEXT: s_cbranch_execnz .LBB68_1
4262 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4263 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4264 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4265 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4269 define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4270 ; SI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4272 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4273 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4274 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
4275 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4276 ; SI-NEXT: s_waitcnt expcnt(0)
4277 ; SI-NEXT: v_writelane_b32 v0, s6, 0
4278 ; SI-NEXT: v_writelane_b32 v0, s7, 1
4279 ; SI-NEXT: s_mov_b32 s34, s6
4280 ; SI-NEXT: s_mov_b32 s7, 0xf000
4281 ; SI-NEXT: s_mov_b32 s6, -1
4282 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
4283 ; SI-NEXT: s_mov_b64 s[36:37], 0
4284 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start
4285 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4286 ; SI-NEXT: s_waitcnt vmcnt(0)
4287 ; SI-NEXT: v_max_i32_e32 v1, s34, v2
4288 ; SI-NEXT: s_waitcnt expcnt(0)
4289 ; SI-NEXT: v_mov_b32_e32 v4, v2
4290 ; SI-NEXT: v_mov_b32_e32 v3, v1
4291 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4292 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
4293 ; SI-NEXT: s_waitcnt vmcnt(0)
4294 ; SI-NEXT: buffer_wbinvl1
4295 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
4296 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4297 ; SI-NEXT: v_mov_b32_e32 v2, v3
4298 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4299 ; SI-NEXT: s_cbranch_execnz .LBB69_1
4300 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4301 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4302 ; SI-NEXT: v_readlane_b32 s7, v0, 1
4303 ; SI-NEXT: v_readlane_b32 s6, v0, 0
4304 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4305 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
4306 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4307 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4308 ; SI-NEXT: s_setpc_b64 s[30:31]
4310 ; VI-LABEL: global_atomic_max_i32_noret_offset_scalar:
4312 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4313 ; VI-NEXT: s_add_u32 s34, s4, 16
4314 ; VI-NEXT: s_addc_u32 s35, s5, 0
4315 ; VI-NEXT: v_mov_b32_e32 v0, s34
4316 ; VI-NEXT: v_mov_b32_e32 v1, s35
4317 ; VI-NEXT: flat_load_dword v1, v[0:1]
4318 ; VI-NEXT: s_mov_b64 s[36:37], 0
4319 ; VI-NEXT: .LBB69_1: ; %atomicrmw.start
4320 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4321 ; VI-NEXT: v_mov_b32_e32 v2, s34
4322 ; VI-NEXT: s_waitcnt vmcnt(0)
4323 ; VI-NEXT: v_max_i32_e32 v0, s6, v1
4324 ; VI-NEXT: v_mov_b32_e32 v3, s35
4325 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4326 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4327 ; VI-NEXT: s_waitcnt vmcnt(0)
4328 ; VI-NEXT: buffer_wbinvl1_vol
4329 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4330 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4331 ; VI-NEXT: v_mov_b32_e32 v1, v0
4332 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4333 ; VI-NEXT: s_cbranch_execnz .LBB69_1
4334 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4335 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
4336 ; VI-NEXT: s_setpc_b64 s[30:31]
4338 ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar:
4340 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4341 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4342 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
4343 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4344 ; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start
4345 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4347 ; GFX9-NEXT: v_max_i32_e32 v0, s6, v1
4348 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4349 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
4350 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4351 ; GFX9-NEXT: buffer_wbinvl1_vol
4352 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4353 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4354 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4355 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4356 ; GFX9-NEXT: s_cbranch_execnz .LBB69_1
4357 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4358 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4359 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4360 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4361 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4365 define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
4366 ; SI-LABEL: global_atomic_max_i32_ret_scalar:
4368 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4369 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4370 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
4371 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4372 ; SI-NEXT: s_waitcnt expcnt(0)
4373 ; SI-NEXT: v_writelane_b32 v1, s6, 0
4374 ; SI-NEXT: v_writelane_b32 v1, s7, 1
4375 ; SI-NEXT: s_mov_b32 s34, s6
4376 ; SI-NEXT: s_mov_b32 s7, 0xf000
4377 ; SI-NEXT: s_mov_b32 s6, -1
4378 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
4379 ; SI-NEXT: s_mov_b64 s[36:37], 0
4380 ; SI-NEXT: .LBB70_1: ; %atomicrmw.start
4381 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4382 ; SI-NEXT: s_waitcnt vmcnt(0)
4383 ; SI-NEXT: v_mov_b32_e32 v4, v2
4384 ; SI-NEXT: s_waitcnt expcnt(0)
4385 ; SI-NEXT: v_max_i32_e32 v3, s34, v4
4386 ; SI-NEXT: v_mov_b32_e32 v2, v3
4387 ; SI-NEXT: v_mov_b32_e32 v3, v4
4388 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4389 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4390 ; SI-NEXT: s_waitcnt vmcnt(0)
4391 ; SI-NEXT: buffer_wbinvl1
4392 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
4393 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4394 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4395 ; SI-NEXT: s_cbranch_execnz .LBB70_1
4396 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4397 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4398 ; SI-NEXT: v_mov_b32_e32 v0, v2
4399 ; SI-NEXT: v_readlane_b32 s7, v1, 1
4400 ; SI-NEXT: v_readlane_b32 s6, v1, 0
4401 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4402 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
4403 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4404 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4405 ; SI-NEXT: s_setpc_b64 s[30:31]
4407 ; VI-LABEL: global_atomic_max_i32_ret_scalar:
4409 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4410 ; VI-NEXT: v_mov_b32_e32 v0, s4
4411 ; VI-NEXT: v_mov_b32_e32 v1, s5
4412 ; VI-NEXT: flat_load_dword v0, v[0:1]
4413 ; VI-NEXT: s_mov_b64 s[34:35], 0
4414 ; VI-NEXT: .LBB70_1: ; %atomicrmw.start
4415 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4416 ; VI-NEXT: s_waitcnt vmcnt(0)
4417 ; VI-NEXT: v_mov_b32_e32 v1, v0
4418 ; VI-NEXT: v_mov_b32_e32 v2, s4
4419 ; VI-NEXT: v_mov_b32_e32 v3, s5
4420 ; VI-NEXT: v_max_i32_e32 v0, s6, v1
4421 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4422 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4423 ; VI-NEXT: s_waitcnt vmcnt(0)
4424 ; VI-NEXT: buffer_wbinvl1_vol
4425 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4426 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4427 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4428 ; VI-NEXT: s_cbranch_execnz .LBB70_1
4429 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4430 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4431 ; VI-NEXT: s_setpc_b64 s[30:31]
4433 ; GFX9-LABEL: global_atomic_max_i32_ret_scalar:
4435 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4436 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4437 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
4438 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4439 ; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start
4440 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4441 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4442 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4443 ; GFX9-NEXT: v_max_i32_e32 v2, s6, v3
4444 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4445 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
4446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4447 ; GFX9-NEXT: buffer_wbinvl1_vol
4448 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4449 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4450 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4451 ; GFX9-NEXT: s_cbranch_execnz .LBB70_1
4452 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4453 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4454 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4455 %result = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4459 define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
4460 ; SI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4462 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4463 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4464 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
4465 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4466 ; SI-NEXT: s_waitcnt expcnt(0)
4467 ; SI-NEXT: v_writelane_b32 v1, s6, 0
4468 ; SI-NEXT: v_writelane_b32 v1, s7, 1
4469 ; SI-NEXT: s_mov_b32 s34, s6
4470 ; SI-NEXT: s_mov_b32 s7, 0xf000
4471 ; SI-NEXT: s_mov_b32 s6, -1
4472 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
4473 ; SI-NEXT: s_mov_b64 s[36:37], 0
4474 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start
4475 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4476 ; SI-NEXT: s_waitcnt vmcnt(0)
4477 ; SI-NEXT: v_mov_b32_e32 v4, v2
4478 ; SI-NEXT: s_waitcnt expcnt(0)
4479 ; SI-NEXT: v_max_i32_e32 v3, s34, v4
4480 ; SI-NEXT: v_mov_b32_e32 v2, v3
4481 ; SI-NEXT: v_mov_b32_e32 v3, v4
4482 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4483 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4484 ; SI-NEXT: s_waitcnt vmcnt(0)
4485 ; SI-NEXT: buffer_wbinvl1
4486 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
4487 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4488 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4489 ; SI-NEXT: s_cbranch_execnz .LBB71_1
4490 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4491 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4492 ; SI-NEXT: v_mov_b32_e32 v0, v2
4493 ; SI-NEXT: v_readlane_b32 s7, v1, 1
4494 ; SI-NEXT: v_readlane_b32 s6, v1, 0
4495 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4496 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
4497 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4498 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4499 ; SI-NEXT: s_setpc_b64 s[30:31]
4501 ; VI-LABEL: global_atomic_max_i32_ret_offset_scalar:
4503 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4504 ; VI-NEXT: s_add_u32 s34, s4, 16
4505 ; VI-NEXT: s_addc_u32 s35, s5, 0
4506 ; VI-NEXT: v_mov_b32_e32 v0, s34
4507 ; VI-NEXT: v_mov_b32_e32 v1, s35
4508 ; VI-NEXT: flat_load_dword v0, v[0:1]
4509 ; VI-NEXT: s_mov_b64 s[36:37], 0
4510 ; VI-NEXT: .LBB71_1: ; %atomicrmw.start
4511 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4512 ; VI-NEXT: s_waitcnt vmcnt(0)
4513 ; VI-NEXT: v_mov_b32_e32 v1, v0
4514 ; VI-NEXT: v_mov_b32_e32 v2, s34
4515 ; VI-NEXT: v_mov_b32_e32 v3, s35
4516 ; VI-NEXT: v_max_i32_e32 v0, s6, v1
4517 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4518 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4519 ; VI-NEXT: s_waitcnt vmcnt(0)
4520 ; VI-NEXT: buffer_wbinvl1_vol
4521 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4522 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4523 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4524 ; VI-NEXT: s_cbranch_execnz .LBB71_1
4525 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4526 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
4527 ; VI-NEXT: s_setpc_b64 s[30:31]
4529 ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar:
4531 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4532 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4533 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
4534 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4535 ; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start
4536 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4537 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4538 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4539 ; GFX9-NEXT: v_max_i32_e32 v2, s6, v3
4540 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4541 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
4542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4543 ; GFX9-NEXT: buffer_wbinvl1_vol
4544 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4545 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4546 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4547 ; GFX9-NEXT: s_cbranch_execnz .LBB71_1
4548 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4549 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4550 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4551 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
4552 %result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4556 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
4557 ; SI-LABEL: atomic_max_i32_addr64_offset:
4558 ; SI: ; %bb.0: ; %entry
4559 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4560 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4561 ; SI-NEXT: s_ashr_i32 s5, s3, 31
4562 ; SI-NEXT: s_mov_b32 s4, s3
4563 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4564 ; SI-NEXT: s_add_u32 s4, s0, s4
4565 ; SI-NEXT: s_addc_u32 s5, s1, s5
4566 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
4567 ; SI-NEXT: s_mov_b64 s[0:1], 0
4568 ; SI-NEXT: s_mov_b32 s7, 0xf000
4569 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4570 ; SI-NEXT: v_mov_b32_e32 v1, s3
4571 ; SI-NEXT: s_mov_b32 s6, -1
4572 ; SI-NEXT: .LBB72_1: ; %atomicrmw.start
4573 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4574 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
4575 ; SI-NEXT: s_waitcnt expcnt(0)
4576 ; SI-NEXT: v_mov_b32_e32 v3, v1
4577 ; SI-NEXT: v_mov_b32_e32 v2, v0
4578 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4579 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4580 ; SI-NEXT: s_waitcnt vmcnt(0)
4581 ; SI-NEXT: buffer_wbinvl1
4582 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4583 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4584 ; SI-NEXT: v_mov_b32_e32 v1, v2
4585 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4586 ; SI-NEXT: s_cbranch_execnz .LBB72_1
4587 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4590 ; VI-LABEL: atomic_max_i32_addr64_offset:
4591 ; VI: ; %bb.0: ; %entry
4592 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4593 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4594 ; VI-NEXT: s_ashr_i32 s5, s3, 31
4595 ; VI-NEXT: s_mov_b32 s4, s3
4596 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4597 ; VI-NEXT: s_add_u32 s0, s0, s4
4598 ; VI-NEXT: s_addc_u32 s1, s1, s5
4599 ; VI-NEXT: s_load_dword s3, s[0:1], 0x10
4600 ; VI-NEXT: s_add_u32 s0, s0, 16
4601 ; VI-NEXT: s_addc_u32 s1, s1, 0
4602 ; VI-NEXT: s_mov_b64 s[4:5], 0
4603 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4604 ; VI-NEXT: v_mov_b32_e32 v1, s3
4605 ; VI-NEXT: .LBB72_1: ; %atomicrmw.start
4606 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4607 ; VI-NEXT: v_mov_b32_e32 v3, s1
4608 ; VI-NEXT: v_max_i32_e32 v0, s2, v1
4609 ; VI-NEXT: v_mov_b32_e32 v2, s0
4610 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4611 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4612 ; VI-NEXT: s_waitcnt vmcnt(0)
4613 ; VI-NEXT: buffer_wbinvl1_vol
4614 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4615 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4616 ; VI-NEXT: v_mov_b32_e32 v1, v0
4617 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4618 ; VI-NEXT: s_cbranch_execnz .LBB72_1
4619 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4622 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
4623 ; GFX9: ; %bb.0: ; %entry
4624 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4625 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4626 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4627 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
4628 ; GFX9-NEXT: s_mov_b32 s4, s3
4629 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4630 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4631 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4632 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
4633 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4634 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4635 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4636 ; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
4637 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4638 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
4639 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4640 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
4641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4642 ; GFX9-NEXT: buffer_wbinvl1_vol
4643 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4644 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4645 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4646 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4647 ; GFX9-NEXT: s_cbranch_execnz .LBB72_1
4648 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4649 ; GFX9-NEXT: s_endpgm
4651 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4652 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4653 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4657 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4658 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
4659 ; SI: ; %bb.0: ; %entry
4660 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4661 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4662 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4663 ; SI-NEXT: s_ashr_i32 s5, s9, 31
4664 ; SI-NEXT: s_mov_b32 s4, s9
4665 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4666 ; SI-NEXT: s_add_u32 s4, s0, s4
4667 ; SI-NEXT: s_addc_u32 s5, s1, s5
4668 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
4669 ; SI-NEXT: s_mov_b64 s[0:1], 0
4670 ; SI-NEXT: s_mov_b32 s7, 0xf000
4671 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4672 ; SI-NEXT: v_mov_b32_e32 v1, s6
4673 ; SI-NEXT: s_mov_b32 s6, -1
4674 ; SI-NEXT: .LBB73_1: ; %atomicrmw.start
4675 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4676 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
4677 ; SI-NEXT: s_waitcnt expcnt(0)
4678 ; SI-NEXT: v_mov_b32_e32 v3, v1
4679 ; SI-NEXT: v_mov_b32_e32 v2, v0
4680 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4681 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
4682 ; SI-NEXT: s_waitcnt vmcnt(0)
4683 ; SI-NEXT: buffer_wbinvl1
4684 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4685 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4686 ; SI-NEXT: v_mov_b32_e32 v1, v2
4687 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4688 ; SI-NEXT: s_cbranch_execnz .LBB73_1
4689 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4690 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
4691 ; SI-NEXT: s_mov_b32 s7, 0xf000
4692 ; SI-NEXT: s_mov_b32 s6, -1
4693 ; SI-NEXT: s_mov_b32 s4, s2
4694 ; SI-NEXT: s_mov_b32 s5, s3
4695 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
4698 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
4699 ; VI: ; %bb.0: ; %entry
4700 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
4701 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4702 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4703 ; VI-NEXT: s_ashr_i32 s7, s5, 31
4704 ; VI-NEXT: s_mov_b32 s6, s5
4705 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
4706 ; VI-NEXT: s_add_u32 s0, s0, s6
4707 ; VI-NEXT: s_addc_u32 s1, s1, s7
4708 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10
4709 ; VI-NEXT: s_add_u32 s0, s0, 16
4710 ; VI-NEXT: s_addc_u32 s1, s1, 0
4711 ; VI-NEXT: s_mov_b64 s[6:7], 0
4712 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4713 ; VI-NEXT: v_mov_b32_e32 v0, s5
4714 ; VI-NEXT: .LBB73_1: ; %atomicrmw.start
4715 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4716 ; VI-NEXT: v_mov_b32_e32 v1, v0
4717 ; VI-NEXT: v_mov_b32_e32 v3, s1
4718 ; VI-NEXT: v_mov_b32_e32 v2, s0
4719 ; VI-NEXT: v_max_i32_e32 v0, s4, v1
4720 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4721 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4722 ; VI-NEXT: s_waitcnt vmcnt(0)
4723 ; VI-NEXT: buffer_wbinvl1_vol
4724 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4725 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4726 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
4727 ; VI-NEXT: s_cbranch_execnz .LBB73_1
4728 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4729 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
4730 ; VI-NEXT: v_mov_b32_e32 v1, s2
4731 ; VI-NEXT: v_mov_b32_e32 v2, s3
4732 ; VI-NEXT: flat_store_dword v[1:2], v0
4735 ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
4736 ; GFX9: ; %bb.0: ; %entry
4737 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4738 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4739 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4740 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4741 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
4742 ; GFX9-NEXT: s_mov_b32 s0, s3
4743 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
4744 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4745 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4746 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
4747 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4748 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4749 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
4750 ; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
4751 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4752 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4753 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3
4754 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4755 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
4756 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4757 ; GFX9-NEXT: buffer_wbinvl1_vol
4758 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4759 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4760 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4761 ; GFX9-NEXT: s_cbranch_execnz .LBB73_1
4762 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4763 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4764 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4765 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
4766 ; GFX9-NEXT: s_endpgm
4768 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4769 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
4770 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst
4771 store i32 %tmp0, ptr addrspace(1) %out2
4775 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i32 %index) {
4776 ; SI-LABEL: atomic_max_i32_addr64:
4777 ; SI: ; %bb.0: ; %entry
4778 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4779 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4780 ; SI-NEXT: s_ashr_i32 s5, s3, 31
4781 ; SI-NEXT: s_mov_b32 s4, s3
4782 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4783 ; SI-NEXT: s_add_u32 s4, s0, s4
4784 ; SI-NEXT: s_addc_u32 s5, s1, s5
4785 ; SI-NEXT: s_load_dword s3, s[4:5], 0x0
4786 ; SI-NEXT: s_mov_b64 s[0:1], 0
4787 ; SI-NEXT: s_mov_b32 s7, 0xf000
4788 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4789 ; SI-NEXT: v_mov_b32_e32 v1, s3
4790 ; SI-NEXT: s_mov_b32 s6, -1
4791 ; SI-NEXT: .LBB74_1: ; %atomicrmw.start
4792 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4793 ; SI-NEXT: v_max_i32_e32 v0, s2, v1
4794 ; SI-NEXT: s_waitcnt expcnt(0)
4795 ; SI-NEXT: v_mov_b32_e32 v3, v1
4796 ; SI-NEXT: v_mov_b32_e32 v2, v0
4797 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4798 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4799 ; SI-NEXT: s_waitcnt vmcnt(0)
4800 ; SI-NEXT: buffer_wbinvl1
4801 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4802 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4803 ; SI-NEXT: v_mov_b32_e32 v1, v2
4804 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4805 ; SI-NEXT: s_cbranch_execnz .LBB74_1
4806 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4809 ; VI-LABEL: atomic_max_i32_addr64:
4810 ; VI: ; %bb.0: ; %entry
4811 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4812 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4813 ; VI-NEXT: s_ashr_i32 s5, s3, 31
4814 ; VI-NEXT: s_mov_b32 s4, s3
4815 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4816 ; VI-NEXT: s_add_u32 s0, s0, s4
4817 ; VI-NEXT: s_addc_u32 s1, s1, s5
4818 ; VI-NEXT: s_load_dword s3, s[0:1], 0x0
4819 ; VI-NEXT: s_mov_b64 s[4:5], 0
4820 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4821 ; VI-NEXT: v_mov_b32_e32 v1, s3
4822 ; VI-NEXT: .LBB74_1: ; %atomicrmw.start
4823 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4824 ; VI-NEXT: v_mov_b32_e32 v3, s1
4825 ; VI-NEXT: v_max_i32_e32 v0, s2, v1
4826 ; VI-NEXT: v_mov_b32_e32 v2, s0
4827 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4828 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4829 ; VI-NEXT: s_waitcnt vmcnt(0)
4830 ; VI-NEXT: buffer_wbinvl1_vol
4831 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4832 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4833 ; VI-NEXT: v_mov_b32_e32 v1, v0
4834 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4835 ; VI-NEXT: s_cbranch_execnz .LBB74_1
4836 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4839 ; GFX9-LABEL: atomic_max_i32_addr64:
4840 ; GFX9: ; %bb.0: ; %entry
4841 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4842 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4843 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4844 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
4845 ; GFX9-NEXT: s_mov_b32 s4, s3
4846 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4847 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4848 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4849 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0
4850 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4851 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4852 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4853 ; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
4854 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4855 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v1
4856 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4857 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
4858 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4859 ; GFX9-NEXT: buffer_wbinvl1_vol
4860 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4861 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4862 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4863 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4864 ; GFX9-NEXT: s_cbranch_execnz .LBB74_1
4865 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4866 ; GFX9-NEXT: s_endpgm
4868 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4869 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4873 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
4874 ; SI-LABEL: atomic_max_i32_ret_addr64:
4875 ; SI: ; %bb.0: ; %entry
4876 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4877 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4878 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4879 ; SI-NEXT: s_ashr_i32 s5, s9, 31
4880 ; SI-NEXT: s_mov_b32 s4, s9
4881 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4882 ; SI-NEXT: s_add_u32 s4, s0, s4
4883 ; SI-NEXT: s_addc_u32 s5, s1, s5
4884 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
4885 ; SI-NEXT: s_mov_b64 s[0:1], 0
4886 ; SI-NEXT: s_mov_b32 s7, 0xf000
4887 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4888 ; SI-NEXT: v_mov_b32_e32 v1, s6
4889 ; SI-NEXT: s_mov_b32 s6, -1
4890 ; SI-NEXT: .LBB75_1: ; %atomicrmw.start
4891 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4892 ; SI-NEXT: v_max_i32_e32 v0, s8, v1
4893 ; SI-NEXT: s_waitcnt expcnt(0)
4894 ; SI-NEXT: v_mov_b32_e32 v3, v1
4895 ; SI-NEXT: v_mov_b32_e32 v2, v0
4896 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4897 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
4898 ; SI-NEXT: s_waitcnt vmcnt(0)
4899 ; SI-NEXT: buffer_wbinvl1
4900 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
4901 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4902 ; SI-NEXT: v_mov_b32_e32 v1, v2
4903 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4904 ; SI-NEXT: s_cbranch_execnz .LBB75_1
4905 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4906 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
4907 ; SI-NEXT: s_mov_b32 s7, 0xf000
4908 ; SI-NEXT: s_mov_b32 s6, -1
4909 ; SI-NEXT: s_mov_b32 s4, s2
4910 ; SI-NEXT: s_mov_b32 s5, s3
4911 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
4914 ; VI-LABEL: atomic_max_i32_ret_addr64:
4915 ; VI: ; %bb.0: ; %entry
4916 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
4917 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4918 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4919 ; VI-NEXT: s_ashr_i32 s7, s5, 31
4920 ; VI-NEXT: s_mov_b32 s6, s5
4921 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
4922 ; VI-NEXT: s_add_u32 s0, s0, s6
4923 ; VI-NEXT: s_addc_u32 s1, s1, s7
4924 ; VI-NEXT: s_load_dword s5, s[0:1], 0x0
4925 ; VI-NEXT: s_mov_b64 s[6:7], 0
4926 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4927 ; VI-NEXT: v_mov_b32_e32 v0, s5
4928 ; VI-NEXT: .LBB75_1: ; %atomicrmw.start
4929 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4930 ; VI-NEXT: v_mov_b32_e32 v1, v0
4931 ; VI-NEXT: v_mov_b32_e32 v3, s1
4932 ; VI-NEXT: v_mov_b32_e32 v2, s0
4933 ; VI-NEXT: v_max_i32_e32 v0, s4, v1
4934 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4935 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4936 ; VI-NEXT: s_waitcnt vmcnt(0)
4937 ; VI-NEXT: buffer_wbinvl1_vol
4938 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4939 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4940 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
4941 ; VI-NEXT: s_cbranch_execnz .LBB75_1
4942 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4943 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
4944 ; VI-NEXT: v_mov_b32_e32 v1, s2
4945 ; VI-NEXT: v_mov_b32_e32 v2, s3
4946 ; VI-NEXT: flat_store_dword v[1:2], v0
4949 ; GFX9-LABEL: atomic_max_i32_ret_addr64:
4950 ; GFX9: ; %bb.0: ; %entry
4951 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4952 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4953 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4954 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4955 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
4956 ; GFX9-NEXT: s_mov_b32 s0, s3
4957 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
4958 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4959 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4960 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0
4961 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4962 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4963 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
4964 ; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
4965 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4966 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
4967 ; GFX9-NEXT: v_max_i32_e32 v2, s2, v3
4968 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4969 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
4970 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4971 ; GFX9-NEXT: buffer_wbinvl1_vol
4972 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
4973 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4974 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4975 ; GFX9-NEXT: s_cbranch_execnz .LBB75_1
4976 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4977 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4978 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
4979 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
4980 ; GFX9-NEXT: s_endpgm
4982 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
4983 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i32 %in seq_cst
4984 store i32 %tmp0, ptr addrspace(1) %out2
4988 ; ---------------------------------------------------------------------
4990 ; ---------------------------------------------------------------------
4992 define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
4993 ; SI-LABEL: global_atomic_umax_i32_noret:
4995 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4996 ; SI-NEXT: s_mov_b32 s6, 0
4997 ; SI-NEXT: s_mov_b32 s7, 0xf000
4998 ; SI-NEXT: s_mov_b32 s4, s6
4999 ; SI-NEXT: s_mov_b32 s5, s6
5000 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
5001 ; SI-NEXT: s_mov_b64 s[8:9], 0
5002 ; SI-NEXT: .LBB76_1: ; %atomicrmw.start
5003 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5004 ; SI-NEXT: s_waitcnt vmcnt(0)
5005 ; SI-NEXT: v_max_u32_e32 v3, v4, v2
5006 ; SI-NEXT: s_waitcnt expcnt(0)
5007 ; SI-NEXT: v_mov_b32_e32 v6, v4
5008 ; SI-NEXT: v_mov_b32_e32 v5, v3
5009 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5010 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
5011 ; SI-NEXT: s_waitcnt vmcnt(0)
5012 ; SI-NEXT: buffer_wbinvl1
5013 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
5014 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5015 ; SI-NEXT: v_mov_b32_e32 v4, v5
5016 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5017 ; SI-NEXT: s_cbranch_execnz .LBB76_1
5018 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5019 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5020 ; SI-NEXT: s_waitcnt expcnt(0)
5021 ; SI-NEXT: s_setpc_b64 s[30:31]
5023 ; VI-LABEL: global_atomic_umax_i32_noret:
5025 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5026 ; VI-NEXT: flat_load_dword v4, v[0:1]
5027 ; VI-NEXT: s_mov_b64 s[4:5], 0
5028 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start
5029 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5030 ; VI-NEXT: s_waitcnt vmcnt(0)
5031 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5032 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5033 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5034 ; VI-NEXT: s_waitcnt vmcnt(0)
5035 ; VI-NEXT: buffer_wbinvl1_vol
5036 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5037 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5038 ; VI-NEXT: v_mov_b32_e32 v4, v3
5039 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5040 ; VI-NEXT: s_cbranch_execnz .LBB76_1
5041 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5042 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5043 ; VI-NEXT: s_setpc_b64 s[30:31]
5045 ; GFX9-LABEL: global_atomic_umax_i32_noret:
5047 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5048 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
5049 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5050 ; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
5051 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5052 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5053 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5054 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5055 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5056 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5057 ; GFX9-NEXT: buffer_wbinvl1_vol
5058 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5059 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5060 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5061 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5062 ; GFX9-NEXT: s_cbranch_execnz .LBB76_1
5063 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5064 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5065 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5066 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5070 define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
5071 ; SI-LABEL: global_atomic_umax_i32_noret_offset:
5073 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5074 ; SI-NEXT: s_mov_b32 s6, 0
5075 ; SI-NEXT: s_mov_b32 s7, 0xf000
5076 ; SI-NEXT: s_mov_b32 s4, s6
5077 ; SI-NEXT: s_mov_b32 s5, s6
5078 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
5079 ; SI-NEXT: s_mov_b64 s[8:9], 0
5080 ; SI-NEXT: .LBB77_1: ; %atomicrmw.start
5081 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5082 ; SI-NEXT: s_waitcnt vmcnt(0)
5083 ; SI-NEXT: v_max_u32_e32 v3, v4, v2
5084 ; SI-NEXT: s_waitcnt expcnt(0)
5085 ; SI-NEXT: v_mov_b32_e32 v6, v4
5086 ; SI-NEXT: v_mov_b32_e32 v5, v3
5087 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5088 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
5089 ; SI-NEXT: s_waitcnt vmcnt(0)
5090 ; SI-NEXT: buffer_wbinvl1
5091 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
5092 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5093 ; SI-NEXT: v_mov_b32_e32 v4, v5
5094 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5095 ; SI-NEXT: s_cbranch_execnz .LBB77_1
5096 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5097 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5098 ; SI-NEXT: s_waitcnt expcnt(0)
5099 ; SI-NEXT: s_setpc_b64 s[30:31]
5101 ; VI-LABEL: global_atomic_umax_i32_noret_offset:
5103 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5104 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
5105 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5106 ; VI-NEXT: flat_load_dword v4, v[0:1]
5107 ; VI-NEXT: s_mov_b64 s[4:5], 0
5108 ; VI-NEXT: .LBB77_1: ; %atomicrmw.start
5109 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5110 ; VI-NEXT: s_waitcnt vmcnt(0)
5111 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5112 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5113 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5114 ; VI-NEXT: s_waitcnt vmcnt(0)
5115 ; VI-NEXT: buffer_wbinvl1_vol
5116 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5117 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5118 ; VI-NEXT: v_mov_b32_e32 v4, v3
5119 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5120 ; VI-NEXT: s_cbranch_execnz .LBB77_1
5121 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5122 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5123 ; VI-NEXT: s_setpc_b64 s[30:31]
5125 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset:
5127 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5128 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
5129 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5130 ; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
5131 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5132 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5133 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5134 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5135 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5136 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5137 ; GFX9-NEXT: buffer_wbinvl1_vol
5138 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5139 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5140 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5141 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5142 ; GFX9-NEXT: s_cbranch_execnz .LBB77_1
5143 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5144 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5145 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5146 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5147 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5151 define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
5152 ; SI-LABEL: global_atomic_umax_i32_ret:
5154 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5155 ; SI-NEXT: s_mov_b32 s6, 0
5156 ; SI-NEXT: s_mov_b32 s7, 0xf000
5157 ; SI-NEXT: s_mov_b32 s4, s6
5158 ; SI-NEXT: s_mov_b32 s5, s6
5159 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
5160 ; SI-NEXT: s_mov_b64 s[8:9], 0
5161 ; SI-NEXT: .LBB78_1: ; %atomicrmw.start
5162 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5163 ; SI-NEXT: s_waitcnt vmcnt(0)
5164 ; SI-NEXT: v_mov_b32_e32 v5, v3
5165 ; SI-NEXT: s_waitcnt expcnt(0)
5166 ; SI-NEXT: v_max_u32_e32 v4, v5, v2
5167 ; SI-NEXT: v_mov_b32_e32 v3, v4
5168 ; SI-NEXT: v_mov_b32_e32 v4, v5
5169 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5170 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
5171 ; SI-NEXT: s_waitcnt vmcnt(0)
5172 ; SI-NEXT: buffer_wbinvl1
5173 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
5174 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5175 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5176 ; SI-NEXT: s_cbranch_execnz .LBB78_1
5177 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5178 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5179 ; SI-NEXT: v_mov_b32_e32 v0, v3
5180 ; SI-NEXT: s_waitcnt expcnt(0)
5181 ; SI-NEXT: s_setpc_b64 s[30:31]
5183 ; VI-LABEL: global_atomic_umax_i32_ret:
5185 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5186 ; VI-NEXT: flat_load_dword v3, v[0:1]
5187 ; VI-NEXT: s_mov_b64 s[4:5], 0
5188 ; VI-NEXT: .LBB78_1: ; %atomicrmw.start
5189 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5190 ; VI-NEXT: s_waitcnt vmcnt(0)
5191 ; VI-NEXT: v_mov_b32_e32 v4, v3
5192 ; VI-NEXT: v_max_u32_e32 v3, v4, v2
5193 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5194 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5195 ; VI-NEXT: s_waitcnt vmcnt(0)
5196 ; VI-NEXT: buffer_wbinvl1_vol
5197 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5198 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5199 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5200 ; VI-NEXT: s_cbranch_execnz .LBB78_1
5201 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5202 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5203 ; VI-NEXT: v_mov_b32_e32 v0, v3
5204 ; VI-NEXT: s_setpc_b64 s[30:31]
5206 ; GFX9-LABEL: global_atomic_umax_i32_ret:
5208 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5209 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
5210 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5211 ; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
5212 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5213 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5214 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5215 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5216 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5217 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
5218 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5219 ; GFX9-NEXT: buffer_wbinvl1_vol
5220 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5221 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5222 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5223 ; GFX9-NEXT: s_cbranch_execnz .LBB78_1
5224 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5225 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5226 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
5227 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5228 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5232 define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
5233 ; SI-LABEL: global_atomic_umax_i32_ret_offset:
5235 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5236 ; SI-NEXT: s_mov_b32 s6, 0
5237 ; SI-NEXT: s_mov_b32 s7, 0xf000
5238 ; SI-NEXT: s_mov_b32 s4, s6
5239 ; SI-NEXT: s_mov_b32 s5, s6
5240 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
5241 ; SI-NEXT: s_mov_b64 s[8:9], 0
5242 ; SI-NEXT: .LBB79_1: ; %atomicrmw.start
5243 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5244 ; SI-NEXT: s_waitcnt vmcnt(0)
5245 ; SI-NEXT: v_mov_b32_e32 v5, v3
5246 ; SI-NEXT: s_waitcnt expcnt(0)
5247 ; SI-NEXT: v_max_u32_e32 v4, v5, v2
5248 ; SI-NEXT: v_mov_b32_e32 v3, v4
5249 ; SI-NEXT: v_mov_b32_e32 v4, v5
5250 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5251 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
5252 ; SI-NEXT: s_waitcnt vmcnt(0)
5253 ; SI-NEXT: buffer_wbinvl1
5254 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
5255 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5256 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5257 ; SI-NEXT: s_cbranch_execnz .LBB79_1
5258 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5259 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5260 ; SI-NEXT: v_mov_b32_e32 v0, v3
5261 ; SI-NEXT: s_waitcnt expcnt(0)
5262 ; SI-NEXT: s_setpc_b64 s[30:31]
5264 ; VI-LABEL: global_atomic_umax_i32_ret_offset:
5266 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5267 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
5268 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5269 ; VI-NEXT: flat_load_dword v0, v[3:4]
5270 ; VI-NEXT: s_mov_b64 s[4:5], 0
5271 ; VI-NEXT: .LBB79_1: ; %atomicrmw.start
5272 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5273 ; VI-NEXT: s_waitcnt vmcnt(0)
5274 ; VI-NEXT: v_mov_b32_e32 v1, v0
5275 ; VI-NEXT: v_max_u32_e32 v0, v1, v2
5276 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5277 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5278 ; VI-NEXT: s_waitcnt vmcnt(0)
5279 ; VI-NEXT: buffer_wbinvl1_vol
5280 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5281 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5282 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5283 ; VI-NEXT: s_cbranch_execnz .LBB79_1
5284 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5285 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5286 ; VI-NEXT: s_setpc_b64 s[30:31]
5288 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset:
5290 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5291 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
5292 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5293 ; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
5294 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5295 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5296 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
5297 ; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
5298 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5299 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
5300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5301 ; GFX9-NEXT: buffer_wbinvl1_vol
5302 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5303 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5304 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5305 ; GFX9-NEXT: s_cbranch_execnz .LBB79_1
5306 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5307 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5308 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
5309 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5310 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5311 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5315 define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5316 ; SI-LABEL: global_atomic_umax_i32_noret_scalar:
5318 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5319 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5320 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
5321 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5322 ; SI-NEXT: s_waitcnt expcnt(0)
5323 ; SI-NEXT: v_writelane_b32 v0, s6, 0
5324 ; SI-NEXT: v_writelane_b32 v0, s7, 1
5325 ; SI-NEXT: s_mov_b32 s34, s6
5326 ; SI-NEXT: s_mov_b32 s7, 0xf000
5327 ; SI-NEXT: s_mov_b32 s6, -1
5328 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
5329 ; SI-NEXT: s_mov_b64 s[36:37], 0
5330 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start
5331 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5332 ; SI-NEXT: s_waitcnt vmcnt(0)
5333 ; SI-NEXT: v_max_u32_e32 v1, s34, v2
5334 ; SI-NEXT: s_waitcnt expcnt(0)
5335 ; SI-NEXT: v_mov_b32_e32 v4, v2
5336 ; SI-NEXT: v_mov_b32_e32 v3, v1
5337 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5338 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
5339 ; SI-NEXT: s_waitcnt vmcnt(0)
5340 ; SI-NEXT: buffer_wbinvl1
5341 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
5342 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5343 ; SI-NEXT: v_mov_b32_e32 v2, v3
5344 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5345 ; SI-NEXT: s_cbranch_execnz .LBB80_1
5346 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5347 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5348 ; SI-NEXT: v_readlane_b32 s7, v0, 1
5349 ; SI-NEXT: v_readlane_b32 s6, v0, 0
5350 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5351 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
5352 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5353 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5354 ; SI-NEXT: s_setpc_b64 s[30:31]
5356 ; VI-LABEL: global_atomic_umax_i32_noret_scalar:
5358 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5359 ; VI-NEXT: v_mov_b32_e32 v0, s4
5360 ; VI-NEXT: v_mov_b32_e32 v1, s5
5361 ; VI-NEXT: flat_load_dword v1, v[0:1]
5362 ; VI-NEXT: s_mov_b64 s[34:35], 0
5363 ; VI-NEXT: .LBB80_1: ; %atomicrmw.start
5364 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5365 ; VI-NEXT: v_mov_b32_e32 v2, s4
5366 ; VI-NEXT: s_waitcnt vmcnt(0)
5367 ; VI-NEXT: v_max_u32_e32 v0, s6, v1
5368 ; VI-NEXT: v_mov_b32_e32 v3, s5
5369 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5370 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5371 ; VI-NEXT: s_waitcnt vmcnt(0)
5372 ; VI-NEXT: buffer_wbinvl1_vol
5373 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5374 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5375 ; VI-NEXT: v_mov_b32_e32 v1, v0
5376 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5377 ; VI-NEXT: s_cbranch_execnz .LBB80_1
5378 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5379 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5380 ; VI-NEXT: s_setpc_b64 s[30:31]
5382 ; GFX9-LABEL: global_atomic_umax_i32_noret_scalar:
5384 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5385 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5386 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
5387 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5388 ; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start
5389 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5391 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
5392 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5393 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
5394 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5395 ; GFX9-NEXT: buffer_wbinvl1_vol
5396 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5397 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5398 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5399 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5400 ; GFX9-NEXT: s_cbranch_execnz .LBB80_1
5401 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5402 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5403 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5404 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5408 define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5409 ; SI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5411 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5412 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5413 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
5414 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5415 ; SI-NEXT: s_waitcnt expcnt(0)
5416 ; SI-NEXT: v_writelane_b32 v0, s6, 0
5417 ; SI-NEXT: v_writelane_b32 v0, s7, 1
5418 ; SI-NEXT: s_mov_b32 s34, s6
5419 ; SI-NEXT: s_mov_b32 s7, 0xf000
5420 ; SI-NEXT: s_mov_b32 s6, -1
5421 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
5422 ; SI-NEXT: s_mov_b64 s[36:37], 0
5423 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start
5424 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5425 ; SI-NEXT: s_waitcnt vmcnt(0)
5426 ; SI-NEXT: v_max_u32_e32 v1, s34, v2
5427 ; SI-NEXT: s_waitcnt expcnt(0)
5428 ; SI-NEXT: v_mov_b32_e32 v4, v2
5429 ; SI-NEXT: v_mov_b32_e32 v3, v1
5430 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5431 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
5432 ; SI-NEXT: s_waitcnt vmcnt(0)
5433 ; SI-NEXT: buffer_wbinvl1
5434 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
5435 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5436 ; SI-NEXT: v_mov_b32_e32 v2, v3
5437 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5438 ; SI-NEXT: s_cbranch_execnz .LBB81_1
5439 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5440 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5441 ; SI-NEXT: v_readlane_b32 s7, v0, 1
5442 ; SI-NEXT: v_readlane_b32 s6, v0, 0
5443 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5444 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
5445 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5446 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5447 ; SI-NEXT: s_setpc_b64 s[30:31]
5449 ; VI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5451 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5452 ; VI-NEXT: s_add_u32 s34, s4, 16
5453 ; VI-NEXT: s_addc_u32 s35, s5, 0
5454 ; VI-NEXT: v_mov_b32_e32 v0, s34
5455 ; VI-NEXT: v_mov_b32_e32 v1, s35
5456 ; VI-NEXT: flat_load_dword v1, v[0:1]
5457 ; VI-NEXT: s_mov_b64 s[36:37], 0
5458 ; VI-NEXT: .LBB81_1: ; %atomicrmw.start
5459 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5460 ; VI-NEXT: v_mov_b32_e32 v2, s34
5461 ; VI-NEXT: s_waitcnt vmcnt(0)
5462 ; VI-NEXT: v_max_u32_e32 v0, s6, v1
5463 ; VI-NEXT: v_mov_b32_e32 v3, s35
5464 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5465 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5466 ; VI-NEXT: s_waitcnt vmcnt(0)
5467 ; VI-NEXT: buffer_wbinvl1_vol
5468 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5469 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5470 ; VI-NEXT: v_mov_b32_e32 v1, v0
5471 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5472 ; VI-NEXT: s_cbranch_execnz .LBB81_1
5473 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5474 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
5475 ; VI-NEXT: s_setpc_b64 s[30:31]
5477 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar:
5479 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5480 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5481 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
5482 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5483 ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start
5484 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5485 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5486 ; GFX9-NEXT: v_max_u32_e32 v0, s6, v1
5487 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5488 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
5489 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5490 ; GFX9-NEXT: buffer_wbinvl1_vol
5491 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5492 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5493 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5494 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5495 ; GFX9-NEXT: s_cbranch_execnz .LBB81_1
5496 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5497 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5498 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5499 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5500 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5504 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
5505 ; SI-LABEL: global_atomic_umax_i32_ret_scalar:
5507 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5508 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5509 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
5510 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5511 ; SI-NEXT: s_waitcnt expcnt(0)
5512 ; SI-NEXT: v_writelane_b32 v1, s6, 0
5513 ; SI-NEXT: v_writelane_b32 v1, s7, 1
5514 ; SI-NEXT: s_mov_b32 s34, s6
5515 ; SI-NEXT: s_mov_b32 s7, 0xf000
5516 ; SI-NEXT: s_mov_b32 s6, -1
5517 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
5518 ; SI-NEXT: s_mov_b64 s[36:37], 0
5519 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start
5520 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5521 ; SI-NEXT: s_waitcnt vmcnt(0)
5522 ; SI-NEXT: v_mov_b32_e32 v4, v2
5523 ; SI-NEXT: s_waitcnt expcnt(0)
5524 ; SI-NEXT: v_max_u32_e32 v3, s34, v4
5525 ; SI-NEXT: v_mov_b32_e32 v2, v3
5526 ; SI-NEXT: v_mov_b32_e32 v3, v4
5527 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5528 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
5529 ; SI-NEXT: s_waitcnt vmcnt(0)
5530 ; SI-NEXT: buffer_wbinvl1
5531 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
5532 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5533 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5534 ; SI-NEXT: s_cbranch_execnz .LBB82_1
5535 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5536 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5537 ; SI-NEXT: v_mov_b32_e32 v0, v2
5538 ; SI-NEXT: v_readlane_b32 s7, v1, 1
5539 ; SI-NEXT: v_readlane_b32 s6, v1, 0
5540 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5541 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
5542 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5543 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5544 ; SI-NEXT: s_setpc_b64 s[30:31]
5546 ; VI-LABEL: global_atomic_umax_i32_ret_scalar:
5548 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5549 ; VI-NEXT: v_mov_b32_e32 v0, s4
5550 ; VI-NEXT: v_mov_b32_e32 v1, s5
5551 ; VI-NEXT: flat_load_dword v0, v[0:1]
5552 ; VI-NEXT: s_mov_b64 s[34:35], 0
5553 ; VI-NEXT: .LBB82_1: ; %atomicrmw.start
5554 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5555 ; VI-NEXT: s_waitcnt vmcnt(0)
5556 ; VI-NEXT: v_mov_b32_e32 v1, v0
5557 ; VI-NEXT: v_mov_b32_e32 v2, s4
5558 ; VI-NEXT: v_mov_b32_e32 v3, s5
5559 ; VI-NEXT: v_max_u32_e32 v0, s6, v1
5560 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5561 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5562 ; VI-NEXT: s_waitcnt vmcnt(0)
5563 ; VI-NEXT: buffer_wbinvl1_vol
5564 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5565 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5566 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5567 ; VI-NEXT: s_cbranch_execnz .LBB82_1
5568 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5569 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5570 ; VI-NEXT: s_setpc_b64 s[30:31]
5572 ; GFX9-LABEL: global_atomic_umax_i32_ret_scalar:
5574 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5575 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5576 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
5577 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5578 ; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start
5579 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5580 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5581 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5582 ; GFX9-NEXT: v_max_u32_e32 v2, s6, v3
5583 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5584 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
5585 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5586 ; GFX9-NEXT: buffer_wbinvl1_vol
5587 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5588 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5589 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5590 ; GFX9-NEXT: s_cbranch_execnz .LBB82_1
5591 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5592 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5593 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5594 %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
5598 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
5599 ; SI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5601 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5602 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5603 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
5604 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5605 ; SI-NEXT: s_waitcnt expcnt(0)
5606 ; SI-NEXT: v_writelane_b32 v1, s6, 0
5607 ; SI-NEXT: v_writelane_b32 v1, s7, 1
5608 ; SI-NEXT: s_mov_b32 s34, s6
5609 ; SI-NEXT: s_mov_b32 s7, 0xf000
5610 ; SI-NEXT: s_mov_b32 s6, -1
5611 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
5612 ; SI-NEXT: s_mov_b64 s[36:37], 0
5613 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start
5614 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5615 ; SI-NEXT: s_waitcnt vmcnt(0)
5616 ; SI-NEXT: v_mov_b32_e32 v4, v2
5617 ; SI-NEXT: s_waitcnt expcnt(0)
5618 ; SI-NEXT: v_max_u32_e32 v3, s34, v4
5619 ; SI-NEXT: v_mov_b32_e32 v2, v3
5620 ; SI-NEXT: v_mov_b32_e32 v3, v4
5621 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5622 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5623 ; SI-NEXT: s_waitcnt vmcnt(0)
5624 ; SI-NEXT: buffer_wbinvl1
5625 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
5626 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5627 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5628 ; SI-NEXT: s_cbranch_execnz .LBB83_1
5629 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5630 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5631 ; SI-NEXT: v_mov_b32_e32 v0, v2
5632 ; SI-NEXT: v_readlane_b32 s7, v1, 1
5633 ; SI-NEXT: v_readlane_b32 s6, v1, 0
5634 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5635 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
5636 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5637 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5638 ; SI-NEXT: s_setpc_b64 s[30:31]
5640 ; VI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5642 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5643 ; VI-NEXT: s_add_u32 s34, s4, 16
5644 ; VI-NEXT: s_addc_u32 s35, s5, 0
5645 ; VI-NEXT: v_mov_b32_e32 v0, s34
5646 ; VI-NEXT: v_mov_b32_e32 v1, s35
5647 ; VI-NEXT: flat_load_dword v0, v[0:1]
5648 ; VI-NEXT: s_mov_b64 s[36:37], 0
5649 ; VI-NEXT: .LBB83_1: ; %atomicrmw.start
5650 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5651 ; VI-NEXT: s_waitcnt vmcnt(0)
5652 ; VI-NEXT: v_mov_b32_e32 v1, v0
5653 ; VI-NEXT: v_mov_b32_e32 v2, s34
5654 ; VI-NEXT: v_mov_b32_e32 v3, s35
5655 ; VI-NEXT: v_max_u32_e32 v0, s6, v1
5656 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5657 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5658 ; VI-NEXT: s_waitcnt vmcnt(0)
5659 ; VI-NEXT: buffer_wbinvl1_vol
5660 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5661 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5662 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5663 ; VI-NEXT: s_cbranch_execnz .LBB83_1
5664 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5665 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
5666 ; VI-NEXT: s_setpc_b64 s[30:31]
5668 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar:
5670 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5671 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5672 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
5673 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5674 ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start
5675 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5676 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5677 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5678 ; GFX9-NEXT: v_max_u32_e32 v2, s6, v3
5679 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5680 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
5681 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5682 ; GFX9-NEXT: buffer_wbinvl1_vol
5683 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5684 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5685 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5686 ; GFX9-NEXT: s_cbranch_execnz .LBB83_1
5687 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5688 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5689 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5690 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
5691 %result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5695 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
5696 ; SI-LABEL: atomic_umax_i32_addr64_offset:
5697 ; SI: ; %bb.0: ; %entry
5698 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5699 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5700 ; SI-NEXT: s_ashr_i32 s5, s3, 31
5701 ; SI-NEXT: s_mov_b32 s4, s3
5702 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5703 ; SI-NEXT: s_add_u32 s4, s0, s4
5704 ; SI-NEXT: s_addc_u32 s5, s1, s5
5705 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
5706 ; SI-NEXT: s_mov_b64 s[0:1], 0
5707 ; SI-NEXT: s_mov_b32 s7, 0xf000
5708 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5709 ; SI-NEXT: v_mov_b32_e32 v1, s3
5710 ; SI-NEXT: s_mov_b32 s6, -1
5711 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start
5712 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5713 ; SI-NEXT: v_max_u32_e32 v0, s2, v1
5714 ; SI-NEXT: s_waitcnt expcnt(0)
5715 ; SI-NEXT: v_mov_b32_e32 v3, v1
5716 ; SI-NEXT: v_mov_b32_e32 v2, v0
5717 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5718 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5719 ; SI-NEXT: s_waitcnt vmcnt(0)
5720 ; SI-NEXT: buffer_wbinvl1
5721 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5722 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5723 ; SI-NEXT: v_mov_b32_e32 v1, v2
5724 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5725 ; SI-NEXT: s_cbranch_execnz .LBB84_1
5726 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5729 ; VI-LABEL: atomic_umax_i32_addr64_offset:
5730 ; VI: ; %bb.0: ; %entry
5731 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5732 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5733 ; VI-NEXT: s_ashr_i32 s5, s3, 31
5734 ; VI-NEXT: s_mov_b32 s4, s3
5735 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5736 ; VI-NEXT: s_add_u32 s0, s0, s4
5737 ; VI-NEXT: s_addc_u32 s1, s1, s5
5738 ; VI-NEXT: s_load_dword s3, s[0:1], 0x10
5739 ; VI-NEXT: s_add_u32 s0, s0, 16
5740 ; VI-NEXT: s_addc_u32 s1, s1, 0
5741 ; VI-NEXT: s_mov_b64 s[4:5], 0
5742 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5743 ; VI-NEXT: v_mov_b32_e32 v1, s3
5744 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start
5745 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5746 ; VI-NEXT: v_mov_b32_e32 v3, s1
5747 ; VI-NEXT: v_max_u32_e32 v0, s2, v1
5748 ; VI-NEXT: v_mov_b32_e32 v2, s0
5749 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5750 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5751 ; VI-NEXT: s_waitcnt vmcnt(0)
5752 ; VI-NEXT: buffer_wbinvl1_vol
5753 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5754 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5755 ; VI-NEXT: v_mov_b32_e32 v1, v0
5756 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5757 ; VI-NEXT: s_cbranch_execnz .LBB84_1
5758 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5761 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
5762 ; GFX9: ; %bb.0: ; %entry
5763 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5764 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5765 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5766 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
5767 ; GFX9-NEXT: s_mov_b32 s4, s3
5768 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5769 ; GFX9-NEXT: s_add_u32 s0, s0, s4
5770 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
5771 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
5772 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5773 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5774 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5775 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
5776 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5777 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v1
5778 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5779 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
5780 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5781 ; GFX9-NEXT: buffer_wbinvl1_vol
5782 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5783 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5784 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5785 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5786 ; GFX9-NEXT: s_cbranch_execnz .LBB84_1
5787 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5788 ; GFX9-NEXT: s_endpgm
5790 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5791 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
5792 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5796 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
5797 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
5798 ; SI: ; %bb.0: ; %entry
5799 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5800 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5801 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5802 ; SI-NEXT: s_ashr_i32 s5, s9, 31
5803 ; SI-NEXT: s_mov_b32 s4, s9
5804 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5805 ; SI-NEXT: s_add_u32 s4, s0, s4
5806 ; SI-NEXT: s_addc_u32 s5, s1, s5
5807 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
5808 ; SI-NEXT: s_mov_b64 s[0:1], 0
5809 ; SI-NEXT: s_mov_b32 s7, 0xf000
5810 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5811 ; SI-NEXT: v_mov_b32_e32 v1, s6
5812 ; SI-NEXT: s_mov_b32 s6, -1
5813 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start
5814 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5815 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
5816 ; SI-NEXT: s_waitcnt expcnt(0)
5817 ; SI-NEXT: v_mov_b32_e32 v3, v1
5818 ; SI-NEXT: v_mov_b32_e32 v2, v0
5819 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5820 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
5821 ; SI-NEXT: s_waitcnt vmcnt(0)
5822 ; SI-NEXT: buffer_wbinvl1
5823 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5824 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5825 ; SI-NEXT: v_mov_b32_e32 v1, v2
5826 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5827 ; SI-NEXT: s_cbranch_execnz .LBB85_1
5828 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5829 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
5830 ; SI-NEXT: s_mov_b32 s7, 0xf000
5831 ; SI-NEXT: s_mov_b32 s6, -1
5832 ; SI-NEXT: s_mov_b32 s4, s2
5833 ; SI-NEXT: s_mov_b32 s5, s3
5834 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
5837 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
5838 ; VI: ; %bb.0: ; %entry
5839 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5840 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5841 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5842 ; VI-NEXT: s_ashr_i32 s7, s5, 31
5843 ; VI-NEXT: s_mov_b32 s6, s5
5844 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
5845 ; VI-NEXT: s_add_u32 s0, s0, s6
5846 ; VI-NEXT: s_addc_u32 s1, s1, s7
5847 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10
5848 ; VI-NEXT: s_add_u32 s0, s0, 16
5849 ; VI-NEXT: s_addc_u32 s1, s1, 0
5850 ; VI-NEXT: s_mov_b64 s[6:7], 0
5851 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5852 ; VI-NEXT: v_mov_b32_e32 v0, s5
5853 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start
5854 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5855 ; VI-NEXT: v_mov_b32_e32 v1, v0
5856 ; VI-NEXT: v_mov_b32_e32 v3, s1
5857 ; VI-NEXT: v_mov_b32_e32 v2, s0
5858 ; VI-NEXT: v_max_u32_e32 v0, s4, v1
5859 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5860 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5861 ; VI-NEXT: s_waitcnt vmcnt(0)
5862 ; VI-NEXT: buffer_wbinvl1_vol
5863 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5864 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5865 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
5866 ; VI-NEXT: s_cbranch_execnz .LBB85_1
5867 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5868 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
5869 ; VI-NEXT: v_mov_b32_e32 v1, s2
5870 ; VI-NEXT: v_mov_b32_e32 v2, s3
5871 ; VI-NEXT: flat_store_dword v[1:2], v0
5874 ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
5875 ; GFX9: ; %bb.0: ; %entry
5876 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5877 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5878 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5879 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5880 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
5881 ; GFX9-NEXT: s_mov_b32 s0, s3
5882 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
5883 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5884 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5885 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
5886 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5887 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5888 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
5889 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
5890 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5891 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
5892 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3
5893 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5894 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
5895 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5896 ; GFX9-NEXT: buffer_wbinvl1_vol
5897 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
5898 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5899 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5900 ; GFX9-NEXT: s_cbranch_execnz .LBB85_1
5901 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5902 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5903 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5904 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
5905 ; GFX9-NEXT: s_endpgm
5907 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
5908 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
5909 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst
5910 store i32 %tmp0, ptr addrspace(1) %out2
5914 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
5915 ; SI-LABEL: atomic_umax_i32_ret_addr64:
5916 ; SI: ; %bb.0: ; %entry
5917 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5918 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5919 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5920 ; SI-NEXT: s_ashr_i32 s5, s9, 31
5921 ; SI-NEXT: s_mov_b32 s4, s9
5922 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5923 ; SI-NEXT: s_add_u32 s4, s0, s4
5924 ; SI-NEXT: s_addc_u32 s5, s1, s5
5925 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
5926 ; SI-NEXT: s_mov_b64 s[0:1], 0
5927 ; SI-NEXT: s_mov_b32 s7, 0xf000
5928 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5929 ; SI-NEXT: v_mov_b32_e32 v1, s6
5930 ; SI-NEXT: s_mov_b32 s6, -1
5931 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start
5932 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5933 ; SI-NEXT: v_max_u32_e32 v0, s8, v1
5934 ; SI-NEXT: s_waitcnt expcnt(0)
5935 ; SI-NEXT: v_mov_b32_e32 v3, v1
5936 ; SI-NEXT: v_mov_b32_e32 v2, v0
5937 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5938 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
5939 ; SI-NEXT: s_waitcnt vmcnt(0)
5940 ; SI-NEXT: buffer_wbinvl1
5941 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
5942 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5943 ; SI-NEXT: v_mov_b32_e32 v1, v2
5944 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5945 ; SI-NEXT: s_cbranch_execnz .LBB86_1
5946 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5947 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
5948 ; SI-NEXT: s_mov_b32 s7, 0xf000
5949 ; SI-NEXT: s_mov_b32 s6, -1
5950 ; SI-NEXT: s_mov_b32 s4, s2
5951 ; SI-NEXT: s_mov_b32 s5, s3
5952 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
5955 ; VI-LABEL: atomic_umax_i32_ret_addr64:
5956 ; VI: ; %bb.0: ; %entry
5957 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5958 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5959 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5960 ; VI-NEXT: s_ashr_i32 s7, s5, 31
5961 ; VI-NEXT: s_mov_b32 s6, s5
5962 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
5963 ; VI-NEXT: s_add_u32 s0, s0, s6
5964 ; VI-NEXT: s_addc_u32 s1, s1, s7
5965 ; VI-NEXT: s_load_dword s5, s[0:1], 0x0
5966 ; VI-NEXT: s_mov_b64 s[6:7], 0
5967 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5968 ; VI-NEXT: v_mov_b32_e32 v0, s5
5969 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start
5970 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5971 ; VI-NEXT: v_mov_b32_e32 v1, v0
5972 ; VI-NEXT: v_mov_b32_e32 v3, s1
5973 ; VI-NEXT: v_mov_b32_e32 v2, s0
5974 ; VI-NEXT: v_max_u32_e32 v0, s4, v1
5975 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5976 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
5977 ; VI-NEXT: s_waitcnt vmcnt(0)
5978 ; VI-NEXT: buffer_wbinvl1_vol
5979 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5980 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5981 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
5982 ; VI-NEXT: s_cbranch_execnz .LBB86_1
5983 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5984 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
5985 ; VI-NEXT: v_mov_b32_e32 v1, s2
5986 ; VI-NEXT: v_mov_b32_e32 v2, s3
5987 ; VI-NEXT: flat_store_dword v[1:2], v0
5990 ; GFX9-LABEL: atomic_umax_i32_ret_addr64:
5991 ; GFX9: ; %bb.0: ; %entry
5992 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5993 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5994 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
5995 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5996 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
5997 ; GFX9-NEXT: s_mov_b32 s0, s3
5998 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
5999 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6000 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6001 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0
6002 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6003 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6004 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
6005 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
6006 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6007 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6008 ; GFX9-NEXT: v_max_u32_e32 v2, s2, v3
6009 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6010 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
6011 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6012 ; GFX9-NEXT: buffer_wbinvl1_vol
6013 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6014 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6015 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6016 ; GFX9-NEXT: s_cbranch_execnz .LBB86_1
6017 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6018 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6019 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6020 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
6021 ; GFX9-NEXT: s_endpgm
6023 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
6024 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i32 %in seq_cst
6025 store i32 %tmp0, ptr addrspace(1) %out2
6029 ; ---------------------------------------------------------------------
6031 ; ---------------------------------------------------------------------
6033 define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
6034 ; SI-LABEL: global_atomic_umin_i32_noret:
6036 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6037 ; SI-NEXT: s_mov_b32 s6, 0
6038 ; SI-NEXT: s_mov_b32 s7, 0xf000
6039 ; SI-NEXT: s_mov_b32 s4, s6
6040 ; SI-NEXT: s_mov_b32 s5, s6
6041 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
6042 ; SI-NEXT: s_mov_b64 s[8:9], 0
6043 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start
6044 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6045 ; SI-NEXT: s_waitcnt vmcnt(0)
6046 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
6047 ; SI-NEXT: s_waitcnt expcnt(0)
6048 ; SI-NEXT: v_mov_b32_e32 v6, v4
6049 ; SI-NEXT: v_mov_b32_e32 v5, v3
6050 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6051 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
6052 ; SI-NEXT: s_waitcnt vmcnt(0)
6053 ; SI-NEXT: buffer_wbinvl1
6054 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6055 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6056 ; SI-NEXT: v_mov_b32_e32 v4, v5
6057 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6058 ; SI-NEXT: s_cbranch_execnz .LBB87_1
6059 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6060 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6061 ; SI-NEXT: s_waitcnt expcnt(0)
6062 ; SI-NEXT: s_setpc_b64 s[30:31]
6064 ; VI-LABEL: global_atomic_umin_i32_noret:
6066 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6067 ; VI-NEXT: flat_load_dword v4, v[0:1]
6068 ; VI-NEXT: s_mov_b64 s[4:5], 0
6069 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start
6070 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6071 ; VI-NEXT: s_waitcnt vmcnt(0)
6072 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6073 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6074 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6075 ; VI-NEXT: s_waitcnt vmcnt(0)
6076 ; VI-NEXT: buffer_wbinvl1_vol
6077 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6078 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6079 ; VI-NEXT: v_mov_b32_e32 v4, v3
6080 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6081 ; VI-NEXT: s_cbranch_execnz .LBB87_1
6082 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6083 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6084 ; VI-NEXT: s_setpc_b64 s[30:31]
6086 ; GFX9-LABEL: global_atomic_umin_i32_noret:
6088 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6089 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
6090 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6091 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
6092 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6093 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6094 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6095 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6096 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6097 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6098 ; GFX9-NEXT: buffer_wbinvl1_vol
6099 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6100 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6101 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6102 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6103 ; GFX9-NEXT: s_cbranch_execnz .LBB87_1
6104 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6105 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6106 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6107 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6111 define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
6112 ; SI-LABEL: global_atomic_umin_i32_noret_offset:
6114 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6115 ; SI-NEXT: s_mov_b32 s6, 0
6116 ; SI-NEXT: s_mov_b32 s7, 0xf000
6117 ; SI-NEXT: s_mov_b32 s4, s6
6118 ; SI-NEXT: s_mov_b32 s5, s6
6119 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6120 ; SI-NEXT: s_mov_b64 s[8:9], 0
6121 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start
6122 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6123 ; SI-NEXT: s_waitcnt vmcnt(0)
6124 ; SI-NEXT: v_min_u32_e32 v3, v4, v2
6125 ; SI-NEXT: s_waitcnt expcnt(0)
6126 ; SI-NEXT: v_mov_b32_e32 v6, v4
6127 ; SI-NEXT: v_mov_b32_e32 v5, v3
6128 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6129 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6130 ; SI-NEXT: s_waitcnt vmcnt(0)
6131 ; SI-NEXT: buffer_wbinvl1
6132 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6133 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6134 ; SI-NEXT: v_mov_b32_e32 v4, v5
6135 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6136 ; SI-NEXT: s_cbranch_execnz .LBB88_1
6137 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6138 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6139 ; SI-NEXT: s_waitcnt expcnt(0)
6140 ; SI-NEXT: s_setpc_b64 s[30:31]
6142 ; VI-LABEL: global_atomic_umin_i32_noret_offset:
6144 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6145 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6146 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6147 ; VI-NEXT: flat_load_dword v4, v[0:1]
6148 ; VI-NEXT: s_mov_b64 s[4:5], 0
6149 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start
6150 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6151 ; VI-NEXT: s_waitcnt vmcnt(0)
6152 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6153 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6154 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6155 ; VI-NEXT: s_waitcnt vmcnt(0)
6156 ; VI-NEXT: buffer_wbinvl1_vol
6157 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6158 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6159 ; VI-NEXT: v_mov_b32_e32 v4, v3
6160 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6161 ; VI-NEXT: s_cbranch_execnz .LBB88_1
6162 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6163 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6164 ; VI-NEXT: s_setpc_b64 s[30:31]
6166 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset:
6168 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6169 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
6170 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6171 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
6172 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6173 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6174 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6175 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6176 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6177 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6178 ; GFX9-NEXT: buffer_wbinvl1_vol
6179 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6180 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6181 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6182 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6183 ; GFX9-NEXT: s_cbranch_execnz .LBB88_1
6184 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6185 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6186 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6187 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6188 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6192 define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
6193 ; SI-LABEL: global_atomic_umin_i32_ret:
6195 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6196 ; SI-NEXT: s_mov_b32 s6, 0
6197 ; SI-NEXT: s_mov_b32 s7, 0xf000
6198 ; SI-NEXT: s_mov_b32 s4, s6
6199 ; SI-NEXT: s_mov_b32 s5, s6
6200 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
6201 ; SI-NEXT: s_mov_b64 s[8:9], 0
6202 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start
6203 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6204 ; SI-NEXT: s_waitcnt vmcnt(0)
6205 ; SI-NEXT: v_mov_b32_e32 v5, v3
6206 ; SI-NEXT: s_waitcnt expcnt(0)
6207 ; SI-NEXT: v_min_u32_e32 v4, v5, v2
6208 ; SI-NEXT: v_mov_b32_e32 v3, v4
6209 ; SI-NEXT: v_mov_b32_e32 v4, v5
6210 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6211 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
6212 ; SI-NEXT: s_waitcnt vmcnt(0)
6213 ; SI-NEXT: buffer_wbinvl1
6214 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6215 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6216 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6217 ; SI-NEXT: s_cbranch_execnz .LBB89_1
6218 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6219 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6220 ; SI-NEXT: v_mov_b32_e32 v0, v3
6221 ; SI-NEXT: s_waitcnt expcnt(0)
6222 ; SI-NEXT: s_setpc_b64 s[30:31]
6224 ; VI-LABEL: global_atomic_umin_i32_ret:
6226 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6227 ; VI-NEXT: flat_load_dword v3, v[0:1]
6228 ; VI-NEXT: s_mov_b64 s[4:5], 0
6229 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start
6230 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6231 ; VI-NEXT: s_waitcnt vmcnt(0)
6232 ; VI-NEXT: v_mov_b32_e32 v4, v3
6233 ; VI-NEXT: v_min_u32_e32 v3, v4, v2
6234 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6235 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6236 ; VI-NEXT: s_waitcnt vmcnt(0)
6237 ; VI-NEXT: buffer_wbinvl1_vol
6238 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6239 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6240 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6241 ; VI-NEXT: s_cbranch_execnz .LBB89_1
6242 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6243 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6244 ; VI-NEXT: v_mov_b32_e32 v0, v3
6245 ; VI-NEXT: s_setpc_b64 s[30:31]
6247 ; GFX9-LABEL: global_atomic_umin_i32_ret:
6249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6250 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
6251 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6252 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
6253 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6254 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6255 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6256 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6257 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6258 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6259 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6260 ; GFX9-NEXT: buffer_wbinvl1_vol
6261 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6262 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6263 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6264 ; GFX9-NEXT: s_cbranch_execnz .LBB89_1
6265 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6266 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6267 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6268 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6269 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6273 define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
6274 ; SI-LABEL: global_atomic_umin_i32_ret_offset:
6276 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6277 ; SI-NEXT: s_mov_b32 s6, 0
6278 ; SI-NEXT: s_mov_b32 s7, 0xf000
6279 ; SI-NEXT: s_mov_b32 s4, s6
6280 ; SI-NEXT: s_mov_b32 s5, s6
6281 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6282 ; SI-NEXT: s_mov_b64 s[8:9], 0
6283 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start
6284 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6285 ; SI-NEXT: s_waitcnt vmcnt(0)
6286 ; SI-NEXT: v_mov_b32_e32 v5, v3
6287 ; SI-NEXT: s_waitcnt expcnt(0)
6288 ; SI-NEXT: v_min_u32_e32 v4, v5, v2
6289 ; SI-NEXT: v_mov_b32_e32 v3, v4
6290 ; SI-NEXT: v_mov_b32_e32 v4, v5
6291 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6292 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
6293 ; SI-NEXT: s_waitcnt vmcnt(0)
6294 ; SI-NEXT: buffer_wbinvl1
6295 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6296 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6297 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6298 ; SI-NEXT: s_cbranch_execnz .LBB90_1
6299 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6300 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6301 ; SI-NEXT: v_mov_b32_e32 v0, v3
6302 ; SI-NEXT: s_waitcnt expcnt(0)
6303 ; SI-NEXT: s_setpc_b64 s[30:31]
6305 ; VI-LABEL: global_atomic_umin_i32_ret_offset:
6307 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6308 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
6309 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6310 ; VI-NEXT: flat_load_dword v0, v[3:4]
6311 ; VI-NEXT: s_mov_b64 s[4:5], 0
6312 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start
6313 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6314 ; VI-NEXT: s_waitcnt vmcnt(0)
6315 ; VI-NEXT: v_mov_b32_e32 v1, v0
6316 ; VI-NEXT: v_min_u32_e32 v0, v1, v2
6317 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6318 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6319 ; VI-NEXT: s_waitcnt vmcnt(0)
6320 ; VI-NEXT: buffer_wbinvl1_vol
6321 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6322 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6323 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6324 ; VI-NEXT: s_cbranch_execnz .LBB90_1
6325 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6326 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6327 ; VI-NEXT: s_setpc_b64 s[30:31]
6329 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset:
6331 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6332 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
6333 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6334 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
6335 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6336 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6337 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6338 ; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
6339 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6340 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6341 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6342 ; GFX9-NEXT: buffer_wbinvl1_vol
6343 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6344 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6345 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6346 ; GFX9-NEXT: s_cbranch_execnz .LBB90_1
6347 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6348 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6349 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6350 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6351 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6352 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6356 define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6357 ; SI-LABEL: global_atomic_umin_i32_noret_scalar:
6359 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6360 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6361 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
6362 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6363 ; SI-NEXT: s_waitcnt expcnt(0)
6364 ; SI-NEXT: v_writelane_b32 v0, s6, 0
6365 ; SI-NEXT: v_writelane_b32 v0, s7, 1
6366 ; SI-NEXT: s_mov_b32 s34, s6
6367 ; SI-NEXT: s_mov_b32 s7, 0xf000
6368 ; SI-NEXT: s_mov_b32 s6, -1
6369 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
6370 ; SI-NEXT: s_mov_b64 s[36:37], 0
6371 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start
6372 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6373 ; SI-NEXT: s_waitcnt vmcnt(0)
6374 ; SI-NEXT: v_min_u32_e32 v1, s34, v2
6375 ; SI-NEXT: s_waitcnt expcnt(0)
6376 ; SI-NEXT: v_mov_b32_e32 v4, v2
6377 ; SI-NEXT: v_mov_b32_e32 v3, v1
6378 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6379 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
6380 ; SI-NEXT: s_waitcnt vmcnt(0)
6381 ; SI-NEXT: buffer_wbinvl1
6382 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
6383 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6384 ; SI-NEXT: v_mov_b32_e32 v2, v3
6385 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6386 ; SI-NEXT: s_cbranch_execnz .LBB91_1
6387 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6388 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6389 ; SI-NEXT: v_readlane_b32 s7, v0, 1
6390 ; SI-NEXT: v_readlane_b32 s6, v0, 0
6391 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6392 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
6393 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6394 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6395 ; SI-NEXT: s_setpc_b64 s[30:31]
6397 ; VI-LABEL: global_atomic_umin_i32_noret_scalar:
6399 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6400 ; VI-NEXT: v_mov_b32_e32 v0, s4
6401 ; VI-NEXT: v_mov_b32_e32 v1, s5
6402 ; VI-NEXT: flat_load_dword v1, v[0:1]
6403 ; VI-NEXT: s_mov_b64 s[34:35], 0
6404 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start
6405 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6406 ; VI-NEXT: v_mov_b32_e32 v2, s4
6407 ; VI-NEXT: s_waitcnt vmcnt(0)
6408 ; VI-NEXT: v_min_u32_e32 v0, s6, v1
6409 ; VI-NEXT: v_mov_b32_e32 v3, s5
6410 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6411 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
6412 ; VI-NEXT: s_waitcnt vmcnt(0)
6413 ; VI-NEXT: buffer_wbinvl1_vol
6414 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6415 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6416 ; VI-NEXT: v_mov_b32_e32 v1, v0
6417 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6418 ; VI-NEXT: s_cbranch_execnz .LBB91_1
6419 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6420 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6421 ; VI-NEXT: s_setpc_b64 s[30:31]
6423 ; GFX9-LABEL: global_atomic_umin_i32_noret_scalar:
6425 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6426 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6427 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
6428 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6429 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
6430 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6431 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6432 ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1
6433 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6434 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
6435 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6436 ; GFX9-NEXT: buffer_wbinvl1_vol
6437 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6438 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6439 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
6440 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6441 ; GFX9-NEXT: s_cbranch_execnz .LBB91_1
6442 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6443 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6444 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6445 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6449 define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6450 ; SI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6452 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6453 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6454 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
6455 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6456 ; SI-NEXT: s_waitcnt expcnt(0)
6457 ; SI-NEXT: v_writelane_b32 v0, s6, 0
6458 ; SI-NEXT: v_writelane_b32 v0, s7, 1
6459 ; SI-NEXT: s_mov_b32 s34, s6
6460 ; SI-NEXT: s_mov_b32 s7, 0xf000
6461 ; SI-NEXT: s_mov_b32 s6, -1
6462 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
6463 ; SI-NEXT: s_mov_b64 s[36:37], 0
6464 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start
6465 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6466 ; SI-NEXT: s_waitcnt vmcnt(0)
6467 ; SI-NEXT: v_min_u32_e32 v1, s34, v2
6468 ; SI-NEXT: s_waitcnt expcnt(0)
6469 ; SI-NEXT: v_mov_b32_e32 v4, v2
6470 ; SI-NEXT: v_mov_b32_e32 v3, v1
6471 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6472 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
6473 ; SI-NEXT: s_waitcnt vmcnt(0)
6474 ; SI-NEXT: buffer_wbinvl1
6475 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
6476 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6477 ; SI-NEXT: v_mov_b32_e32 v2, v3
6478 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6479 ; SI-NEXT: s_cbranch_execnz .LBB92_1
6480 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6481 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6482 ; SI-NEXT: v_readlane_b32 s7, v0, 1
6483 ; SI-NEXT: v_readlane_b32 s6, v0, 0
6484 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6485 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
6486 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6487 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6488 ; SI-NEXT: s_setpc_b64 s[30:31]
6490 ; VI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6492 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6493 ; VI-NEXT: s_add_u32 s34, s4, 16
6494 ; VI-NEXT: s_addc_u32 s35, s5, 0
6495 ; VI-NEXT: v_mov_b32_e32 v0, s34
6496 ; VI-NEXT: v_mov_b32_e32 v1, s35
6497 ; VI-NEXT: flat_load_dword v1, v[0:1]
6498 ; VI-NEXT: s_mov_b64 s[36:37], 0
6499 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start
6500 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6501 ; VI-NEXT: v_mov_b32_e32 v2, s34
6502 ; VI-NEXT: s_waitcnt vmcnt(0)
6503 ; VI-NEXT: v_min_u32_e32 v0, s6, v1
6504 ; VI-NEXT: v_mov_b32_e32 v3, s35
6505 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6506 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
6507 ; VI-NEXT: s_waitcnt vmcnt(0)
6508 ; VI-NEXT: buffer_wbinvl1_vol
6509 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6510 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6511 ; VI-NEXT: v_mov_b32_e32 v1, v0
6512 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6513 ; VI-NEXT: s_cbranch_execnz .LBB92_1
6514 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6515 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
6516 ; VI-NEXT: s_setpc_b64 s[30:31]
6518 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar:
6520 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6521 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6522 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
6523 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6524 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
6525 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6526 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6527 ; GFX9-NEXT: v_min_u32_e32 v0, s6, v1
6528 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6529 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
6530 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6531 ; GFX9-NEXT: buffer_wbinvl1_vol
6532 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6533 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6534 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
6535 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6536 ; GFX9-NEXT: s_cbranch_execnz .LBB92_1
6537 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6538 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6539 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6540 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6541 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6545 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
6546 ; SI-LABEL: global_atomic_umin_i32_ret_scalar:
6548 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6549 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6550 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
6551 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6552 ; SI-NEXT: s_waitcnt expcnt(0)
6553 ; SI-NEXT: v_writelane_b32 v1, s6, 0
6554 ; SI-NEXT: v_writelane_b32 v1, s7, 1
6555 ; SI-NEXT: s_mov_b32 s34, s6
6556 ; SI-NEXT: s_mov_b32 s7, 0xf000
6557 ; SI-NEXT: s_mov_b32 s6, -1
6558 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
6559 ; SI-NEXT: s_mov_b64 s[36:37], 0
6560 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start
6561 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6562 ; SI-NEXT: s_waitcnt vmcnt(0)
6563 ; SI-NEXT: v_mov_b32_e32 v4, v2
6564 ; SI-NEXT: s_waitcnt expcnt(0)
6565 ; SI-NEXT: v_min_u32_e32 v3, s34, v4
6566 ; SI-NEXT: v_mov_b32_e32 v2, v3
6567 ; SI-NEXT: v_mov_b32_e32 v3, v4
6568 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6569 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
6570 ; SI-NEXT: s_waitcnt vmcnt(0)
6571 ; SI-NEXT: buffer_wbinvl1
6572 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
6573 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6574 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6575 ; SI-NEXT: s_cbranch_execnz .LBB93_1
6576 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6577 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6578 ; SI-NEXT: v_mov_b32_e32 v0, v2
6579 ; SI-NEXT: v_readlane_b32 s7, v1, 1
6580 ; SI-NEXT: v_readlane_b32 s6, v1, 0
6581 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6582 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
6583 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6584 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6585 ; SI-NEXT: s_setpc_b64 s[30:31]
6587 ; VI-LABEL: global_atomic_umin_i32_ret_scalar:
6589 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6590 ; VI-NEXT: v_mov_b32_e32 v0, s4
6591 ; VI-NEXT: v_mov_b32_e32 v1, s5
6592 ; VI-NEXT: flat_load_dword v0, v[0:1]
6593 ; VI-NEXT: s_mov_b64 s[34:35], 0
6594 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start
6595 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6596 ; VI-NEXT: s_waitcnt vmcnt(0)
6597 ; VI-NEXT: v_mov_b32_e32 v1, v0
6598 ; VI-NEXT: v_mov_b32_e32 v2, s4
6599 ; VI-NEXT: v_mov_b32_e32 v3, s5
6600 ; VI-NEXT: v_min_u32_e32 v0, s6, v1
6601 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6602 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
6603 ; VI-NEXT: s_waitcnt vmcnt(0)
6604 ; VI-NEXT: buffer_wbinvl1_vol
6605 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6606 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6607 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6608 ; VI-NEXT: s_cbranch_execnz .LBB93_1
6609 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6610 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6611 ; VI-NEXT: s_setpc_b64 s[30:31]
6613 ; GFX9-LABEL: global_atomic_umin_i32_ret_scalar:
6615 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6616 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6617 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
6618 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6619 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
6620 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6621 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6622 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6623 ; GFX9-NEXT: v_min_u32_e32 v2, s6, v3
6624 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6625 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
6626 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6627 ; GFX9-NEXT: buffer_wbinvl1_vol
6628 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6629 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6630 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6631 ; GFX9-NEXT: s_cbranch_execnz .LBB93_1
6632 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6633 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6634 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6635 %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %in seq_cst
6639 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
6640 ; SI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6642 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6643 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6644 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
6645 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6646 ; SI-NEXT: s_waitcnt expcnt(0)
6647 ; SI-NEXT: v_writelane_b32 v1, s6, 0
6648 ; SI-NEXT: v_writelane_b32 v1, s7, 1
6649 ; SI-NEXT: s_mov_b32 s34, s6
6650 ; SI-NEXT: s_mov_b32 s7, 0xf000
6651 ; SI-NEXT: s_mov_b32 s6, -1
6652 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
6653 ; SI-NEXT: s_mov_b64 s[36:37], 0
6654 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start
6655 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6656 ; SI-NEXT: s_waitcnt vmcnt(0)
6657 ; SI-NEXT: v_mov_b32_e32 v4, v2
6658 ; SI-NEXT: s_waitcnt expcnt(0)
6659 ; SI-NEXT: v_min_u32_e32 v3, s34, v4
6660 ; SI-NEXT: v_mov_b32_e32 v2, v3
6661 ; SI-NEXT: v_mov_b32_e32 v3, v4
6662 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6663 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
6664 ; SI-NEXT: s_waitcnt vmcnt(0)
6665 ; SI-NEXT: buffer_wbinvl1
6666 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
6667 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6668 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6669 ; SI-NEXT: s_cbranch_execnz .LBB94_1
6670 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6671 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6672 ; SI-NEXT: v_mov_b32_e32 v0, v2
6673 ; SI-NEXT: v_readlane_b32 s7, v1, 1
6674 ; SI-NEXT: v_readlane_b32 s6, v1, 0
6675 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6676 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
6677 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6678 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6679 ; SI-NEXT: s_setpc_b64 s[30:31]
6681 ; VI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6683 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6684 ; VI-NEXT: s_add_u32 s34, s4, 16
6685 ; VI-NEXT: s_addc_u32 s35, s5, 0
6686 ; VI-NEXT: v_mov_b32_e32 v0, s34
6687 ; VI-NEXT: v_mov_b32_e32 v1, s35
6688 ; VI-NEXT: flat_load_dword v0, v[0:1]
6689 ; VI-NEXT: s_mov_b64 s[36:37], 0
6690 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start
6691 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6692 ; VI-NEXT: s_waitcnt vmcnt(0)
6693 ; VI-NEXT: v_mov_b32_e32 v1, v0
6694 ; VI-NEXT: v_mov_b32_e32 v2, s34
6695 ; VI-NEXT: v_mov_b32_e32 v3, s35
6696 ; VI-NEXT: v_min_u32_e32 v0, s6, v1
6697 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6698 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
6699 ; VI-NEXT: s_waitcnt vmcnt(0)
6700 ; VI-NEXT: buffer_wbinvl1_vol
6701 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6702 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6703 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6704 ; VI-NEXT: s_cbranch_execnz .LBB94_1
6705 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6706 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
6707 ; VI-NEXT: s_setpc_b64 s[30:31]
6709 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar:
6711 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6712 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6713 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
6714 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6715 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
6716 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6717 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6718 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
6719 ; GFX9-NEXT: v_min_u32_e32 v2, s6, v3
6720 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6721 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
6722 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6723 ; GFX9-NEXT: buffer_wbinvl1_vol
6724 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
6725 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6726 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6727 ; GFX9-NEXT: s_cbranch_execnz .LBB94_1
6728 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6729 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6730 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6731 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6732 %result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst
6736 ; ---------------------------------------------------------------------
6738 ; ---------------------------------------------------------------------
6740 define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
6741 ; SI-LABEL: global_atomic_min_i32_noret:
6743 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6744 ; SI-NEXT: s_mov_b32 s6, 0
6745 ; SI-NEXT: s_mov_b32 s7, 0xf000
6746 ; SI-NEXT: s_mov_b32 s4, s6
6747 ; SI-NEXT: s_mov_b32 s5, s6
6748 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
6749 ; SI-NEXT: s_mov_b64 s[8:9], 0
6750 ; SI-NEXT: .LBB95_1: ; %atomicrmw.start
6751 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6752 ; SI-NEXT: s_waitcnt vmcnt(0)
6753 ; SI-NEXT: v_min_i32_e32 v3, v4, v2
6754 ; SI-NEXT: s_waitcnt expcnt(0)
6755 ; SI-NEXT: v_mov_b32_e32 v6, v4
6756 ; SI-NEXT: v_mov_b32_e32 v5, v3
6757 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6758 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
6759 ; SI-NEXT: s_waitcnt vmcnt(0)
6760 ; SI-NEXT: buffer_wbinvl1
6761 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6762 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6763 ; SI-NEXT: v_mov_b32_e32 v4, v5
6764 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6765 ; SI-NEXT: s_cbranch_execnz .LBB95_1
6766 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6767 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6768 ; SI-NEXT: s_waitcnt expcnt(0)
6769 ; SI-NEXT: s_setpc_b64 s[30:31]
6771 ; VI-LABEL: global_atomic_min_i32_noret:
6773 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6774 ; VI-NEXT: flat_load_dword v4, v[0:1]
6775 ; VI-NEXT: s_mov_b64 s[4:5], 0
6776 ; VI-NEXT: .LBB95_1: ; %atomicrmw.start
6777 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6778 ; VI-NEXT: s_waitcnt vmcnt(0)
6779 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
6780 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6781 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6782 ; VI-NEXT: s_waitcnt vmcnt(0)
6783 ; VI-NEXT: buffer_wbinvl1_vol
6784 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6785 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6786 ; VI-NEXT: v_mov_b32_e32 v4, v3
6787 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6788 ; VI-NEXT: s_cbranch_execnz .LBB95_1
6789 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6790 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6791 ; VI-NEXT: s_setpc_b64 s[30:31]
6793 ; GFX9-LABEL: global_atomic_min_i32_noret:
6795 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6796 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
6797 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6798 ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
6799 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6800 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6801 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
6802 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6803 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6804 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6805 ; GFX9-NEXT: buffer_wbinvl1_vol
6806 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6807 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6808 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6809 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6810 ; GFX9-NEXT: s_cbranch_execnz .LBB95_1
6811 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6812 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6813 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6814 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
6818 define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
6819 ; SI-LABEL: global_atomic_min_i32_noret_offset:
6821 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6822 ; SI-NEXT: s_mov_b32 s6, 0
6823 ; SI-NEXT: s_mov_b32 s7, 0xf000
6824 ; SI-NEXT: s_mov_b32 s4, s6
6825 ; SI-NEXT: s_mov_b32 s5, s6
6826 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
6827 ; SI-NEXT: s_mov_b64 s[8:9], 0
6828 ; SI-NEXT: .LBB96_1: ; %atomicrmw.start
6829 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6830 ; SI-NEXT: s_waitcnt vmcnt(0)
6831 ; SI-NEXT: v_min_i32_e32 v3, v4, v2
6832 ; SI-NEXT: s_waitcnt expcnt(0)
6833 ; SI-NEXT: v_mov_b32_e32 v6, v4
6834 ; SI-NEXT: v_mov_b32_e32 v5, v3
6835 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6836 ; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
6837 ; SI-NEXT: s_waitcnt vmcnt(0)
6838 ; SI-NEXT: buffer_wbinvl1
6839 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6840 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6841 ; SI-NEXT: v_mov_b32_e32 v4, v5
6842 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6843 ; SI-NEXT: s_cbranch_execnz .LBB96_1
6844 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6845 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6846 ; SI-NEXT: s_waitcnt expcnt(0)
6847 ; SI-NEXT: s_setpc_b64 s[30:31]
6849 ; VI-LABEL: global_atomic_min_i32_noret_offset:
6851 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6852 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6853 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6854 ; VI-NEXT: flat_load_dword v4, v[0:1]
6855 ; VI-NEXT: s_mov_b64 s[4:5], 0
6856 ; VI-NEXT: .LBB96_1: ; %atomicrmw.start
6857 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6858 ; VI-NEXT: s_waitcnt vmcnt(0)
6859 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
6860 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6861 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6862 ; VI-NEXT: s_waitcnt vmcnt(0)
6863 ; VI-NEXT: buffer_wbinvl1_vol
6864 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6865 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6866 ; VI-NEXT: v_mov_b32_e32 v4, v3
6867 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6868 ; VI-NEXT: s_cbranch_execnz .LBB96_1
6869 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6870 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6871 ; VI-NEXT: s_setpc_b64 s[30:31]
6873 ; GFX9-LABEL: global_atomic_min_i32_noret_offset:
6875 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6876 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
6877 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6878 ; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
6879 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6880 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6881 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
6882 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6883 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
6884 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6885 ; GFX9-NEXT: buffer_wbinvl1_vol
6886 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6887 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6888 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6889 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6890 ; GFX9-NEXT: s_cbranch_execnz .LBB96_1
6891 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6892 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6893 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6894 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
6895 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
6899 define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
6900 ; SI-LABEL: global_atomic_min_i32_ret:
6902 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6903 ; SI-NEXT: s_mov_b32 s6, 0
6904 ; SI-NEXT: s_mov_b32 s7, 0xf000
6905 ; SI-NEXT: s_mov_b32 s4, s6
6906 ; SI-NEXT: s_mov_b32 s5, s6
6907 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
6908 ; SI-NEXT: s_mov_b64 s[8:9], 0
6909 ; SI-NEXT: .LBB97_1: ; %atomicrmw.start
6910 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6911 ; SI-NEXT: s_waitcnt vmcnt(0)
6912 ; SI-NEXT: v_mov_b32_e32 v5, v3
6913 ; SI-NEXT: s_waitcnt expcnt(0)
6914 ; SI-NEXT: v_min_i32_e32 v4, v5, v2
6915 ; SI-NEXT: v_mov_b32_e32 v3, v4
6916 ; SI-NEXT: v_mov_b32_e32 v4, v5
6917 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6918 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
6919 ; SI-NEXT: s_waitcnt vmcnt(0)
6920 ; SI-NEXT: buffer_wbinvl1
6921 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
6922 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6923 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6924 ; SI-NEXT: s_cbranch_execnz .LBB97_1
6925 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6926 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6927 ; SI-NEXT: v_mov_b32_e32 v0, v3
6928 ; SI-NEXT: s_waitcnt expcnt(0)
6929 ; SI-NEXT: s_setpc_b64 s[30:31]
6931 ; VI-LABEL: global_atomic_min_i32_ret:
6933 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6934 ; VI-NEXT: flat_load_dword v3, v[0:1]
6935 ; VI-NEXT: s_mov_b64 s[4:5], 0
6936 ; VI-NEXT: .LBB97_1: ; %atomicrmw.start
6937 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6938 ; VI-NEXT: s_waitcnt vmcnt(0)
6939 ; VI-NEXT: v_mov_b32_e32 v4, v3
6940 ; VI-NEXT: v_min_i32_e32 v3, v4, v2
6941 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6942 ; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6943 ; VI-NEXT: s_waitcnt vmcnt(0)
6944 ; VI-NEXT: buffer_wbinvl1_vol
6945 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6946 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6947 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6948 ; VI-NEXT: s_cbranch_execnz .LBB97_1
6949 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6950 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6951 ; VI-NEXT: v_mov_b32_e32 v0, v3
6952 ; VI-NEXT: s_setpc_b64 s[30:31]
6954 ; GFX9-LABEL: global_atomic_min_i32_ret:
6956 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6957 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
6958 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6959 ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start
6960 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6961 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6962 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
6963 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
6964 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6965 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
6966 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6967 ; GFX9-NEXT: buffer_wbinvl1_vol
6968 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6969 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6970 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6971 ; GFX9-NEXT: s_cbranch_execnz .LBB97_1
6972 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6973 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6974 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
6975 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6976 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
6980 define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
6981 ; SI-LABEL: global_atomic_min_i32_ret_offset:
6983 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6984 ; SI-NEXT: s_mov_b32 s6, 0
6985 ; SI-NEXT: s_mov_b32 s7, 0xf000
6986 ; SI-NEXT: s_mov_b32 s4, s6
6987 ; SI-NEXT: s_mov_b32 s5, s6
6988 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
6989 ; SI-NEXT: s_mov_b64 s[8:9], 0
6990 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start
6991 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6992 ; SI-NEXT: s_waitcnt vmcnt(0)
6993 ; SI-NEXT: v_mov_b32_e32 v5, v3
6994 ; SI-NEXT: s_waitcnt expcnt(0)
6995 ; SI-NEXT: v_min_i32_e32 v4, v5, v2
6996 ; SI-NEXT: v_mov_b32_e32 v3, v4
6997 ; SI-NEXT: v_mov_b32_e32 v4, v5
6998 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6999 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
7000 ; SI-NEXT: s_waitcnt vmcnt(0)
7001 ; SI-NEXT: buffer_wbinvl1
7002 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
7003 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7004 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7005 ; SI-NEXT: s_cbranch_execnz .LBB98_1
7006 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7007 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7008 ; SI-NEXT: v_mov_b32_e32 v0, v3
7009 ; SI-NEXT: s_waitcnt expcnt(0)
7010 ; SI-NEXT: s_setpc_b64 s[30:31]
7012 ; VI-LABEL: global_atomic_min_i32_ret_offset:
7014 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7015 ; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
7016 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
7017 ; VI-NEXT: flat_load_dword v0, v[3:4]
7018 ; VI-NEXT: s_mov_b64 s[4:5], 0
7019 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start
7020 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7021 ; VI-NEXT: s_waitcnt vmcnt(0)
7022 ; VI-NEXT: v_mov_b32_e32 v1, v0
7023 ; VI-NEXT: v_min_i32_e32 v0, v1, v2
7024 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7025 ; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7026 ; VI-NEXT: s_waitcnt vmcnt(0)
7027 ; VI-NEXT: buffer_wbinvl1_vol
7028 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7029 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7030 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7031 ; VI-NEXT: s_cbranch_execnz .LBB98_1
7032 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7033 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7034 ; VI-NEXT: s_setpc_b64 s[30:31]
7036 ; GFX9-LABEL: global_atomic_min_i32_ret_offset:
7038 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7039 ; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
7040 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7041 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
7042 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7043 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7044 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
7045 ; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
7046 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7047 ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
7048 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7049 ; GFX9-NEXT: buffer_wbinvl1_vol
7050 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7051 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7052 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7053 ; GFX9-NEXT: s_cbranch_execnz .LBB98_1
7054 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7055 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7056 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
7057 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7058 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7059 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7063 define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7064 ; SI-LABEL: global_atomic_min_i32_noret_scalar:
7066 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7067 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7068 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
7069 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7070 ; SI-NEXT: s_waitcnt expcnt(0)
7071 ; SI-NEXT: v_writelane_b32 v0, s6, 0
7072 ; SI-NEXT: v_writelane_b32 v0, s7, 1
7073 ; SI-NEXT: s_mov_b32 s34, s6
7074 ; SI-NEXT: s_mov_b32 s7, 0xf000
7075 ; SI-NEXT: s_mov_b32 s6, -1
7076 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
7077 ; SI-NEXT: s_mov_b64 s[36:37], 0
7078 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start
7079 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7080 ; SI-NEXT: s_waitcnt vmcnt(0)
7081 ; SI-NEXT: v_min_i32_e32 v1, s34, v2
7082 ; SI-NEXT: s_waitcnt expcnt(0)
7083 ; SI-NEXT: v_mov_b32_e32 v4, v2
7084 ; SI-NEXT: v_mov_b32_e32 v3, v1
7085 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7086 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
7087 ; SI-NEXT: s_waitcnt vmcnt(0)
7088 ; SI-NEXT: buffer_wbinvl1
7089 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
7090 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7091 ; SI-NEXT: v_mov_b32_e32 v2, v3
7092 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7093 ; SI-NEXT: s_cbranch_execnz .LBB99_1
7094 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7095 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7096 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7097 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7098 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7099 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7100 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7101 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7102 ; SI-NEXT: s_setpc_b64 s[30:31]
7104 ; VI-LABEL: global_atomic_min_i32_noret_scalar:
7106 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7107 ; VI-NEXT: v_mov_b32_e32 v0, s4
7108 ; VI-NEXT: v_mov_b32_e32 v1, s5
7109 ; VI-NEXT: flat_load_dword v1, v[0:1]
7110 ; VI-NEXT: s_mov_b64 s[34:35], 0
7111 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start
7112 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7113 ; VI-NEXT: v_mov_b32_e32 v2, s4
7114 ; VI-NEXT: s_waitcnt vmcnt(0)
7115 ; VI-NEXT: v_min_i32_e32 v0, s6, v1
7116 ; VI-NEXT: v_mov_b32_e32 v3, s5
7117 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7118 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7119 ; VI-NEXT: s_waitcnt vmcnt(0)
7120 ; VI-NEXT: buffer_wbinvl1_vol
7121 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7122 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7123 ; VI-NEXT: v_mov_b32_e32 v1, v0
7124 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7125 ; VI-NEXT: s_cbranch_execnz .LBB99_1
7126 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7127 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7128 ; VI-NEXT: s_setpc_b64 s[30:31]
7130 ; GFX9-LABEL: global_atomic_min_i32_noret_scalar:
7132 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7133 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7134 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
7135 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7136 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
7137 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7138 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7139 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
7140 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7141 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
7142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7143 ; GFX9-NEXT: buffer_wbinvl1_vol
7144 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7145 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7146 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7147 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7148 ; GFX9-NEXT: s_cbranch_execnz .LBB99_1
7149 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7150 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7151 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7152 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7156 define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7157 ; SI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7159 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7160 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7161 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
7162 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7163 ; SI-NEXT: s_waitcnt expcnt(0)
7164 ; SI-NEXT: v_writelane_b32 v0, s6, 0
7165 ; SI-NEXT: v_writelane_b32 v0, s7, 1
7166 ; SI-NEXT: s_mov_b32 s34, s6
7167 ; SI-NEXT: s_mov_b32 s7, 0xf000
7168 ; SI-NEXT: s_mov_b32 s6, -1
7169 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
7170 ; SI-NEXT: s_mov_b64 s[36:37], 0
7171 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start
7172 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7173 ; SI-NEXT: s_waitcnt vmcnt(0)
7174 ; SI-NEXT: v_min_i32_e32 v1, s34, v2
7175 ; SI-NEXT: s_waitcnt expcnt(0)
7176 ; SI-NEXT: v_mov_b32_e32 v4, v2
7177 ; SI-NEXT: v_mov_b32_e32 v3, v1
7178 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7179 ; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
7180 ; SI-NEXT: s_waitcnt vmcnt(0)
7181 ; SI-NEXT: buffer_wbinvl1
7182 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
7183 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7184 ; SI-NEXT: v_mov_b32_e32 v2, v3
7185 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7186 ; SI-NEXT: s_cbranch_execnz .LBB100_1
7187 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7188 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7189 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7190 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7191 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7192 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7193 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7194 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7195 ; SI-NEXT: s_setpc_b64 s[30:31]
7197 ; VI-LABEL: global_atomic_min_i32_noret_offset_scalar:
7199 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7200 ; VI-NEXT: s_add_u32 s34, s4, 16
7201 ; VI-NEXT: s_addc_u32 s35, s5, 0
7202 ; VI-NEXT: v_mov_b32_e32 v0, s34
7203 ; VI-NEXT: v_mov_b32_e32 v1, s35
7204 ; VI-NEXT: flat_load_dword v1, v[0:1]
7205 ; VI-NEXT: s_mov_b64 s[36:37], 0
7206 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start
7207 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7208 ; VI-NEXT: v_mov_b32_e32 v2, s34
7209 ; VI-NEXT: s_waitcnt vmcnt(0)
7210 ; VI-NEXT: v_min_i32_e32 v0, s6, v1
7211 ; VI-NEXT: v_mov_b32_e32 v3, s35
7212 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7213 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7214 ; VI-NEXT: s_waitcnt vmcnt(0)
7215 ; VI-NEXT: buffer_wbinvl1_vol
7216 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7217 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7218 ; VI-NEXT: v_mov_b32_e32 v1, v0
7219 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7220 ; VI-NEXT: s_cbranch_execnz .LBB100_1
7221 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7222 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
7223 ; VI-NEXT: s_setpc_b64 s[30:31]
7225 ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar:
7227 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7228 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7229 ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
7230 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7231 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
7232 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7233 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7234 ; GFX9-NEXT: v_min_i32_e32 v0, s6, v1
7235 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7236 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
7237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7238 ; GFX9-NEXT: buffer_wbinvl1_vol
7239 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7240 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7241 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7242 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7243 ; GFX9-NEXT: s_cbranch_execnz .LBB100_1
7244 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7245 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7246 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7247 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7248 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7252 define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
7253 ; SI-LABEL: global_atomic_min_i32_ret_scalar:
7255 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7256 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7257 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
7258 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7259 ; SI-NEXT: s_waitcnt expcnt(0)
7260 ; SI-NEXT: v_writelane_b32 v1, s6, 0
7261 ; SI-NEXT: v_writelane_b32 v1, s7, 1
7262 ; SI-NEXT: s_mov_b32 s34, s6
7263 ; SI-NEXT: s_mov_b32 s7, 0xf000
7264 ; SI-NEXT: s_mov_b32 s6, -1
7265 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
7266 ; SI-NEXT: s_mov_b64 s[36:37], 0
7267 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start
7268 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7269 ; SI-NEXT: s_waitcnt vmcnt(0)
7270 ; SI-NEXT: v_mov_b32_e32 v4, v2
7271 ; SI-NEXT: s_waitcnt expcnt(0)
7272 ; SI-NEXT: v_min_i32_e32 v3, s34, v4
7273 ; SI-NEXT: v_mov_b32_e32 v2, v3
7274 ; SI-NEXT: v_mov_b32_e32 v3, v4
7275 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7276 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
7277 ; SI-NEXT: s_waitcnt vmcnt(0)
7278 ; SI-NEXT: buffer_wbinvl1
7279 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
7280 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7281 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7282 ; SI-NEXT: s_cbranch_execnz .LBB101_1
7283 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7284 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7285 ; SI-NEXT: v_mov_b32_e32 v0, v2
7286 ; SI-NEXT: v_readlane_b32 s7, v1, 1
7287 ; SI-NEXT: v_readlane_b32 s6, v1, 0
7288 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7289 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
7290 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7291 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7292 ; SI-NEXT: s_setpc_b64 s[30:31]
7294 ; VI-LABEL: global_atomic_min_i32_ret_scalar:
7296 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7297 ; VI-NEXT: v_mov_b32_e32 v0, s4
7298 ; VI-NEXT: v_mov_b32_e32 v1, s5
7299 ; VI-NEXT: flat_load_dword v0, v[0:1]
7300 ; VI-NEXT: s_mov_b64 s[34:35], 0
7301 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start
7302 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7303 ; VI-NEXT: s_waitcnt vmcnt(0)
7304 ; VI-NEXT: v_mov_b32_e32 v1, v0
7305 ; VI-NEXT: v_mov_b32_e32 v2, s4
7306 ; VI-NEXT: v_mov_b32_e32 v3, s5
7307 ; VI-NEXT: v_min_i32_e32 v0, s6, v1
7308 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7309 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7310 ; VI-NEXT: s_waitcnt vmcnt(0)
7311 ; VI-NEXT: buffer_wbinvl1_vol
7312 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7313 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7314 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7315 ; VI-NEXT: s_cbranch_execnz .LBB101_1
7316 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7317 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7318 ; VI-NEXT: s_setpc_b64 s[30:31]
7320 ; GFX9-LABEL: global_atomic_min_i32_ret_scalar:
7322 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7323 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7324 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
7325 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7326 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
7327 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7328 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7329 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7330 ; GFX9-NEXT: v_min_i32_e32 v2, s6, v3
7331 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7332 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
7333 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7334 ; GFX9-NEXT: buffer_wbinvl1_vol
7335 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7336 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7337 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7338 ; GFX9-NEXT: s_cbranch_execnz .LBB101_1
7339 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7340 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7342 %result = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7346 define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
7347 ; SI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7349 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7350 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7351 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
7352 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7353 ; SI-NEXT: s_waitcnt expcnt(0)
7354 ; SI-NEXT: v_writelane_b32 v1, s6, 0
7355 ; SI-NEXT: v_writelane_b32 v1, s7, 1
7356 ; SI-NEXT: s_mov_b32 s34, s6
7357 ; SI-NEXT: s_mov_b32 s7, 0xf000
7358 ; SI-NEXT: s_mov_b32 s6, -1
7359 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
7360 ; SI-NEXT: s_mov_b64 s[36:37], 0
7361 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start
7362 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7363 ; SI-NEXT: s_waitcnt vmcnt(0)
7364 ; SI-NEXT: v_mov_b32_e32 v4, v2
7365 ; SI-NEXT: s_waitcnt expcnt(0)
7366 ; SI-NEXT: v_min_i32_e32 v3, s34, v4
7367 ; SI-NEXT: v_mov_b32_e32 v2, v3
7368 ; SI-NEXT: v_mov_b32_e32 v3, v4
7369 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7370 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7371 ; SI-NEXT: s_waitcnt vmcnt(0)
7372 ; SI-NEXT: buffer_wbinvl1
7373 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
7374 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7375 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7376 ; SI-NEXT: s_cbranch_execnz .LBB102_1
7377 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7378 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7379 ; SI-NEXT: v_mov_b32_e32 v0, v2
7380 ; SI-NEXT: v_readlane_b32 s7, v1, 1
7381 ; SI-NEXT: v_readlane_b32 s6, v1, 0
7382 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7383 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
7384 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7385 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7386 ; SI-NEXT: s_setpc_b64 s[30:31]
7388 ; VI-LABEL: global_atomic_min_i32_ret_offset_scalar:
7390 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7391 ; VI-NEXT: s_add_u32 s34, s4, 16
7392 ; VI-NEXT: s_addc_u32 s35, s5, 0
7393 ; VI-NEXT: v_mov_b32_e32 v0, s34
7394 ; VI-NEXT: v_mov_b32_e32 v1, s35
7395 ; VI-NEXT: flat_load_dword v0, v[0:1]
7396 ; VI-NEXT: s_mov_b64 s[36:37], 0
7397 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start
7398 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7399 ; VI-NEXT: s_waitcnt vmcnt(0)
7400 ; VI-NEXT: v_mov_b32_e32 v1, v0
7401 ; VI-NEXT: v_mov_b32_e32 v2, s34
7402 ; VI-NEXT: v_mov_b32_e32 v3, s35
7403 ; VI-NEXT: v_min_i32_e32 v0, s6, v1
7404 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7405 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7406 ; VI-NEXT: s_waitcnt vmcnt(0)
7407 ; VI-NEXT: buffer_wbinvl1_vol
7408 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7409 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7410 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7411 ; VI-NEXT: s_cbranch_execnz .LBB102_1
7412 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7413 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
7414 ; VI-NEXT: s_setpc_b64 s[30:31]
7416 ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar:
7418 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7419 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7420 ; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
7421 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7422 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
7423 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7424 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7425 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7426 ; GFX9-NEXT: v_min_i32_e32 v2, s6, v3
7427 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7428 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
7429 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7430 ; GFX9-NEXT: buffer_wbinvl1_vol
7431 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7432 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7433 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7434 ; GFX9-NEXT: s_cbranch_execnz .LBB102_1
7435 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7436 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7437 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7438 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7439 %result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7443 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i32 %index) {
7444 ; SI-LABEL: atomic_min_i32_addr64_offset:
7445 ; SI: ; %bb.0: ; %entry
7446 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7447 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7448 ; SI-NEXT: s_ashr_i32 s5, s3, 31
7449 ; SI-NEXT: s_mov_b32 s4, s3
7450 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7451 ; SI-NEXT: s_add_u32 s4, s0, s4
7452 ; SI-NEXT: s_addc_u32 s5, s1, s5
7453 ; SI-NEXT: s_load_dword s3, s[4:5], 0x4
7454 ; SI-NEXT: s_mov_b64 s[0:1], 0
7455 ; SI-NEXT: s_mov_b32 s7, 0xf000
7456 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7457 ; SI-NEXT: v_mov_b32_e32 v1, s3
7458 ; SI-NEXT: s_mov_b32 s6, -1
7459 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start
7460 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7461 ; SI-NEXT: v_min_i32_e32 v0, s2, v1
7462 ; SI-NEXT: s_waitcnt expcnt(0)
7463 ; SI-NEXT: v_mov_b32_e32 v3, v1
7464 ; SI-NEXT: v_mov_b32_e32 v2, v0
7465 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7466 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7467 ; SI-NEXT: s_waitcnt vmcnt(0)
7468 ; SI-NEXT: buffer_wbinvl1
7469 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7470 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7471 ; SI-NEXT: v_mov_b32_e32 v1, v2
7472 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7473 ; SI-NEXT: s_cbranch_execnz .LBB103_1
7474 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7477 ; VI-LABEL: atomic_min_i32_addr64_offset:
7478 ; VI: ; %bb.0: ; %entry
7479 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7480 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7481 ; VI-NEXT: s_ashr_i32 s5, s3, 31
7482 ; VI-NEXT: s_mov_b32 s4, s3
7483 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7484 ; VI-NEXT: s_add_u32 s0, s0, s4
7485 ; VI-NEXT: s_addc_u32 s1, s1, s5
7486 ; VI-NEXT: s_load_dword s3, s[0:1], 0x10
7487 ; VI-NEXT: s_add_u32 s0, s0, 16
7488 ; VI-NEXT: s_addc_u32 s1, s1, 0
7489 ; VI-NEXT: s_mov_b64 s[4:5], 0
7490 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7491 ; VI-NEXT: v_mov_b32_e32 v1, s3
7492 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start
7493 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7494 ; VI-NEXT: v_mov_b32_e32 v3, s1
7495 ; VI-NEXT: v_min_i32_e32 v0, s2, v1
7496 ; VI-NEXT: v_mov_b32_e32 v2, s0
7497 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7498 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7499 ; VI-NEXT: s_waitcnt vmcnt(0)
7500 ; VI-NEXT: buffer_wbinvl1_vol
7501 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7502 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7503 ; VI-NEXT: v_mov_b32_e32 v1, v0
7504 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7505 ; VI-NEXT: s_cbranch_execnz .LBB103_1
7506 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7509 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
7510 ; GFX9: ; %bb.0: ; %entry
7511 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7512 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7513 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7514 ; GFX9-NEXT: s_ashr_i32 s5, s3, 31
7515 ; GFX9-NEXT: s_mov_b32 s4, s3
7516 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7517 ; GFX9-NEXT: s_add_u32 s0, s0, s4
7518 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
7519 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
7520 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7521 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7522 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
7523 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
7524 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7525 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v1
7526 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7527 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
7528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7529 ; GFX9-NEXT: buffer_wbinvl1_vol
7530 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7531 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7532 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7533 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7534 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1
7535 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7536 ; GFX9-NEXT: s_endpgm
7538 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
7539 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
7540 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7544 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
7545 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
7546 ; SI: ; %bb.0: ; %entry
7547 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
7548 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7549 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7550 ; SI-NEXT: s_ashr_i32 s5, s9, 31
7551 ; SI-NEXT: s_mov_b32 s4, s9
7552 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7553 ; SI-NEXT: s_add_u32 s4, s0, s4
7554 ; SI-NEXT: s_addc_u32 s5, s1, s5
7555 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
7556 ; SI-NEXT: s_mov_b64 s[0:1], 0
7557 ; SI-NEXT: s_mov_b32 s7, 0xf000
7558 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7559 ; SI-NEXT: v_mov_b32_e32 v1, s6
7560 ; SI-NEXT: s_mov_b32 s6, -1
7561 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start
7562 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7563 ; SI-NEXT: v_min_i32_e32 v0, s8, v1
7564 ; SI-NEXT: s_waitcnt expcnt(0)
7565 ; SI-NEXT: v_mov_b32_e32 v3, v1
7566 ; SI-NEXT: v_mov_b32_e32 v2, v0
7567 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7568 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
7569 ; SI-NEXT: s_waitcnt vmcnt(0)
7570 ; SI-NEXT: buffer_wbinvl1
7571 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7572 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7573 ; SI-NEXT: v_mov_b32_e32 v1, v2
7574 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7575 ; SI-NEXT: s_cbranch_execnz .LBB104_1
7576 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7577 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
7578 ; SI-NEXT: s_mov_b32 s7, 0xf000
7579 ; SI-NEXT: s_mov_b32 s6, -1
7580 ; SI-NEXT: s_mov_b32 s4, s2
7581 ; SI-NEXT: s_mov_b32 s5, s3
7582 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
7585 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
7586 ; VI: ; %bb.0: ; %entry
7587 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
7588 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7589 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7590 ; VI-NEXT: s_ashr_i32 s7, s5, 31
7591 ; VI-NEXT: s_mov_b32 s6, s5
7592 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
7593 ; VI-NEXT: s_add_u32 s0, s0, s6
7594 ; VI-NEXT: s_addc_u32 s1, s1, s7
7595 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10
7596 ; VI-NEXT: s_add_u32 s0, s0, 16
7597 ; VI-NEXT: s_addc_u32 s1, s1, 0
7598 ; VI-NEXT: s_mov_b64 s[6:7], 0
7599 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7600 ; VI-NEXT: v_mov_b32_e32 v0, s5
7601 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start
7602 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7603 ; VI-NEXT: v_mov_b32_e32 v1, v0
7604 ; VI-NEXT: v_mov_b32_e32 v3, s1
7605 ; VI-NEXT: v_mov_b32_e32 v2, s0
7606 ; VI-NEXT: v_min_i32_e32 v0, s4, v1
7607 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7608 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7609 ; VI-NEXT: s_waitcnt vmcnt(0)
7610 ; VI-NEXT: buffer_wbinvl1_vol
7611 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7612 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7613 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
7614 ; VI-NEXT: s_cbranch_execnz .LBB104_1
7615 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7616 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
7617 ; VI-NEXT: v_mov_b32_e32 v1, s2
7618 ; VI-NEXT: v_mov_b32_e32 v2, s3
7619 ; VI-NEXT: flat_store_dword v[1:2], v0
7622 ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
7623 ; GFX9: ; %bb.0: ; %entry
7624 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7625 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7626 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7627 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7628 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
7629 ; GFX9-NEXT: s_mov_b32 s0, s3
7630 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
7631 ; GFX9-NEXT: s_add_u32 s0, s4, s0
7632 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
7633 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10
7634 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7635 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7636 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
7637 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
7638 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7639 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7640 ; GFX9-NEXT: v_min_i32_e32 v2, s2, v3
7641 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7642 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] offset:16 glc
7643 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7644 ; GFX9-NEXT: buffer_wbinvl1_vol
7645 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7646 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7647 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7648 ; GFX9-NEXT: s_cbranch_execnz .LBB104_1
7649 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7650 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7651 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7652 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
7653 ; GFX9-NEXT: s_endpgm
7655 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
7656 %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
7657 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst
7658 store i32 %tmp0, ptr addrspace(1) %out2
7662 define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
7663 ; SI-LABEL: atomic_min_i32:
7664 ; SI: ; %bb.0: ; %entry
7665 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7666 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
7667 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7668 ; SI-NEXT: s_load_dword s3, s[4:5], 0x0
7669 ; SI-NEXT: s_mov_b64 s[0:1], 0
7670 ; SI-NEXT: s_mov_b32 s7, 0xf000
7671 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7672 ; SI-NEXT: v_mov_b32_e32 v1, s3
7673 ; SI-NEXT: s_mov_b32 s6, -1
7674 ; SI-NEXT: .LBB105_1: ; %atomicrmw.start
7675 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7676 ; SI-NEXT: v_min_i32_e32 v0, s2, v1
7677 ; SI-NEXT: s_waitcnt expcnt(0)
7678 ; SI-NEXT: v_mov_b32_e32 v3, v1
7679 ; SI-NEXT: v_mov_b32_e32 v2, v0
7680 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7681 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
7682 ; SI-NEXT: s_waitcnt vmcnt(0)
7683 ; SI-NEXT: buffer_wbinvl1
7684 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7685 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7686 ; SI-NEXT: v_mov_b32_e32 v1, v2
7687 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7688 ; SI-NEXT: s_cbranch_execnz .LBB105_1
7689 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7692 ; VI-LABEL: atomic_min_i32:
7693 ; VI: ; %bb.0: ; %entry
7694 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7695 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
7696 ; VI-NEXT: s_mov_b64 s[0:1], 0
7697 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7698 ; VI-NEXT: s_load_dword s5, s[2:3], 0x0
7699 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7700 ; VI-NEXT: v_mov_b32_e32 v1, s5
7701 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start
7702 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7703 ; VI-NEXT: v_mov_b32_e32 v2, s2
7704 ; VI-NEXT: v_min_i32_e32 v0, s4, v1
7705 ; VI-NEXT: v_mov_b32_e32 v3, s3
7706 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7707 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7708 ; VI-NEXT: s_waitcnt vmcnt(0)
7709 ; VI-NEXT: buffer_wbinvl1_vol
7710 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7711 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7712 ; VI-NEXT: v_mov_b32_e32 v1, v0
7713 ; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7714 ; VI-NEXT: s_cbranch_execnz .LBB105_1
7715 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7718 ; GFX9-LABEL: atomic_min_i32:
7719 ; GFX9: ; %bb.0: ; %entry
7720 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7721 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
7722 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
7723 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7724 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7725 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
7726 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7727 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
7728 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
7729 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7730 ; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
7731 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7732 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
7733 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7734 ; GFX9-NEXT: buffer_wbinvl1_vol
7735 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7736 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7737 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
7738 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
7739 ; GFX9-NEXT: s_cbranch_execnz .LBB105_1
7740 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7741 ; GFX9-NEXT: s_endpgm
7743 %tmp0 = atomicrmw min ptr addrspace(1) %out, i32 %in seq_cst
7747 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %index) {
7748 ; SI-LABEL: atomic_min_i32_ret_addr64:
7749 ; SI: ; %bb.0: ; %entry
7750 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
7751 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7752 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7753 ; SI-NEXT: s_ashr_i32 s5, s9, 31
7754 ; SI-NEXT: s_mov_b32 s4, s9
7755 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7756 ; SI-NEXT: s_add_u32 s4, s0, s4
7757 ; SI-NEXT: s_addc_u32 s5, s1, s5
7758 ; SI-NEXT: s_load_dword s6, s[4:5], 0x0
7759 ; SI-NEXT: s_mov_b64 s[0:1], 0
7760 ; SI-NEXT: s_mov_b32 s7, 0xf000
7761 ; SI-NEXT: s_waitcnt lgkmcnt(0)
7762 ; SI-NEXT: v_mov_b32_e32 v1, s6
7763 ; SI-NEXT: s_mov_b32 s6, -1
7764 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start
7765 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7766 ; SI-NEXT: v_min_i32_e32 v0, s8, v1
7767 ; SI-NEXT: s_waitcnt expcnt(0)
7768 ; SI-NEXT: v_mov_b32_e32 v3, v1
7769 ; SI-NEXT: v_mov_b32_e32 v2, v0
7770 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7771 ; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
7772 ; SI-NEXT: s_waitcnt vmcnt(0)
7773 ; SI-NEXT: buffer_wbinvl1
7774 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
7775 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7776 ; SI-NEXT: v_mov_b32_e32 v1, v2
7777 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
7778 ; SI-NEXT: s_cbranch_execnz .LBB106_1
7779 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7780 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
7781 ; SI-NEXT: s_mov_b32 s7, 0xf000
7782 ; SI-NEXT: s_mov_b32 s6, -1
7783 ; SI-NEXT: s_mov_b32 s4, s2
7784 ; SI-NEXT: s_mov_b32 s5, s3
7785 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
7788 ; VI-LABEL: atomic_min_i32_ret_addr64:
7789 ; VI: ; %bb.0: ; %entry
7790 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
7791 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7792 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7793 ; VI-NEXT: s_ashr_i32 s7, s5, 31
7794 ; VI-NEXT: s_mov_b32 s6, s5
7795 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
7796 ; VI-NEXT: s_add_u32 s0, s0, s6
7797 ; VI-NEXT: s_addc_u32 s1, s1, s7
7798 ; VI-NEXT: s_load_dword s5, s[0:1], 0x0
7799 ; VI-NEXT: s_mov_b64 s[6:7], 0
7800 ; VI-NEXT: s_waitcnt lgkmcnt(0)
7801 ; VI-NEXT: v_mov_b32_e32 v0, s5
7802 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start
7803 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7804 ; VI-NEXT: v_mov_b32_e32 v1, v0
7805 ; VI-NEXT: v_mov_b32_e32 v3, s1
7806 ; VI-NEXT: v_mov_b32_e32 v2, s0
7807 ; VI-NEXT: v_min_i32_e32 v0, s4, v1
7808 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7809 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
7810 ; VI-NEXT: s_waitcnt vmcnt(0)
7811 ; VI-NEXT: buffer_wbinvl1_vol
7812 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7813 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7814 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
7815 ; VI-NEXT: s_cbranch_execnz .LBB106_1
7816 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7817 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
7818 ; VI-NEXT: v_mov_b32_e32 v1, s2
7819 ; VI-NEXT: v_mov_b32_e32 v2, s3
7820 ; VI-NEXT: flat_store_dword v[1:2], v0
7823 ; GFX9-LABEL: atomic_min_i32_ret_addr64:
7824 ; GFX9: ; %bb.0: ; %entry
7825 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7826 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7827 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7828 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7829 ; GFX9-NEXT: s_ashr_i32 s1, s3, 31
7830 ; GFX9-NEXT: s_mov_b32 s0, s3
7831 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
7832 ; GFX9-NEXT: s_add_u32 s0, s4, s0
7833 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
7834 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0
7835 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7836 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7837 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
7838 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
7839 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7840 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
7841 ; GFX9-NEXT: v_min_i32_e32 v2, s2, v3
7842 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7843 ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
7844 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7845 ; GFX9-NEXT: buffer_wbinvl1_vol
7846 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
7847 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7848 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7849 ; GFX9-NEXT: s_cbranch_execnz .LBB106_1
7850 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7851 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7852 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7853 ; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
7854 ; GFX9-NEXT: s_endpgm
7856 %ptr = getelementptr i32, ptr addrspace(1) %out, i32 %index
7857 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i32 %in seq_cst
7858 store i32 %tmp0, ptr addrspace(1) %out2
7862 ; ---------------------------------------------------------------------
7863 ; atomicrmw uinc_wrap
7864 ; ---------------------------------------------------------------------
7866 define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
7867 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret:
7869 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7870 ; SI-NEXT: s_mov_b32 s6, 0
7871 ; SI-NEXT: s_mov_b32 s7, 0xf000
7872 ; SI-NEXT: s_mov_b32 s4, s6
7873 ; SI-NEXT: s_mov_b32 s5, s6
7874 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7875 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
7876 ; SI-NEXT: s_waitcnt vmcnt(0)
7877 ; SI-NEXT: buffer_wbinvl1
7878 ; SI-NEXT: s_waitcnt expcnt(0)
7879 ; SI-NEXT: s_setpc_b64 s[30:31]
7881 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
7883 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7884 ; VI-NEXT: flat_atomic_inc v[0:1], v2
7885 ; VI-NEXT: s_waitcnt vmcnt(0)
7886 ; VI-NEXT: buffer_wbinvl1_vol
7887 ; VI-NEXT: s_setpc_b64 s[30:31]
7889 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
7891 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7892 ; GFX9-NEXT: global_atomic_inc v[0:1], v2, off
7893 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7894 ; GFX9-NEXT: buffer_wbinvl1_vol
7895 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7896 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
7900 define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
7901 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
7903 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7904 ; SI-NEXT: s_mov_b32 s6, 0
7905 ; SI-NEXT: s_mov_b32 s7, 0xf000
7906 ; SI-NEXT: s_mov_b32 s4, s6
7907 ; SI-NEXT: s_mov_b32 s5, s6
7908 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7909 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
7910 ; SI-NEXT: s_waitcnt vmcnt(0)
7911 ; SI-NEXT: buffer_wbinvl1
7912 ; SI-NEXT: s_waitcnt expcnt(0)
7913 ; SI-NEXT: s_setpc_b64 s[30:31]
7915 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
7917 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7918 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7919 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7920 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7921 ; VI-NEXT: flat_atomic_inc v[0:1], v2
7922 ; VI-NEXT: s_waitcnt vmcnt(0)
7923 ; VI-NEXT: buffer_wbinvl1_vol
7924 ; VI-NEXT: s_setpc_b64 s[30:31]
7926 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
7928 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7929 ; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16
7930 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7931 ; GFX9-NEXT: buffer_wbinvl1_vol
7932 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7933 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
7934 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
7938 define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
7939 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret:
7941 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7942 ; SI-NEXT: s_mov_b32 s6, 0
7943 ; SI-NEXT: s_mov_b32 s7, 0xf000
7944 ; SI-NEXT: s_mov_b32 s4, s6
7945 ; SI-NEXT: s_mov_b32 s5, s6
7946 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7947 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
7948 ; SI-NEXT: s_waitcnt vmcnt(0)
7949 ; SI-NEXT: buffer_wbinvl1
7950 ; SI-NEXT: v_mov_b32_e32 v0, v2
7951 ; SI-NEXT: s_waitcnt expcnt(0)
7952 ; SI-NEXT: s_setpc_b64 s[30:31]
7954 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
7956 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7957 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7958 ; VI-NEXT: s_waitcnt vmcnt(0)
7959 ; VI-NEXT: buffer_wbinvl1_vol
7960 ; VI-NEXT: s_setpc_b64 s[30:31]
7962 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
7964 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7965 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
7966 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7967 ; GFX9-NEXT: buffer_wbinvl1_vol
7968 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7969 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
7973 define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
7974 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
7976 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7977 ; SI-NEXT: s_mov_b32 s6, 0
7978 ; SI-NEXT: s_mov_b32 s7, 0xf000
7979 ; SI-NEXT: s_mov_b32 s4, s6
7980 ; SI-NEXT: s_mov_b32 s5, s6
7981 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7982 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
7983 ; SI-NEXT: s_waitcnt vmcnt(0)
7984 ; SI-NEXT: buffer_wbinvl1
7985 ; SI-NEXT: v_mov_b32_e32 v0, v2
7986 ; SI-NEXT: s_waitcnt expcnt(0)
7987 ; SI-NEXT: s_setpc_b64 s[30:31]
7989 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
7991 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7992 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7993 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7994 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7995 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7996 ; VI-NEXT: s_waitcnt vmcnt(0)
7997 ; VI-NEXT: buffer_wbinvl1_vol
7998 ; VI-NEXT: s_setpc_b64 s[30:31]
8000 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
8002 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8003 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc
8004 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8005 ; GFX9-NEXT: buffer_wbinvl1_vol
8006 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8007 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8008 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8012 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8013 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8015 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8016 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8017 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8018 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8019 ; SI-NEXT: s_waitcnt expcnt(0)
8020 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8021 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8022 ; SI-NEXT: s_mov_b32 s34, s6
8023 ; SI-NEXT: s_mov_b32 s7, 0xf000
8024 ; SI-NEXT: s_mov_b32 s6, -1
8025 ; SI-NEXT: v_mov_b32_e32 v1, s34
8026 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8027 ; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0
8028 ; SI-NEXT: s_waitcnt vmcnt(0)
8029 ; SI-NEXT: buffer_wbinvl1
8030 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8031 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8032 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8033 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8034 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8035 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8036 ; SI-NEXT: s_setpc_b64 s[30:31]
8038 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8040 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8041 ; VI-NEXT: v_mov_b32_e32 v0, s4
8042 ; VI-NEXT: v_mov_b32_e32 v1, s5
8043 ; VI-NEXT: v_mov_b32_e32 v2, s6
8044 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8045 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8046 ; VI-NEXT: s_waitcnt vmcnt(0)
8047 ; VI-NEXT: buffer_wbinvl1_vol
8048 ; VI-NEXT: s_setpc_b64 s[30:31]
8050 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
8052 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8053 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8054 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8055 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8056 ; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5]
8057 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8058 ; GFX9-NEXT: buffer_wbinvl1_vol
8059 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8060 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8064 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8065 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8067 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8068 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8069 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8070 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8071 ; SI-NEXT: s_waitcnt expcnt(0)
8072 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8073 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8074 ; SI-NEXT: s_mov_b32 s34, s6
8075 ; SI-NEXT: s_mov_b32 s7, 0xf000
8076 ; SI-NEXT: s_mov_b32 s6, -1
8077 ; SI-NEXT: v_mov_b32_e32 v1, s34
8078 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8079 ; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 offset:16
8080 ; SI-NEXT: s_waitcnt vmcnt(0)
8081 ; SI-NEXT: buffer_wbinvl1
8082 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8083 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8084 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8085 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8086 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8087 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8088 ; SI-NEXT: s_setpc_b64 s[30:31]
8090 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8092 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8093 ; VI-NEXT: s_add_u32 s34, s4, 16
8094 ; VI-NEXT: s_addc_u32 s35, s5, 0
8095 ; VI-NEXT: v_mov_b32_e32 v0, s34
8096 ; VI-NEXT: v_mov_b32_e32 v1, s35
8097 ; VI-NEXT: v_mov_b32_e32 v2, s6
8098 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8099 ; VI-NEXT: flat_atomic_inc v[0:1], v2
8100 ; VI-NEXT: s_waitcnt vmcnt(0)
8101 ; VI-NEXT: buffer_wbinvl1_vol
8102 ; VI-NEXT: s_setpc_b64 s[30:31]
8104 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
8106 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8107 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8108 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8109 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8110 ; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16
8111 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8112 ; GFX9-NEXT: buffer_wbinvl1_vol
8113 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8114 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8115 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8119 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8120 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8122 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8123 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8124 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8125 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8126 ; SI-NEXT: s_waitcnt expcnt(0)
8127 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8128 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8129 ; SI-NEXT: s_mov_b32 s34, s6
8130 ; SI-NEXT: s_mov_b32 s7, 0xf000
8131 ; SI-NEXT: s_mov_b32 s6, -1
8132 ; SI-NEXT: v_mov_b32_e32 v0, s34
8133 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8134 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc
8135 ; SI-NEXT: s_waitcnt vmcnt(0)
8136 ; SI-NEXT: buffer_wbinvl1
8137 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8138 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8139 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8140 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8141 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8142 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8143 ; SI-NEXT: s_setpc_b64 s[30:31]
8145 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8147 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8148 ; VI-NEXT: v_mov_b32_e32 v0, s4
8149 ; VI-NEXT: v_mov_b32_e32 v1, s5
8150 ; VI-NEXT: v_mov_b32_e32 v2, s6
8151 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8152 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8153 ; VI-NEXT: s_waitcnt vmcnt(0)
8154 ; VI-NEXT: buffer_wbinvl1_vol
8155 ; VI-NEXT: s_setpc_b64 s[30:31]
8157 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
8159 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8160 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8161 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8162 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8163 ; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc
8164 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8165 ; GFX9-NEXT: buffer_wbinvl1_vol
8166 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8167 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8171 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8172 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8174 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8175 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8176 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8177 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8178 ; SI-NEXT: s_waitcnt expcnt(0)
8179 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8180 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8181 ; SI-NEXT: s_mov_b32 s34, s6
8182 ; SI-NEXT: s_mov_b32 s7, 0xf000
8183 ; SI-NEXT: s_mov_b32 s6, -1
8184 ; SI-NEXT: v_mov_b32_e32 v0, s34
8185 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8186 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
8187 ; SI-NEXT: s_waitcnt vmcnt(0)
8188 ; SI-NEXT: buffer_wbinvl1
8189 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8190 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8191 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8192 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8193 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8194 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8195 ; SI-NEXT: s_setpc_b64 s[30:31]
8197 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8199 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8200 ; VI-NEXT: s_add_u32 s34, s4, 16
8201 ; VI-NEXT: s_addc_u32 s35, s5, 0
8202 ; VI-NEXT: v_mov_b32_e32 v0, s34
8203 ; VI-NEXT: v_mov_b32_e32 v1, s35
8204 ; VI-NEXT: v_mov_b32_e32 v2, s6
8205 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8206 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
8207 ; VI-NEXT: s_waitcnt vmcnt(0)
8208 ; VI-NEXT: buffer_wbinvl1_vol
8209 ; VI-NEXT: s_setpc_b64 s[30:31]
8211 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
8213 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8214 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8215 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8216 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8217 ; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
8218 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8219 ; GFX9-NEXT: buffer_wbinvl1_vol
8220 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8221 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8222 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8226 ; ---------------------------------------------------------------------
8227 ; atomicrmw udec_wrap
8228 ; ---------------------------------------------------------------------
8230 define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
8231 ; SI-LABEL: global_atomic_udec_wrap_i32_noret:
8233 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8234 ; SI-NEXT: s_mov_b32 s6, 0
8235 ; SI-NEXT: s_mov_b32 s7, 0xf000
8236 ; SI-NEXT: s_mov_b32 s4, s6
8237 ; SI-NEXT: s_mov_b32 s5, s6
8238 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8239 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
8240 ; SI-NEXT: s_waitcnt vmcnt(0)
8241 ; SI-NEXT: buffer_wbinvl1
8242 ; SI-NEXT: s_waitcnt expcnt(0)
8243 ; SI-NEXT: s_setpc_b64 s[30:31]
8245 ; VI-LABEL: global_atomic_udec_wrap_i32_noret:
8247 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8248 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8249 ; VI-NEXT: s_waitcnt vmcnt(0)
8250 ; VI-NEXT: buffer_wbinvl1_vol
8251 ; VI-NEXT: s_setpc_b64 s[30:31]
8253 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
8255 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8256 ; GFX9-NEXT: global_atomic_dec v[0:1], v2, off
8257 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8258 ; GFX9-NEXT: buffer_wbinvl1_vol
8259 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8260 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8264 define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
8265 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8267 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8268 ; SI-NEXT: s_mov_b32 s6, 0
8269 ; SI-NEXT: s_mov_b32 s7, 0xf000
8270 ; SI-NEXT: s_mov_b32 s4, s6
8271 ; SI-NEXT: s_mov_b32 s5, s6
8272 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8273 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
8274 ; SI-NEXT: s_waitcnt vmcnt(0)
8275 ; SI-NEXT: buffer_wbinvl1
8276 ; SI-NEXT: s_waitcnt expcnt(0)
8277 ; SI-NEXT: s_setpc_b64 s[30:31]
8279 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8281 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8282 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8283 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8284 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8285 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8286 ; VI-NEXT: s_waitcnt vmcnt(0)
8287 ; VI-NEXT: buffer_wbinvl1_vol
8288 ; VI-NEXT: s_setpc_b64 s[30:31]
8290 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
8292 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8293 ; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
8294 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8295 ; GFX9-NEXT: buffer_wbinvl1_vol
8296 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8297 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8298 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8302 define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
8303 ; SI-LABEL: global_atomic_udec_wrap_i32_ret:
8305 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8306 ; SI-NEXT: s_mov_b32 s6, 0
8307 ; SI-NEXT: s_mov_b32 s7, 0xf000
8308 ; SI-NEXT: s_mov_b32 s4, s6
8309 ; SI-NEXT: s_mov_b32 s5, s6
8310 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8311 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
8312 ; SI-NEXT: s_waitcnt vmcnt(0)
8313 ; SI-NEXT: buffer_wbinvl1
8314 ; SI-NEXT: v_mov_b32_e32 v0, v2
8315 ; SI-NEXT: s_waitcnt expcnt(0)
8316 ; SI-NEXT: s_setpc_b64 s[30:31]
8318 ; VI-LABEL: global_atomic_udec_wrap_i32_ret:
8320 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8321 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8322 ; VI-NEXT: s_waitcnt vmcnt(0)
8323 ; VI-NEXT: buffer_wbinvl1_vol
8324 ; VI-NEXT: s_setpc_b64 s[30:31]
8326 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
8328 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8329 ; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc
8330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8331 ; GFX9-NEXT: buffer_wbinvl1_vol
8332 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8333 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8337 define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
8338 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8341 ; SI-NEXT: s_mov_b32 s6, 0
8342 ; SI-NEXT: s_mov_b32 s7, 0xf000
8343 ; SI-NEXT: s_mov_b32 s4, s6
8344 ; SI-NEXT: s_mov_b32 s5, s6
8345 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8346 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
8347 ; SI-NEXT: s_waitcnt vmcnt(0)
8348 ; SI-NEXT: buffer_wbinvl1
8349 ; SI-NEXT: v_mov_b32_e32 v0, v2
8350 ; SI-NEXT: s_waitcnt expcnt(0)
8351 ; SI-NEXT: s_setpc_b64 s[30:31]
8353 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8355 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8356 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
8357 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8358 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8359 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8360 ; VI-NEXT: s_waitcnt vmcnt(0)
8361 ; VI-NEXT: buffer_wbinvl1_vol
8362 ; VI-NEXT: s_setpc_b64 s[30:31]
8364 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
8366 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8367 ; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
8368 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8369 ; GFX9-NEXT: buffer_wbinvl1_vol
8370 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8371 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8372 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8376 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8377 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
8379 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8380 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8381 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8382 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8383 ; SI-NEXT: s_waitcnt expcnt(0)
8384 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8385 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8386 ; SI-NEXT: s_mov_b32 s34, s6
8387 ; SI-NEXT: s_mov_b32 s7, 0xf000
8388 ; SI-NEXT: s_mov_b32 s6, -1
8389 ; SI-NEXT: v_mov_b32_e32 v1, s34
8390 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8391 ; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0
8392 ; SI-NEXT: s_waitcnt vmcnt(0)
8393 ; SI-NEXT: buffer_wbinvl1
8394 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8395 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8396 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8397 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8398 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8399 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8400 ; SI-NEXT: s_setpc_b64 s[30:31]
8402 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
8404 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8405 ; VI-NEXT: v_mov_b32_e32 v0, s4
8406 ; VI-NEXT: v_mov_b32_e32 v1, s5
8407 ; VI-NEXT: v_mov_b32_e32 v2, s6
8408 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8409 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8410 ; VI-NEXT: s_waitcnt vmcnt(0)
8411 ; VI-NEXT: buffer_wbinvl1_vol
8412 ; VI-NEXT: s_setpc_b64 s[30:31]
8414 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
8416 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8417 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8418 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8419 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8420 ; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5]
8421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8422 ; GFX9-NEXT: buffer_wbinvl1_vol
8423 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8424 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8428 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8429 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
8431 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8432 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8433 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8434 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8435 ; SI-NEXT: s_waitcnt expcnt(0)
8436 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8437 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8438 ; SI-NEXT: s_mov_b32 s34, s6
8439 ; SI-NEXT: s_mov_b32 s7, 0xf000
8440 ; SI-NEXT: s_mov_b32 s6, -1
8441 ; SI-NEXT: v_mov_b32_e32 v1, s34
8442 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8443 ; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 offset:16
8444 ; SI-NEXT: s_waitcnt vmcnt(0)
8445 ; SI-NEXT: buffer_wbinvl1
8446 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8447 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8448 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8449 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8450 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8451 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8452 ; SI-NEXT: s_setpc_b64 s[30:31]
8454 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
8456 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8457 ; VI-NEXT: s_add_u32 s34, s4, 16
8458 ; VI-NEXT: s_addc_u32 s35, s5, 0
8459 ; VI-NEXT: v_mov_b32_e32 v0, s34
8460 ; VI-NEXT: v_mov_b32_e32 v1, s35
8461 ; VI-NEXT: v_mov_b32_e32 v2, s6
8462 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8463 ; VI-NEXT: flat_atomic_dec v[0:1], v2
8464 ; VI-NEXT: s_waitcnt vmcnt(0)
8465 ; VI-NEXT: buffer_wbinvl1_vol
8466 ; VI-NEXT: s_setpc_b64 s[30:31]
8468 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
8470 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8471 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8472 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8473 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8474 ; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16
8475 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8476 ; GFX9-NEXT: buffer_wbinvl1_vol
8477 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8478 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8479 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
8483 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
8484 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
8486 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8487 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8488 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8489 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8490 ; SI-NEXT: s_waitcnt expcnt(0)
8491 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8492 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8493 ; SI-NEXT: s_mov_b32 s34, s6
8494 ; SI-NEXT: s_mov_b32 s7, 0xf000
8495 ; SI-NEXT: s_mov_b32 s6, -1
8496 ; SI-NEXT: v_mov_b32_e32 v0, s34
8497 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8498 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc
8499 ; SI-NEXT: s_waitcnt vmcnt(0)
8500 ; SI-NEXT: buffer_wbinvl1
8501 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8502 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8503 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8504 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8505 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8506 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8507 ; SI-NEXT: s_setpc_b64 s[30:31]
8509 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
8511 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8512 ; VI-NEXT: v_mov_b32_e32 v0, s4
8513 ; VI-NEXT: v_mov_b32_e32 v1, s5
8514 ; VI-NEXT: v_mov_b32_e32 v2, s6
8515 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8516 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8517 ; VI-NEXT: s_waitcnt vmcnt(0)
8518 ; VI-NEXT: buffer_wbinvl1_vol
8519 ; VI-NEXT: s_setpc_b64 s[30:31]
8521 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
8523 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8524 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8525 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8526 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8527 ; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc
8528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8529 ; GFX9-NEXT: buffer_wbinvl1_vol
8530 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8531 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
8535 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
8536 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
8538 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8539 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8540 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
8541 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8542 ; SI-NEXT: s_waitcnt expcnt(0)
8543 ; SI-NEXT: v_writelane_b32 v1, s6, 0
8544 ; SI-NEXT: v_writelane_b32 v1, s7, 1
8545 ; SI-NEXT: s_mov_b32 s34, s6
8546 ; SI-NEXT: s_mov_b32 s7, 0xf000
8547 ; SI-NEXT: s_mov_b32 s6, -1
8548 ; SI-NEXT: v_mov_b32_e32 v0, s34
8549 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8550 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
8551 ; SI-NEXT: s_waitcnt vmcnt(0)
8552 ; SI-NEXT: buffer_wbinvl1
8553 ; SI-NEXT: v_readlane_b32 s7, v1, 1
8554 ; SI-NEXT: v_readlane_b32 s6, v1, 0
8555 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8556 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
8557 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8558 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8559 ; SI-NEXT: s_setpc_b64 s[30:31]
8561 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
8563 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8564 ; VI-NEXT: s_add_u32 s34, s4, 16
8565 ; VI-NEXT: s_addc_u32 s35, s5, 0
8566 ; VI-NEXT: v_mov_b32_e32 v0, s34
8567 ; VI-NEXT: v_mov_b32_e32 v1, s35
8568 ; VI-NEXT: v_mov_b32_e32 v2, s6
8569 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8570 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
8571 ; VI-NEXT: s_waitcnt vmcnt(0)
8572 ; VI-NEXT: buffer_wbinvl1_vol
8573 ; VI-NEXT: s_setpc_b64 s[30:31]
8575 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
8577 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8578 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8579 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8580 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8581 ; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
8582 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8583 ; GFX9-NEXT: buffer_wbinvl1_vol
8584 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8585 %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
8586 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst