1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
; seq_cst exchange of an i64 through a global (addrspace 1) pointer. The
; function returns void, and none of the expected swap instructions below
; carries the glc modifier.
; Expected selection per run line: buffer_atomic_swap_x2 with addr64 for the
; default target, flat_atomic_swap_x2 for tonga, global_atomic_swap_x2 for
; gfx900. In every case the address is taken from v[0:1] and the data from
; v[2:3].
10 define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
11 ; SI-LABEL: global_atomic_xchg_i64_noret:
13 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; SI-NEXT: s_mov_b32 s6, 0
15 ; SI-NEXT: s_mov_b32 s7, 0xf000
16 ; SI-NEXT: s_mov_b32 s4, s6
17 ; SI-NEXT: s_mov_b32 s5, s6
18 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64
20 ; SI-NEXT: s_waitcnt vmcnt(0)
21 ; SI-NEXT: buffer_wbinvl1
22 ; SI-NEXT: s_waitcnt expcnt(0)
23 ; SI-NEXT: s_setpc_b64 s[30:31]
25 ; VI-LABEL: global_atomic_xchg_i64_noret:
27 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
29 ; VI-NEXT: s_waitcnt vmcnt(0)
30 ; VI-NEXT: buffer_wbinvl1_vol
31 ; VI-NEXT: s_setpc_b64 s[30:31]
33 ; GFX9-LABEL: global_atomic_xchg_i64_noret:
35 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off
37 ; GFX9-NEXT: s_waitcnt vmcnt(0)
38 ; GFX9-NEXT: buffer_wbinvl1_vol
39 ; GFX9-NEXT: s_setpc_b64 s[30:31]
40 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
; Same exchange as the plain noret case, but the address is %out advanced by
; four i64 elements (32 bytes) via getelementptr.
; The expected output shows the 32-byte displacement folded into the
; instruction as offset:32 for the default target and gfx900, while tonga
; adds 32 to the 64-bit address with a v_add_u32 / v_addc_u32 pair before
; the flat atomic.
44 define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
45 ; SI-LABEL: global_atomic_xchg_i64_noret_offset:
47 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; SI-NEXT: s_mov_b32 s6, 0
49 ; SI-NEXT: s_mov_b32 s7, 0xf000
50 ; SI-NEXT: s_mov_b32 s4, s6
51 ; SI-NEXT: s_mov_b32 s5, s6
52 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
53 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
54 ; SI-NEXT: s_waitcnt vmcnt(0)
55 ; SI-NEXT: buffer_wbinvl1
56 ; SI-NEXT: s_waitcnt expcnt(0)
57 ; SI-NEXT: s_setpc_b64 s[30:31]
59 ; VI-LABEL: global_atomic_xchg_i64_noret_offset:
61 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
63 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
64 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
65 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
66 ; VI-NEXT: s_waitcnt vmcnt(0)
67 ; VI-NEXT: buffer_wbinvl1_vol
68 ; VI-NEXT: s_setpc_b64 s[30:31]
70 ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset:
72 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
74 ; GFX9-NEXT: s_waitcnt vmcnt(0)
75 ; GFX9-NEXT: buffer_wbinvl1_vol
76 ; GFX9-NEXT: s_setpc_b64 s[30:31]
77 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
78 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
; Returning form of the i64 global exchange: the function is declared to
; return i64 and the atomicrmw result is named %result.
; Every expected swap instruction carries the glc modifier. For the default
; target the result lands in v[2:3] and is copied to v[0:1]; for tonga and
; gfx900 the instruction writes v[0:1] directly.
82 define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
83 ; SI-LABEL: global_atomic_xchg_i64_ret:
85 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86 ; SI-NEXT: s_mov_b32 s6, 0
87 ; SI-NEXT: s_mov_b32 s7, 0xf000
88 ; SI-NEXT: s_mov_b32 s4, s6
89 ; SI-NEXT: s_mov_b32 s5, s6
90 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
91 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
92 ; SI-NEXT: s_waitcnt vmcnt(0)
93 ; SI-NEXT: buffer_wbinvl1
94 ; SI-NEXT: v_mov_b32_e32 v0, v2
95 ; SI-NEXT: v_mov_b32_e32 v1, v3
96 ; SI-NEXT: s_waitcnt expcnt(0)
97 ; SI-NEXT: s_setpc_b64 s[30:31]
99 ; VI-LABEL: global_atomic_xchg_i64_ret:
101 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
103 ; VI-NEXT: s_waitcnt vmcnt(0)
104 ; VI-NEXT: buffer_wbinvl1_vol
105 ; VI-NEXT: s_setpc_b64 s[30:31]
107 ; GFX9-LABEL: global_atomic_xchg_i64_ret:
109 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
111 ; GFX9-NEXT: s_waitcnt vmcnt(0)
112 ; GFX9-NEXT: buffer_wbinvl1_vol
113 ; GFX9-NEXT: s_setpc_b64 s[30:31]
114 %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning i64 global exchange at %out + 4 elements (32 bytes).
; Combines the two variations covered separately above: the expected swap
; instructions carry glc, and the 32-byte displacement appears as offset:32
; for the default target and gfx900 but as an explicit 64-bit add for tonga.
118 define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
119 ; SI-LABEL: global_atomic_xchg_i64_ret_offset:
121 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122 ; SI-NEXT: s_mov_b32 s6, 0
123 ; SI-NEXT: s_mov_b32 s7, 0xf000
124 ; SI-NEXT: s_mov_b32 s4, s6
125 ; SI-NEXT: s_mov_b32 s5, s6
126 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
127 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
128 ; SI-NEXT: s_waitcnt vmcnt(0)
129 ; SI-NEXT: buffer_wbinvl1
130 ; SI-NEXT: v_mov_b32_e32 v0, v2
131 ; SI-NEXT: v_mov_b32_e32 v1, v3
132 ; SI-NEXT: s_waitcnt expcnt(0)
133 ; SI-NEXT: s_setpc_b64 s[30:31]
135 ; VI-LABEL: global_atomic_xchg_i64_ret_offset:
137 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
139 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
140 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
141 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
142 ; VI-NEXT: s_waitcnt vmcnt(0)
143 ; VI-NEXT: buffer_wbinvl1_vol
144 ; VI-NEXT: s_setpc_b64 s[30:31]
146 ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset:
148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
150 ; GFX9-NEXT: s_waitcnt vmcnt(0)
151 ; GFX9-NEXT: buffer_wbinvl1_vol
152 ; GFX9-NEXT: s_setpc_b64 s[30:31]
153 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
154 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
; Non-returning i64 global exchange with both arguments marked inreg under
; the amdgpu_gfx calling convention. In the expected output the pointer is
; read from s[4:5] and the value from s[6:7].
; Default target: s6/s7 are overwritten with -1 / 0xf000 so that s[4:7] can
; serve as the buffer instruction's resource operand. Their incoming values
; are first parked in lanes 0 and 1 of v0 with v_writelane and restored with
; v_readlane; v0 itself is spilled to and reloaded from the stack.
; tonga: address and data are copied into VGPRs for the flat atomic.
; gfx900: s[4:5] is used directly as the scalar base, with a zero VGPR offset.
158 define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
159 ; SI-LABEL: global_atomic_xchg_i64_noret_scalar:
161 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
163 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
164 ; SI-NEXT: s_mov_b64 exec, s[34:35]
165 ; SI-NEXT: s_waitcnt expcnt(0)
166 ; SI-NEXT: v_writelane_b32 v0, s6, 0
167 ; SI-NEXT: v_writelane_b32 v0, s7, 1
168 ; SI-NEXT: s_mov_b32 s34, s7
169 ; SI-NEXT: s_mov_b32 s35, s6
170 ; SI-NEXT: s_mov_b32 s7, 0xf000
171 ; SI-NEXT: s_mov_b32 s6, -1
172 ; SI-NEXT: v_mov_b32_e32 v1, s35
173 ; SI-NEXT: v_mov_b32_e32 v2, s34
174 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
175 ; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
176 ; SI-NEXT: s_waitcnt vmcnt(0)
177 ; SI-NEXT: buffer_wbinvl1
178 ; SI-NEXT: v_readlane_b32 s7, v0, 1
179 ; SI-NEXT: v_readlane_b32 s6, v0, 0
180 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
181 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
182 ; SI-NEXT: s_mov_b64 exec, s[34:35]
183 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
184 ; SI-NEXT: s_setpc_b64 s[30:31]
186 ; VI-LABEL: global_atomic_xchg_i64_noret_scalar:
188 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; VI-NEXT: v_mov_b32_e32 v0, s6
190 ; VI-NEXT: v_mov_b32_e32 v1, s7
191 ; VI-NEXT: v_mov_b32_e32 v2, s4
192 ; VI-NEXT: v_mov_b32_e32 v3, s5
193 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
194 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
195 ; VI-NEXT: s_waitcnt vmcnt(0)
196 ; VI-NEXT: buffer_wbinvl1_vol
197 ; VI-NEXT: s_setpc_b64 s[30:31]
199 ; GFX9-LABEL: global_atomic_xchg_i64_noret_scalar:
201 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
203 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
204 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
205 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
206 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5]
207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
208 ; GFX9-NEXT: buffer_wbinvl1_vol
209 ; GFX9-NEXT: s_setpc_b64 s[30:31]
210 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
; Non-returning i64 global exchange with inreg (amdgpu_gfx) arguments, at
; %out + 4 elements (32 bytes).
; Default target and gfx900: the displacement is encoded as offset:32 on the
; atomic itself.
; tonga: the 32-byte add is performed on the scalar pointer with
; s_add_u32 / s_addc_u32 into s[34:35], which is then copied to VGPRs for the
; flat atomic.
; As in the non-offset scalar case, the default target saves s6/s7 in lanes
; of v0 because it reuses s[4:7] as the buffer resource operand.
214 define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
215 ; SI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
217 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
219 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
220 ; SI-NEXT: s_mov_b64 exec, s[34:35]
221 ; SI-NEXT: s_waitcnt expcnt(0)
222 ; SI-NEXT: v_writelane_b32 v0, s6, 0
223 ; SI-NEXT: v_writelane_b32 v0, s7, 1
224 ; SI-NEXT: v_mov_b32_e32 v1, s6
225 ; SI-NEXT: v_mov_b32_e32 v2, s7
226 ; SI-NEXT: s_mov_b32 s7, 0xf000
227 ; SI-NEXT: s_mov_b32 s6, -1
228 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
229 ; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
230 ; SI-NEXT: s_waitcnt vmcnt(0)
231 ; SI-NEXT: buffer_wbinvl1
232 ; SI-NEXT: v_readlane_b32 s7, v0, 1
233 ; SI-NEXT: v_readlane_b32 s6, v0, 0
234 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
235 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
236 ; SI-NEXT: s_mov_b64 exec, s[34:35]
237 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
238 ; SI-NEXT: s_setpc_b64 s[30:31]
240 ; VI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
242 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243 ; VI-NEXT: s_add_u32 s34, s4, 32
244 ; VI-NEXT: s_addc_u32 s35, s5, 0
245 ; VI-NEXT: v_mov_b32_e32 v2, s34
246 ; VI-NEXT: v_mov_b32_e32 v0, s6
247 ; VI-NEXT: v_mov_b32_e32 v1, s7
248 ; VI-NEXT: v_mov_b32_e32 v3, s35
249 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
250 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
251 ; VI-NEXT: s_waitcnt vmcnt(0)
252 ; VI-NEXT: buffer_wbinvl1_vol
253 ; VI-NEXT: s_setpc_b64 s[30:31]
255 ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
257 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
259 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
260 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
261 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
262 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
263 ; GFX9-NEXT: s_waitcnt vmcnt(0)
264 ; GFX9-NEXT: buffer_wbinvl1_vol
265 ; GFX9-NEXT: s_setpc_b64 s[30:31]
266 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
267 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
; Returning i64 global exchange with inreg (amdgpu_gfx) arguments.
; All expected swap instructions carry glc and write their result to v[0:1].
; On the default target the VGPR used to hold the saved s6/s7 lanes is v2
; rather than v0, since v[0:1] carries the data operand and the result.
271 define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
272 ; SI-LABEL: global_atomic_xchg_i64_ret_scalar:
274 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
275 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
276 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
277 ; SI-NEXT: s_mov_b64 exec, s[34:35]
278 ; SI-NEXT: s_waitcnt expcnt(0)
279 ; SI-NEXT: v_writelane_b32 v2, s6, 0
280 ; SI-NEXT: v_writelane_b32 v2, s7, 1
281 ; SI-NEXT: s_mov_b32 s34, s7
282 ; SI-NEXT: s_mov_b32 s35, s6
283 ; SI-NEXT: s_mov_b32 s7, 0xf000
284 ; SI-NEXT: s_mov_b32 s6, -1
285 ; SI-NEXT: v_mov_b32_e32 v0, s35
286 ; SI-NEXT: v_mov_b32_e32 v1, s34
287 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
288 ; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc
289 ; SI-NEXT: s_waitcnt vmcnt(0)
290 ; SI-NEXT: buffer_wbinvl1
291 ; SI-NEXT: v_readlane_b32 s7, v2, 1
292 ; SI-NEXT: v_readlane_b32 s6, v2, 0
293 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
294 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
295 ; SI-NEXT: s_mov_b64 exec, s[34:35]
296 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
297 ; SI-NEXT: s_setpc_b64 s[30:31]
299 ; VI-LABEL: global_atomic_xchg_i64_ret_scalar:
301 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302 ; VI-NEXT: v_mov_b32_e32 v0, s6
303 ; VI-NEXT: v_mov_b32_e32 v1, s7
304 ; VI-NEXT: v_mov_b32_e32 v2, s4
305 ; VI-NEXT: v_mov_b32_e32 v3, s5
306 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
307 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
308 ; VI-NEXT: s_waitcnt vmcnt(0)
309 ; VI-NEXT: buffer_wbinvl1_vol
310 ; VI-NEXT: s_setpc_b64 s[30:31]
312 ; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar:
314 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
315 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
316 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
317 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
318 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
319 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
321 ; GFX9-NEXT: buffer_wbinvl1_vol
322 ; GFX9-NEXT: s_setpc_b64 s[30:31]
323 %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning i64 global exchange with inreg (amdgpu_gfx) arguments, at
; %out + 4 elements (32 bytes).
; The expected swap instructions carry glc and return in v[0:1]. The 32-byte
; displacement is an instruction offset for the default target and gfx900,
; and a scalar s_add_u32 / s_addc_u32 pair for tonga.
327 define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
328 ; SI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
330 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
332 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
333 ; SI-NEXT: s_mov_b64 exec, s[34:35]
334 ; SI-NEXT: s_waitcnt expcnt(0)
335 ; SI-NEXT: v_writelane_b32 v2, s6, 0
336 ; SI-NEXT: v_writelane_b32 v2, s7, 1
337 ; SI-NEXT: v_mov_b32_e32 v0, s6
338 ; SI-NEXT: v_mov_b32_e32 v1, s7
339 ; SI-NEXT: s_mov_b32 s7, 0xf000
340 ; SI-NEXT: s_mov_b32 s6, -1
341 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
342 ; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
343 ; SI-NEXT: s_waitcnt vmcnt(0)
344 ; SI-NEXT: buffer_wbinvl1
345 ; SI-NEXT: v_readlane_b32 s7, v2, 1
346 ; SI-NEXT: v_readlane_b32 s6, v2, 0
347 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
348 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
349 ; SI-NEXT: s_mov_b64 exec, s[34:35]
350 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
351 ; SI-NEXT: s_setpc_b64 s[30:31]
353 ; VI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
355 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
356 ; VI-NEXT: s_add_u32 s34, s4, 32
357 ; VI-NEXT: s_addc_u32 s35, s5, 0
358 ; VI-NEXT: v_mov_b32_e32 v2, s34
359 ; VI-NEXT: v_mov_b32_e32 v0, s6
360 ; VI-NEXT: v_mov_b32_e32 v1, s7
361 ; VI-NEXT: v_mov_b32_e32 v3, s35
362 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
363 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
364 ; VI-NEXT: s_waitcnt vmcnt(0)
365 ; VI-NEXT: buffer_wbinvl1_vol
366 ; VI-NEXT: s_setpc_b64 s[30:31]
368 ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
370 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
372 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
373 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
374 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
376 ; GFX9-NEXT: s_waitcnt vmcnt(0)
377 ; GFX9-NEXT: buffer_wbinvl1_vol
378 ; GFX9-NEXT: s_setpc_b64 s[30:31]
379 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
380 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
384 ; ---------------------------------------------------------------------
386 ; ---------------------------------------------------------------------
; seq_cst exchange of a double through a global (addrspace 1) pointer; the
; function returns void.
; The expected output is a single 64-bit swap instruction with no glc
; modifier: buffer_atomic_swap_x2 with addr64 for the default target,
; flat_atomic_swap_x2 for tonga and global_atomic_swap_x2 for gfx900.
; Only the three check prefixes enabled by the run lines of this file are
; kept here.
388 define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
448 ; SI-LABEL: global_atomic_xchg_f64_noret:
450 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451 ; SI-NEXT: s_mov_b32 s6, 0
452 ; SI-NEXT: s_mov_b32 s7, 0xf000
453 ; SI-NEXT: s_mov_b32 s4, s6
454 ; SI-NEXT: s_mov_b32 s5, s6
455 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
456 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64
457 ; SI-NEXT: s_waitcnt vmcnt(0)
458 ; SI-NEXT: buffer_wbinvl1
459 ; SI-NEXT: s_waitcnt expcnt(0)
460 ; SI-NEXT: s_setpc_b64 s[30:31]
462 ; VI-LABEL: global_atomic_xchg_f64_noret:
464 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
466 ; VI-NEXT: s_waitcnt vmcnt(0)
467 ; VI-NEXT: buffer_wbinvl1_vol
468 ; VI-NEXT: s_setpc_b64 s[30:31]
470 ; GFX9-LABEL: global_atomic_xchg_f64_noret:
472 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off
474 ; GFX9-NEXT: s_waitcnt vmcnt(0)
475 ; GFX9-NEXT: buffer_wbinvl1_vol
476 ; GFX9-NEXT: s_setpc_b64 s[30:31]
477 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
; Non-returning double exchange at %out + 4 double elements (32 bytes).
; The expected output folds the displacement as offset:32 for the default
; target and gfx900; tonga adds 32 to the 64-bit address with
; v_add_u32 / v_addc_u32 before the flat atomic.
; Only the three check prefixes enabled by the run lines of this file are
; kept here.
481 define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) {
545 ; SI-LABEL: global_atomic_xchg_f64_noret_offset:
547 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548 ; SI-NEXT: s_mov_b32 s6, 0
549 ; SI-NEXT: s_mov_b32 s7, 0xf000
550 ; SI-NEXT: s_mov_b32 s4, s6
551 ; SI-NEXT: s_mov_b32 s5, s6
552 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
553 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
554 ; SI-NEXT: s_waitcnt vmcnt(0)
555 ; SI-NEXT: buffer_wbinvl1
556 ; SI-NEXT: s_waitcnt expcnt(0)
557 ; SI-NEXT: s_setpc_b64 s[30:31]
559 ; VI-LABEL: global_atomic_xchg_f64_noret_offset:
561 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
563 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
564 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
565 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
566 ; VI-NEXT: s_waitcnt vmcnt(0)
567 ; VI-NEXT: buffer_wbinvl1_vol
568 ; VI-NEXT: s_setpc_b64 s[30:31]
570 ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset:
572 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
574 ; GFX9-NEXT: s_waitcnt vmcnt(0)
575 ; GFX9-NEXT: buffer_wbinvl1_vol
576 ; GFX9-NEXT: s_setpc_b64 s[30:31]
577 %gep = getelementptr double, ptr addrspace(1) %out, i32 4
578 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
; Returning double exchange through a global (addrspace 1) pointer.
; Every expected swap instruction carries glc. The default target receives
; the result in v[2:3] and copies it to v[0:1]; tonga and gfx900 write
; v[0:1] directly.
; Only the three check prefixes enabled by the run lines of this file are
; kept here.
582 define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
648 ; SI-LABEL: global_atomic_xchg_f64_ret:
650 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651 ; SI-NEXT: s_mov_b32 s6, 0
652 ; SI-NEXT: s_mov_b32 s7, 0xf000
653 ; SI-NEXT: s_mov_b32 s4, s6
654 ; SI-NEXT: s_mov_b32 s5, s6
655 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
656 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
657 ; SI-NEXT: s_waitcnt vmcnt(0)
658 ; SI-NEXT: buffer_wbinvl1
659 ; SI-NEXT: v_mov_b32_e32 v0, v2
660 ; SI-NEXT: v_mov_b32_e32 v1, v3
661 ; SI-NEXT: s_waitcnt expcnt(0)
662 ; SI-NEXT: s_setpc_b64 s[30:31]
664 ; VI-LABEL: global_atomic_xchg_f64_ret:
666 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
668 ; VI-NEXT: s_waitcnt vmcnt(0)
669 ; VI-NEXT: buffer_wbinvl1_vol
670 ; VI-NEXT: s_setpc_b64 s[30:31]
672 ; GFX9-LABEL: global_atomic_xchg_f64_ret:
674 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
675 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
676 ; GFX9-NEXT: s_waitcnt vmcnt(0)
677 ; GFX9-NEXT: buffer_wbinvl1_vol
678 ; GFX9-NEXT: s_setpc_b64 s[30:31]
679 %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
; Returning double exchange at %out + 4 double elements (32 bytes).
; The expected swap instructions carry glc. The displacement is offset:32
; for the default target and gfx900, and an explicit
; v_add_u32 / v_addc_u32 pair for tonga.
; Only the three check prefixes enabled by the run lines of this file are
; kept here.
683 define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) {
751 ; SI-LABEL: global_atomic_xchg_f64_ret_offset:
753 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754 ; SI-NEXT: s_mov_b32 s6, 0
755 ; SI-NEXT: s_mov_b32 s7, 0xf000
756 ; SI-NEXT: s_mov_b32 s4, s6
757 ; SI-NEXT: s_mov_b32 s5, s6
758 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
759 ; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
760 ; SI-NEXT: s_waitcnt vmcnt(0)
761 ; SI-NEXT: buffer_wbinvl1
762 ; SI-NEXT: v_mov_b32_e32 v0, v2
763 ; SI-NEXT: v_mov_b32_e32 v1, v3
764 ; SI-NEXT: s_waitcnt expcnt(0)
765 ; SI-NEXT: s_setpc_b64 s[30:31]
767 ; VI-LABEL: global_atomic_xchg_f64_ret_offset:
769 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
770 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
771 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
772 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
773 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
774 ; VI-NEXT: s_waitcnt vmcnt(0)
775 ; VI-NEXT: buffer_wbinvl1_vol
776 ; VI-NEXT: s_setpc_b64 s[30:31]
778 ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset:
780 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
781 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
782 ; GFX9-NEXT: s_waitcnt vmcnt(0)
783 ; GFX9-NEXT: buffer_wbinvl1_vol
784 ; GFX9-NEXT: s_setpc_b64 s[30:31]
785 %gep = getelementptr double, ptr addrspace(1) %out, i32 4
786 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
; Seq_cst atomicrmw xchg of a double through a global (addrspace(1)) pointer.
; Both the pointer and the value are passed inreg under amdgpu_gfx, and the
; function returns void, so the previous memory value is not handed back.
; Each active prefix expects one 64-bit swap instruction without glc
; (buffer_atomic_swap_x2, flat_atomic_swap_x2, global_atomic_swap_x2).
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
790 define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
865 ; SI-LABEL: global_atomic_xchg_f64_noret_scalar:
867 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
869 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
870 ; SI-NEXT: s_mov_b64 exec, s[34:35]
871 ; SI-NEXT: s_waitcnt expcnt(0)
872 ; SI-NEXT: v_writelane_b32 v0, s6, 0
873 ; SI-NEXT: v_writelane_b32 v0, s7, 1
874 ; SI-NEXT: s_mov_b32 s34, s7
875 ; SI-NEXT: s_mov_b32 s35, s6
876 ; SI-NEXT: s_mov_b32 s7, 0xf000
877 ; SI-NEXT: s_mov_b32 s6, -1
878 ; SI-NEXT: v_mov_b32_e32 v1, s35
879 ; SI-NEXT: v_mov_b32_e32 v2, s34
880 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
881 ; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
882 ; SI-NEXT: s_waitcnt vmcnt(0)
883 ; SI-NEXT: buffer_wbinvl1
884 ; SI-NEXT: v_readlane_b32 s7, v0, 1
885 ; SI-NEXT: v_readlane_b32 s6, v0, 0
886 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
887 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
888 ; SI-NEXT: s_mov_b64 exec, s[34:35]
889 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
890 ; SI-NEXT: s_setpc_b64 s[30:31]
892 ; VI-LABEL: global_atomic_xchg_f64_noret_scalar:
894 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
895 ; VI-NEXT: v_mov_b32_e32 v0, s6
896 ; VI-NEXT: v_mov_b32_e32 v1, s7
897 ; VI-NEXT: v_mov_b32_e32 v2, s4
898 ; VI-NEXT: v_mov_b32_e32 v3, s5
899 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
900 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
901 ; VI-NEXT: s_waitcnt vmcnt(0)
902 ; VI-NEXT: buffer_wbinvl1_vol
903 ; VI-NEXT: s_setpc_b64 s[30:31]
905 ; GFX9-LABEL: global_atomic_xchg_f64_noret_scalar:
907 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
908 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
909 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
910 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
911 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
912 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5]
913 ; GFX9-NEXT: s_waitcnt vmcnt(0)
914 ; GFX9-NEXT: buffer_wbinvl1_vol
915 ; GFX9-NEXT: s_setpc_b64 s[30:31]
916 %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
; Seq_cst atomicrmw xchg of a double at %out + 4 doubles (32 bytes) in global
; (addrspace(1)) memory. Pointer and value are passed inreg under amdgpu_gfx,
; and the function returns void, so the previous memory value is not handed
; back. The checks expect the 32-byte displacement as offset:32 on the SI and
; GFX9 instructions, and as an s_add_u32/s_addc_u32 pair for VI, where the
; flat atomic carries no offset field.
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
920 define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
999 ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
1001 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1003 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1004 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1005 ; SI-NEXT: s_waitcnt expcnt(0)
1006 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1007 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1008 ; SI-NEXT: v_mov_b32_e32 v1, s6
1009 ; SI-NEXT: v_mov_b32_e32 v2, s7
1010 ; SI-NEXT: s_mov_b32 s7, 0xf000
1011 ; SI-NEXT: s_mov_b32 s6, -1
1012 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1013 ; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
1014 ; SI-NEXT: s_waitcnt vmcnt(0)
1015 ; SI-NEXT: buffer_wbinvl1
1016 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1017 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1018 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1019 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1020 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1021 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1022 ; SI-NEXT: s_setpc_b64 s[30:31]
1024 ; VI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
1026 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1027 ; VI-NEXT: s_add_u32 s34, s4, 32
1028 ; VI-NEXT: s_addc_u32 s35, s5, 0
1029 ; VI-NEXT: v_mov_b32_e32 v2, s34
1030 ; VI-NEXT: v_mov_b32_e32 v0, s6
1031 ; VI-NEXT: v_mov_b32_e32 v1, s7
1032 ; VI-NEXT: v_mov_b32_e32 v3, s35
1033 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1034 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
1035 ; VI-NEXT: s_waitcnt vmcnt(0)
1036 ; VI-NEXT: buffer_wbinvl1_vol
1037 ; VI-NEXT: s_setpc_b64 s[30:31]
1039 ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
1041 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1042 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1043 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1044 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1045 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1046 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
1047 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1048 ; GFX9-NEXT: buffer_wbinvl1_vol
1049 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1050 %gep = getelementptr double, ptr addrspace(1) %out, i32 4
1051 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
; Seq_cst atomicrmw xchg of a double through a global (addrspace(1)) pointer,
; in a function declared to return double. Pointer and value are passed inreg
; under amdgpu_gfx. Each active prefix expects one 64-bit swap instruction
; carrying glc and writing its result to v[0:1].
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
1055 define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
1133 ; SI-LABEL: global_atomic_xchg_f64_ret_scalar:
1135 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1137 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1138 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1139 ; SI-NEXT: s_waitcnt expcnt(0)
1140 ; SI-NEXT: v_writelane_b32 v2, s6, 0
1141 ; SI-NEXT: v_writelane_b32 v2, s7, 1
1142 ; SI-NEXT: s_mov_b32 s34, s7
1143 ; SI-NEXT: s_mov_b32 s35, s6
1144 ; SI-NEXT: s_mov_b32 s7, 0xf000
1145 ; SI-NEXT: s_mov_b32 s6, -1
1146 ; SI-NEXT: v_mov_b32_e32 v0, s35
1147 ; SI-NEXT: v_mov_b32_e32 v1, s34
1148 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1149 ; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc
1150 ; SI-NEXT: s_waitcnt vmcnt(0)
1151 ; SI-NEXT: buffer_wbinvl1
1152 ; SI-NEXT: v_readlane_b32 s7, v2, 1
1153 ; SI-NEXT: v_readlane_b32 s6, v2, 0
1154 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1155 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1156 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1157 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1158 ; SI-NEXT: s_setpc_b64 s[30:31]
1160 ; VI-LABEL: global_atomic_xchg_f64_ret_scalar:
1162 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163 ; VI-NEXT: v_mov_b32_e32 v0, s6
1164 ; VI-NEXT: v_mov_b32_e32 v1, s7
1165 ; VI-NEXT: v_mov_b32_e32 v2, s4
1166 ; VI-NEXT: v_mov_b32_e32 v3, s5
1167 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1168 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
1169 ; VI-NEXT: s_waitcnt vmcnt(0)
1170 ; VI-NEXT: buffer_wbinvl1_vol
1171 ; VI-NEXT: s_setpc_b64 s[30:31]
1173 ; GFX9-LABEL: global_atomic_xchg_f64_ret_scalar:
1175 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1177 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1178 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1179 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1180 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
1181 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1182 ; GFX9-NEXT: buffer_wbinvl1_vol
1183 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1184 %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
; Seq_cst atomicrmw xchg of a double at %out + 4 doubles (32 bytes) in global
; (addrspace(1)) memory, in a function declared to return double. Pointer and
; value are passed inreg under amdgpu_gfx. The checks expect a glc swap
; writing v[0:1], with the 32-byte displacement as offset:32 on the SI and
; GFX9 instructions and as an s_add_u32/s_addc_u32 pair for VI.
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
1188 define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
1270 ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
1272 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1273 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1274 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1275 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1276 ; SI-NEXT: s_waitcnt expcnt(0)
1277 ; SI-NEXT: v_writelane_b32 v2, s6, 0
1278 ; SI-NEXT: v_writelane_b32 v2, s7, 1
1279 ; SI-NEXT: v_mov_b32_e32 v0, s6
1280 ; SI-NEXT: v_mov_b32_e32 v1, s7
1281 ; SI-NEXT: s_mov_b32 s7, 0xf000
1282 ; SI-NEXT: s_mov_b32 s6, -1
1283 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1284 ; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1285 ; SI-NEXT: s_waitcnt vmcnt(0)
1286 ; SI-NEXT: buffer_wbinvl1
1287 ; SI-NEXT: v_readlane_b32 s7, v2, 1
1288 ; SI-NEXT: v_readlane_b32 s6, v2, 0
1289 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1290 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1291 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1292 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1293 ; SI-NEXT: s_setpc_b64 s[30:31]
1295 ; VI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
1297 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1298 ; VI-NEXT: s_add_u32 s34, s4, 32
1299 ; VI-NEXT: s_addc_u32 s35, s5, 0
1300 ; VI-NEXT: v_mov_b32_e32 v2, s34
1301 ; VI-NEXT: v_mov_b32_e32 v0, s6
1302 ; VI-NEXT: v_mov_b32_e32 v1, s7
1303 ; VI-NEXT: v_mov_b32_e32 v3, s35
1304 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1305 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
1306 ; VI-NEXT: s_waitcnt vmcnt(0)
1307 ; VI-NEXT: buffer_wbinvl1_vol
1308 ; VI-NEXT: s_setpc_b64 s[30:31]
1310 ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
1312 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1313 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1314 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1315 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1316 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1317 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1318 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1319 ; GFX9-NEXT: buffer_wbinvl1_vol
1320 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1321 %gep = getelementptr double, ptr addrspace(1) %out, i32 4
1322 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
1326 ; ---------------------------------------------------------------------
1328 ; ---------------------------------------------------------------------
; Seq_cst atomicrmw add of an i64 through a global (addrspace(1)) pointer.
; The function returns void, so the previous memory value is not handed back.
; Each prefix expects one 64-bit add atomic without glc, followed by a
; vmcnt(0) wait and a cache invalidate (buffer_wbinvl1 / buffer_wbinvl1_vol).
1330 define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
1331 ; SI-LABEL: global_atomic_add_i64_noret:
1333 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1334 ; SI-NEXT: s_mov_b32 s6, 0
1335 ; SI-NEXT: s_mov_b32 s7, 0xf000
1336 ; SI-NEXT: s_mov_b32 s4, s6
1337 ; SI-NEXT: s_mov_b32 s5, s6
1338 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1339 ; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64
1340 ; SI-NEXT: s_waitcnt vmcnt(0)
1341 ; SI-NEXT: buffer_wbinvl1
1342 ; SI-NEXT: s_waitcnt expcnt(0)
1343 ; SI-NEXT: s_setpc_b64 s[30:31]
1345 ; VI-LABEL: global_atomic_add_i64_noret:
1347 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
1349 ; VI-NEXT: s_waitcnt vmcnt(0)
1350 ; VI-NEXT: buffer_wbinvl1_vol
1351 ; VI-NEXT: s_setpc_b64 s[30:31]
1353 ; GFX9-LABEL: global_atomic_add_i64_noret:
1355 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1356 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off
1357 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1358 ; GFX9-NEXT: buffer_wbinvl1_vol
1359 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1360 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 at %out + 4 i64 elements (32 bytes) in
; global (addrspace(1)) memory. The function returns void, so the previous
; memory value is not handed back. The checks expect the displacement as
; offset:32 on the SI and GFX9 instructions, and as a
; v_add_u32/v_addc_u32 pair on the address registers for VI.
1364 define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
1365 ; SI-LABEL: global_atomic_add_i64_noret_offset:
1367 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1368 ; SI-NEXT: s_mov_b32 s6, 0
1369 ; SI-NEXT: s_mov_b32 s7, 0xf000
1370 ; SI-NEXT: s_mov_b32 s4, s6
1371 ; SI-NEXT: s_mov_b32 s5, s6
1372 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1373 ; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1374 ; SI-NEXT: s_waitcnt vmcnt(0)
1375 ; SI-NEXT: buffer_wbinvl1
1376 ; SI-NEXT: s_waitcnt expcnt(0)
1377 ; SI-NEXT: s_setpc_b64 s[30:31]
1379 ; VI-LABEL: global_atomic_add_i64_noret_offset:
1381 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1382 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1383 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1384 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1385 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
1386 ; VI-NEXT: s_waitcnt vmcnt(0)
1387 ; VI-NEXT: buffer_wbinvl1_vol
1388 ; VI-NEXT: s_setpc_b64 s[30:31]
1390 ; GFX9-LABEL: global_atomic_add_i64_noret_offset:
1392 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32
1394 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1395 ; GFX9-NEXT: buffer_wbinvl1_vol
1396 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1397 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1398 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 through a global (addrspace(1)) pointer, in
; a function declared to return i64. Each prefix expects one 64-bit add
; atomic carrying glc. For the SI prefix the atomic writes v[2:3] and the
; checks then expect that pair to be copied into v0/v1.
1402 define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
1403 ; SI-LABEL: global_atomic_add_i64_ret:
1405 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1406 ; SI-NEXT: s_mov_b32 s6, 0
1407 ; SI-NEXT: s_mov_b32 s7, 0xf000
1408 ; SI-NEXT: s_mov_b32 s4, s6
1409 ; SI-NEXT: s_mov_b32 s5, s6
1410 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1411 ; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
1412 ; SI-NEXT: s_waitcnt vmcnt(0)
1413 ; SI-NEXT: buffer_wbinvl1
1414 ; SI-NEXT: v_mov_b32_e32 v0, v2
1415 ; SI-NEXT: v_mov_b32_e32 v1, v3
1416 ; SI-NEXT: s_waitcnt expcnt(0)
1417 ; SI-NEXT: s_setpc_b64 s[30:31]
1419 ; VI-LABEL: global_atomic_add_i64_ret:
1421 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
1423 ; VI-NEXT: s_waitcnt vmcnt(0)
1424 ; VI-NEXT: buffer_wbinvl1_vol
1425 ; VI-NEXT: s_setpc_b64 s[30:31]
1427 ; GFX9-LABEL: global_atomic_add_i64_ret:
1429 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off glc
1431 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1432 ; GFX9-NEXT: buffer_wbinvl1_vol
1433 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1434 %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 at %out + 4 i64 elements (32 bytes) in
; global (addrspace(1)) memory, in a function declared to return i64. The
; checks expect a glc add atomic, with the displacement as offset:32 on the
; SI and GFX9 instructions and as a v_add_u32/v_addc_u32 pair for VI. For the
; SI prefix the atomic writes v[2:3], which is then copied into v0/v1.
1438 define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
1439 ; SI-LABEL: global_atomic_add_i64_ret_offset:
1441 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; SI-NEXT: s_mov_b32 s6, 0
1443 ; SI-NEXT: s_mov_b32 s7, 0xf000
1444 ; SI-NEXT: s_mov_b32 s4, s6
1445 ; SI-NEXT: s_mov_b32 s5, s6
1446 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1447 ; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1448 ; SI-NEXT: s_waitcnt vmcnt(0)
1449 ; SI-NEXT: buffer_wbinvl1
1450 ; SI-NEXT: v_mov_b32_e32 v0, v2
1451 ; SI-NEXT: v_mov_b32_e32 v1, v3
1452 ; SI-NEXT: s_waitcnt expcnt(0)
1453 ; SI-NEXT: s_setpc_b64 s[30:31]
1455 ; VI-LABEL: global_atomic_add_i64_ret_offset:
1457 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1458 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1459 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1460 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1461 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
1462 ; VI-NEXT: s_waitcnt vmcnt(0)
1463 ; VI-NEXT: buffer_wbinvl1_vol
1464 ; VI-NEXT: s_setpc_b64 s[30:31]
1466 ; GFX9-LABEL: global_atomic_add_i64_ret_offset:
1468 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1469 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1470 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1471 ; GFX9-NEXT: buffer_wbinvl1_vol
1472 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1473 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1474 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 through a global (addrspace(1)) pointer.
; Both the pointer and the value are passed inreg under amdgpu_gfx, and the
; function returns void, so the previous memory value is not handed back.
; Each prefix expects one 64-bit add atomic without glc.
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
1478 define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1479 ; SI-LABEL: global_atomic_add_i64_noret_scalar:
1481 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1482 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1483 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1484 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1485 ; SI-NEXT: s_waitcnt expcnt(0)
1486 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1487 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1488 ; SI-NEXT: s_mov_b32 s34, s7
1489 ; SI-NEXT: s_mov_b32 s35, s6
1490 ; SI-NEXT: s_mov_b32 s7, 0xf000
1491 ; SI-NEXT: s_mov_b32 s6, -1
1492 ; SI-NEXT: v_mov_b32_e32 v1, s35
1493 ; SI-NEXT: v_mov_b32_e32 v2, s34
1494 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1495 ; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0
1496 ; SI-NEXT: s_waitcnt vmcnt(0)
1497 ; SI-NEXT: buffer_wbinvl1
1498 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1499 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1500 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1501 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1502 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1503 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1504 ; SI-NEXT: s_setpc_b64 s[30:31]
1506 ; VI-LABEL: global_atomic_add_i64_noret_scalar:
1508 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1509 ; VI-NEXT: v_mov_b32_e32 v0, s6
1510 ; VI-NEXT: v_mov_b32_e32 v1, s7
1511 ; VI-NEXT: v_mov_b32_e32 v2, s4
1512 ; VI-NEXT: v_mov_b32_e32 v3, s5
1513 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1514 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
1515 ; VI-NEXT: s_waitcnt vmcnt(0)
1516 ; VI-NEXT: buffer_wbinvl1_vol
1517 ; VI-NEXT: s_setpc_b64 s[30:31]
1519 ; GFX9-LABEL: global_atomic_add_i64_noret_scalar:
1521 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1522 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1523 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1524 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1525 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1526 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5]
1527 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1528 ; GFX9-NEXT: buffer_wbinvl1_vol
1529 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1530 %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 at %out + 4 i64 elements (32 bytes) in
; global (addrspace(1)) memory. Pointer and value are passed inreg under
; amdgpu_gfx, and the function returns void, so the previous memory value is
; not handed back. The checks expect the displacement as offset:32 on the SI
; and GFX9 instructions, and as an s_add_u32/s_addc_u32 pair for VI.
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
1534 define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1535 ; SI-LABEL: global_atomic_add_i64_noret_offset_scalar:
1537 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1538 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1539 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1540 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1541 ; SI-NEXT: s_waitcnt expcnt(0)
1542 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1543 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1544 ; SI-NEXT: v_mov_b32_e32 v1, s6
1545 ; SI-NEXT: v_mov_b32_e32 v2, s7
1546 ; SI-NEXT: s_mov_b32 s7, 0xf000
1547 ; SI-NEXT: s_mov_b32 s6, -1
1548 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1549 ; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 offset:32
1550 ; SI-NEXT: s_waitcnt vmcnt(0)
1551 ; SI-NEXT: buffer_wbinvl1
1552 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1553 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1554 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1555 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1556 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1557 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1558 ; SI-NEXT: s_setpc_b64 s[30:31]
1560 ; VI-LABEL: global_atomic_add_i64_noret_offset_scalar:
1562 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1563 ; VI-NEXT: s_add_u32 s34, s4, 32
1564 ; VI-NEXT: s_addc_u32 s35, s5, 0
1565 ; VI-NEXT: v_mov_b32_e32 v2, s34
1566 ; VI-NEXT: v_mov_b32_e32 v0, s6
1567 ; VI-NEXT: v_mov_b32_e32 v1, s7
1568 ; VI-NEXT: v_mov_b32_e32 v3, s35
1569 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1570 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
1571 ; VI-NEXT: s_waitcnt vmcnt(0)
1572 ; VI-NEXT: buffer_wbinvl1_vol
1573 ; VI-NEXT: s_setpc_b64 s[30:31]
1575 ; GFX9-LABEL: global_atomic_add_i64_noret_offset_scalar:
1577 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1578 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1579 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1580 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1581 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1582 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
1583 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1584 ; GFX9-NEXT: buffer_wbinvl1_vol
1585 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1586 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1587 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
; Seq_cst atomicrmw add of an i64 through a global (addrspace(1)) pointer, in
; a function declared to return i64. Pointer and value are passed inreg under
; amdgpu_gfx. Each prefix expects one 64-bit add atomic carrying glc and
; writing its result to v[0:1].
; For the SI prefix, s6/s7 are saved with v_writelane_b32 and restored with
; v_readlane_b32 because they are overwritten (-1, 0xf000) before s[4:7] is
; used as the operand of the buffer atomic.
1591 define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1592 ; SI-LABEL: global_atomic_add_i64_ret_scalar:
1594 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1595 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1596 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1597 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1598 ; SI-NEXT: s_waitcnt expcnt(0)
1599 ; SI-NEXT: v_writelane_b32 v2, s6, 0
1600 ; SI-NEXT: v_writelane_b32 v2, s7, 1
1601 ; SI-NEXT: s_mov_b32 s34, s7
1602 ; SI-NEXT: s_mov_b32 s35, s6
1603 ; SI-NEXT: s_mov_b32 s7, 0xf000
1604 ; SI-NEXT: s_mov_b32 s6, -1
1605 ; SI-NEXT: v_mov_b32_e32 v0, s35
1606 ; SI-NEXT: v_mov_b32_e32 v1, s34
1607 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1608 ; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc
1609 ; SI-NEXT: s_waitcnt vmcnt(0)
1610 ; SI-NEXT: buffer_wbinvl1
1611 ; SI-NEXT: v_readlane_b32 s7, v2, 1
1612 ; SI-NEXT: v_readlane_b32 s6, v2, 0
1613 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1614 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1615 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1616 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1617 ; SI-NEXT: s_setpc_b64 s[30:31]
1619 ; VI-LABEL: global_atomic_add_i64_ret_scalar:
1621 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1622 ; VI-NEXT: v_mov_b32_e32 v0, s6
1623 ; VI-NEXT: v_mov_b32_e32 v1, s7
1624 ; VI-NEXT: v_mov_b32_e32 v2, s4
1625 ; VI-NEXT: v_mov_b32_e32 v3, s5
1626 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1627 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
1628 ; VI-NEXT: s_waitcnt vmcnt(0)
1629 ; VI-NEXT: buffer_wbinvl1_vol
1630 ; VI-NEXT: s_setpc_b64 s[30:31]
1632 ; GFX9-LABEL: global_atomic_add_i64_ret_scalar:
1634 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1635 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1636 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1637 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1638 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1639 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc
1640 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1641 ; GFX9-NEXT: buffer_wbinvl1_vol
1642 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1643 %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw add` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out, with the old
; value bound to %result. The function uses the amdgpu_gfx calling convention,
; takes %out and %in as `inreg` arguments and is declared to return i64.
; What the autogenerated checks below expect:
;  - SI checks - v2 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; buffer_atomic_add_x2 uses s[4:7] with offset:32.
;  - VI checks - 32 is added to s4/s5 with s_add_u32/s_addc_u32 and the sum is
;    copied into v[2:3] for flat_atomic_add_x2.
;  - GFX9 checks - global_atomic_add_x2 uses s[4:5] with offset:32 and a zero
;    held in v2.
; All three expected atomics carry `glc`.
1647 define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1648 ; SI-LABEL: global_atomic_add_i64_ret_offset_scalar:
1650 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1651 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1652 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1653 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1654 ; SI-NEXT: s_waitcnt expcnt(0)
1655 ; SI-NEXT: v_writelane_b32 v2, s6, 0
1656 ; SI-NEXT: v_writelane_b32 v2, s7, 1
1657 ; SI-NEXT: v_mov_b32_e32 v0, s6
1658 ; SI-NEXT: v_mov_b32_e32 v1, s7
1659 ; SI-NEXT: s_mov_b32 s7, 0xf000
1660 ; SI-NEXT: s_mov_b32 s6, -1
1661 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1662 ; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1663 ; SI-NEXT: s_waitcnt vmcnt(0)
1664 ; SI-NEXT: buffer_wbinvl1
1665 ; SI-NEXT: v_readlane_b32 s7, v2, 1
1666 ; SI-NEXT: v_readlane_b32 s6, v2, 0
1667 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1668 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1669 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1670 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1671 ; SI-NEXT: s_setpc_b64 s[30:31]
1673 ; VI-LABEL: global_atomic_add_i64_ret_offset_scalar:
1675 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1676 ; VI-NEXT: s_add_u32 s34, s4, 32
1677 ; VI-NEXT: s_addc_u32 s35, s5, 0
1678 ; VI-NEXT: v_mov_b32_e32 v2, s34
1679 ; VI-NEXT: v_mov_b32_e32 v0, s6
1680 ; VI-NEXT: v_mov_b32_e32 v1, s7
1681 ; VI-NEXT: v_mov_b32_e32 v3, s35
1682 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1683 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
1684 ; VI-NEXT: s_waitcnt vmcnt(0)
1685 ; VI-NEXT: buffer_wbinvl1_vol
1686 ; VI-NEXT: s_setpc_b64 s[30:31]
1688 ; GFX9-LABEL: global_atomic_add_i64_ret_offset_scalar:
1690 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1691 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1692 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1693 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1694 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1695 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1697 ; GFX9-NEXT: buffer_wbinvl1_vol
1698 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
1699 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1700 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
1704 ; ---------------------------------------------------------------------
1706 ; ---------------------------------------------------------------------
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)). The function returns void, so the old value (%tmp0) is not
; returned. %ptr and %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000 and buffer_atomic_sub_x2
;    uses addr64 with v[0:1], operating on v[2:3].
;  - VI checks - flat_atomic_sub_x2 v[0:1], v[2:3].
;  - GFX9 checks - global_atomic_sub_x2 v[0:1], v[2:3], off.
; None of the expected atomics carries `glc`; each is followed by
; s_waitcnt vmcnt(0) and a buffer_wbinvl1 / buffer_wbinvl1_vol.
1708 define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
1709 ; SI-LABEL: global_atomic_sub_i64_noret:
1711 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1712 ; SI-NEXT: s_mov_b32 s6, 0
1713 ; SI-NEXT: s_mov_b32 s7, 0xf000
1714 ; SI-NEXT: s_mov_b32 s4, s6
1715 ; SI-NEXT: s_mov_b32 s5, s6
1716 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1717 ; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64
1718 ; SI-NEXT: s_waitcnt vmcnt(0)
1719 ; SI-NEXT: buffer_wbinvl1
1720 ; SI-NEXT: s_waitcnt expcnt(0)
1721 ; SI-NEXT: s_setpc_b64 s[30:31]
1723 ; VI-LABEL: global_atomic_sub_i64_noret:
1725 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
1727 ; VI-NEXT: s_waitcnt vmcnt(0)
1728 ; VI-NEXT: buffer_wbinvl1_vol
1729 ; VI-NEXT: s_setpc_b64 s[30:31]
1731 ; GFX9-LABEL: global_atomic_sub_i64_noret:
1733 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1734 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off
1735 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1736 ; GFX9-NEXT: buffer_wbinvl1_vol
1737 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
1738 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out. The function
; returns void, so the old value (%tmp0) is not returned. %out and %in are
; ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000 and buffer_atomic_sub_x2
;    uses addr64 with v[0:1] and offset:32.
;  - VI checks - 32 is added to v[0:1] with v_add_u32/v_addc_u32 before
;    flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 uses `off offset:32`.
; None of the expected atomics carries `glc`.
1742 define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
1743 ; SI-LABEL: global_atomic_sub_i64_noret_offset:
1745 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1746 ; SI-NEXT: s_mov_b32 s6, 0
1747 ; SI-NEXT: s_mov_b32 s7, 0xf000
1748 ; SI-NEXT: s_mov_b32 s4, s6
1749 ; SI-NEXT: s_mov_b32 s5, s6
1750 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1751 ; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
1752 ; SI-NEXT: s_waitcnt vmcnt(0)
1753 ; SI-NEXT: buffer_wbinvl1
1754 ; SI-NEXT: s_waitcnt expcnt(0)
1755 ; SI-NEXT: s_setpc_b64 s[30:31]
1757 ; VI-LABEL: global_atomic_sub_i64_noret_offset:
1759 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1760 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1761 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1762 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1763 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
1764 ; VI-NEXT: s_waitcnt vmcnt(0)
1765 ; VI-NEXT: buffer_wbinvl1_vol
1766 ; VI-NEXT: s_setpc_b64 s[30:31]
1768 ; GFX9-LABEL: global_atomic_sub_i64_noret_offset:
1770 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1771 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
1772 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1773 ; GFX9-NEXT: buffer_wbinvl1_vol
1774 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
1775 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1776 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)) whose old value is bound to %result; the function is declared
; to return i64. %ptr and %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000, buffer_atomic_sub_x2 uses
;    addr64 with v[0:1] and operates on v[2:3], which is then copied into
;    v[0:1] with two v_mov_b32.
;  - VI checks - flat_atomic_sub_x2 writes v[0:1] directly.
;  - GFX9 checks - global_atomic_sub_x2 ... off writes v[0:1] directly.
; All three expected atomics carry `glc`.
1780 define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
1781 ; SI-LABEL: global_atomic_sub_i64_ret:
1783 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1784 ; SI-NEXT: s_mov_b32 s6, 0
1785 ; SI-NEXT: s_mov_b32 s7, 0xf000
1786 ; SI-NEXT: s_mov_b32 s4, s6
1787 ; SI-NEXT: s_mov_b32 s5, s6
1788 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1789 ; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
1790 ; SI-NEXT: s_waitcnt vmcnt(0)
1791 ; SI-NEXT: buffer_wbinvl1
1792 ; SI-NEXT: v_mov_b32_e32 v0, v2
1793 ; SI-NEXT: v_mov_b32_e32 v1, v3
1794 ; SI-NEXT: s_waitcnt expcnt(0)
1795 ; SI-NEXT: s_setpc_b64 s[30:31]
1797 ; VI-LABEL: global_atomic_sub_i64_ret:
1799 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1800 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1801 ; VI-NEXT: s_waitcnt vmcnt(0)
1802 ; VI-NEXT: buffer_wbinvl1_vol
1803 ; VI-NEXT: s_setpc_b64 s[30:31]
1805 ; GFX9-LABEL: global_atomic_sub_i64_ret:
1807 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1808 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc
1809 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1810 ; GFX9-NEXT: buffer_wbinvl1_vol
1811 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
1812 %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out, with the old
; value bound to %result; the function is declared to return i64. %out and
; %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000, buffer_atomic_sub_x2 uses
;    addr64 with v[0:1] and offset:32, and v[2:3] is copied into v[0:1]
;    afterwards.
;  - VI checks - 32 is added to v[0:1] with v_add_u32/v_addc_u32 before
;    flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 uses `off offset:32`.
; All three expected atomics carry `glc`.
1816 define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
1817 ; SI-LABEL: global_atomic_sub_i64_ret_offset:
1819 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1820 ; SI-NEXT: s_mov_b32 s6, 0
1821 ; SI-NEXT: s_mov_b32 s7, 0xf000
1822 ; SI-NEXT: s_mov_b32 s4, s6
1823 ; SI-NEXT: s_mov_b32 s5, s6
1824 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1825 ; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
1826 ; SI-NEXT: s_waitcnt vmcnt(0)
1827 ; SI-NEXT: buffer_wbinvl1
1828 ; SI-NEXT: v_mov_b32_e32 v0, v2
1829 ; SI-NEXT: v_mov_b32_e32 v1, v3
1830 ; SI-NEXT: s_waitcnt expcnt(0)
1831 ; SI-NEXT: s_setpc_b64 s[30:31]
1833 ; VI-LABEL: global_atomic_sub_i64_ret_offset:
1835 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1836 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1837 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1838 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1839 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1840 ; VI-NEXT: s_waitcnt vmcnt(0)
1841 ; VI-NEXT: buffer_wbinvl1_vol
1842 ; VI-NEXT: s_setpc_b64 s[30:31]
1844 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset:
1846 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1847 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
1848 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1849 ; GFX9-NEXT: buffer_wbinvl1_vol
1850 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
1851 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1852 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)). The function uses the amdgpu_gfx calling convention, takes
; %ptr and %in as `inreg` arguments and returns void, so the old value (%tmp0)
; is not returned.
; What the autogenerated checks below expect:
;  - SI checks - v0 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; the atomic is buffer_atomic_sub_x2 on v[1:2] with s[4:7].
;  - VI checks - s4/s5 are copied into v[2:3] and used by flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 takes s[4:5] plus a zero held in v2.
; None of the expected atomics carries `glc`.
1856 define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1857 ; SI-LABEL: global_atomic_sub_i64_noret_scalar:
1859 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1860 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1861 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1862 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1863 ; SI-NEXT: s_waitcnt expcnt(0)
1864 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1865 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1866 ; SI-NEXT: s_mov_b32 s34, s7
1867 ; SI-NEXT: s_mov_b32 s35, s6
1868 ; SI-NEXT: s_mov_b32 s7, 0xf000
1869 ; SI-NEXT: s_mov_b32 s6, -1
1870 ; SI-NEXT: v_mov_b32_e32 v1, s35
1871 ; SI-NEXT: v_mov_b32_e32 v2, s34
1872 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1873 ; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0
1874 ; SI-NEXT: s_waitcnt vmcnt(0)
1875 ; SI-NEXT: buffer_wbinvl1
1876 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1877 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1878 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1879 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1880 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1881 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1882 ; SI-NEXT: s_setpc_b64 s[30:31]
1884 ; VI-LABEL: global_atomic_sub_i64_noret_scalar:
1886 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1887 ; VI-NEXT: v_mov_b32_e32 v0, s6
1888 ; VI-NEXT: v_mov_b32_e32 v1, s7
1889 ; VI-NEXT: v_mov_b32_e32 v2, s4
1890 ; VI-NEXT: v_mov_b32_e32 v3, s5
1891 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1892 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1893 ; VI-NEXT: s_waitcnt vmcnt(0)
1894 ; VI-NEXT: buffer_wbinvl1_vol
1895 ; VI-NEXT: s_setpc_b64 s[30:31]
1897 ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar:
1899 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1900 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1901 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1902 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1903 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1904 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
1905 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1906 ; GFX9-NEXT: buffer_wbinvl1_vol
1907 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
1908 %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out. The function
; uses the amdgpu_gfx calling convention, takes %out and %in as `inreg`
; arguments and returns void, so the old value (%tmp0) is not returned.
; What the autogenerated checks below expect:
;  - SI checks - v0 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; buffer_atomic_sub_x2 on v[1:2] uses s[4:7] with offset:32.
;  - VI checks - 32 is added to s4/s5 with s_add_u32/s_addc_u32 and the sum is
;    copied into v[2:3] for flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 uses s[4:5] with offset:32 and a zero
;    held in v2.
; None of the expected atomics carries `glc`.
1912 define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
1913 ; SI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1915 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1916 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1917 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
1918 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1919 ; SI-NEXT: s_waitcnt expcnt(0)
1920 ; SI-NEXT: v_writelane_b32 v0, s6, 0
1921 ; SI-NEXT: v_writelane_b32 v0, s7, 1
1922 ; SI-NEXT: v_mov_b32_e32 v1, s6
1923 ; SI-NEXT: v_mov_b32_e32 v2, s7
1924 ; SI-NEXT: s_mov_b32 s7, 0xf000
1925 ; SI-NEXT: s_mov_b32 s6, -1
1926 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1927 ; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32
1928 ; SI-NEXT: s_waitcnt vmcnt(0)
1929 ; SI-NEXT: buffer_wbinvl1
1930 ; SI-NEXT: v_readlane_b32 s7, v0, 1
1931 ; SI-NEXT: v_readlane_b32 s6, v0, 0
1932 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1933 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
1934 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1935 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1936 ; SI-NEXT: s_setpc_b64 s[30:31]
1938 ; VI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1940 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1941 ; VI-NEXT: s_add_u32 s34, s4, 32
1942 ; VI-NEXT: s_addc_u32 s35, s5, 0
1943 ; VI-NEXT: v_mov_b32_e32 v2, s34
1944 ; VI-NEXT: v_mov_b32_e32 v0, s6
1945 ; VI-NEXT: v_mov_b32_e32 v1, s7
1946 ; VI-NEXT: v_mov_b32_e32 v3, s35
1947 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1948 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1949 ; VI-NEXT: s_waitcnt vmcnt(0)
1950 ; VI-NEXT: buffer_wbinvl1_vol
1951 ; VI-NEXT: s_setpc_b64 s[30:31]
1953 ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar:
1955 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1956 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1957 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1958 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1959 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1960 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
1961 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1962 ; GFX9-NEXT: buffer_wbinvl1_vol
1963 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
1964 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1965 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)) whose old value is bound to %result. The function uses the
; amdgpu_gfx calling convention, takes %ptr and %in as `inreg` arguments and
; is declared to return i64.
; What the autogenerated checks below expect:
;  - SI checks - v2 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; the atomic is buffer_atomic_sub_x2 on v[0:1] with s[4:7].
;  - VI checks - s4/s5 are copied into v[2:3] and used by flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 takes s[4:5] plus a zero held in v2.
; All three expected atomics carry `glc`.
1969 define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
1970 ; SI-LABEL: global_atomic_sub_i64_ret_scalar:
1972 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1973 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1974 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1975 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1976 ; SI-NEXT: s_waitcnt expcnt(0)
1977 ; SI-NEXT: v_writelane_b32 v2, s6, 0
1978 ; SI-NEXT: v_writelane_b32 v2, s7, 1
1979 ; SI-NEXT: s_mov_b32 s34, s7
1980 ; SI-NEXT: s_mov_b32 s35, s6
1981 ; SI-NEXT: s_mov_b32 s7, 0xf000
1982 ; SI-NEXT: s_mov_b32 s6, -1
1983 ; SI-NEXT: v_mov_b32_e32 v0, s35
1984 ; SI-NEXT: v_mov_b32_e32 v1, s34
1985 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1986 ; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
1987 ; SI-NEXT: s_waitcnt vmcnt(0)
1988 ; SI-NEXT: buffer_wbinvl1
1989 ; SI-NEXT: v_readlane_b32 s7, v2, 1
1990 ; SI-NEXT: v_readlane_b32 s6, v2, 0
1991 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
1992 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
1993 ; SI-NEXT: s_mov_b64 exec, s[34:35]
1994 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1995 ; SI-NEXT: s_setpc_b64 s[30:31]
1997 ; VI-LABEL: global_atomic_sub_i64_ret_scalar:
1999 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2000 ; VI-NEXT: v_mov_b32_e32 v0, s6
2001 ; VI-NEXT: v_mov_b32_e32 v1, s7
2002 ; VI-NEXT: v_mov_b32_e32 v2, s4
2003 ; VI-NEXT: v_mov_b32_e32 v3, s5
2004 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2005 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
2006 ; VI-NEXT: s_waitcnt vmcnt(0)
2007 ; VI-NEXT: buffer_wbinvl1_vol
2008 ; VI-NEXT: s_setpc_b64 s[30:31]
2010 ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar:
2012 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2013 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2014 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2015 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2016 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2017 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
2018 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2019 ; GFX9-NEXT: buffer_wbinvl1_vol
2020 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
2021 %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw sub` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out, with the old
; value bound to %result. The function uses the amdgpu_gfx calling convention,
; takes %out and %in as `inreg` arguments and is declared to return i64.
; What the autogenerated checks below expect:
;  - SI checks - v2 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; buffer_atomic_sub_x2 uses s[4:7] with offset:32.
;  - VI checks - 32 is added to s4/s5 with s_add_u32/s_addc_u32 and the sum is
;    copied into v[2:3] for flat_atomic_sub_x2.
;  - GFX9 checks - global_atomic_sub_x2 uses s[4:5] with offset:32 and a zero
;    held in v2.
; All three expected atomics carry `glc`.
2025 define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2026 ; SI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
2028 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2029 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2030 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
2031 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2032 ; SI-NEXT: s_waitcnt expcnt(0)
2033 ; SI-NEXT: v_writelane_b32 v2, s6, 0
2034 ; SI-NEXT: v_writelane_b32 v2, s7, 1
2035 ; SI-NEXT: v_mov_b32_e32 v0, s6
2036 ; SI-NEXT: v_mov_b32_e32 v1, s7
2037 ; SI-NEXT: s_mov_b32 s7, 0xf000
2038 ; SI-NEXT: s_mov_b32 s6, -1
2039 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2040 ; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2041 ; SI-NEXT: s_waitcnt vmcnt(0)
2042 ; SI-NEXT: buffer_wbinvl1
2043 ; SI-NEXT: v_readlane_b32 s7, v2, 1
2044 ; SI-NEXT: v_readlane_b32 s6, v2, 0
2045 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2046 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
2047 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2048 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2049 ; SI-NEXT: s_setpc_b64 s[30:31]
2051 ; VI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
2053 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2054 ; VI-NEXT: s_add_u32 s34, s4, 32
2055 ; VI-NEXT: s_addc_u32 s35, s5, 0
2056 ; VI-NEXT: v_mov_b32_e32 v2, s34
2057 ; VI-NEXT: v_mov_b32_e32 v0, s6
2058 ; VI-NEXT: v_mov_b32_e32 v1, s7
2059 ; VI-NEXT: v_mov_b32_e32 v3, s35
2060 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2061 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
2062 ; VI-NEXT: s_waitcnt vmcnt(0)
2063 ; VI-NEXT: buffer_wbinvl1_vol
2064 ; VI-NEXT: s_setpc_b64 s[30:31]
2066 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar:
2068 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2069 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2070 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2071 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2072 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2073 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
2074 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2075 ; GFX9-NEXT: buffer_wbinvl1_vol
2076 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
2077 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2078 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
2082 ; ---------------------------------------------------------------------
2084 ; ---------------------------------------------------------------------
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)). The function returns void, so the old value (%tmp0) is not
; returned. %ptr and %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000 and buffer_atomic_and_x2
;    uses addr64 with v[0:1], operating on v[2:3].
;  - VI checks - flat_atomic_and_x2 v[0:1], v[2:3].
;  - GFX9 checks - global_atomic_and_x2 v[0:1], v[2:3], off.
; None of the expected atomics carries `glc`; each is followed by
; s_waitcnt vmcnt(0) and a buffer_wbinvl1 / buffer_wbinvl1_vol.
2086 define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
2087 ; SI-LABEL: global_atomic_and_i64_noret:
2089 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2090 ; SI-NEXT: s_mov_b32 s6, 0
2091 ; SI-NEXT: s_mov_b32 s7, 0xf000
2092 ; SI-NEXT: s_mov_b32 s4, s6
2093 ; SI-NEXT: s_mov_b32 s5, s6
2094 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2095 ; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64
2096 ; SI-NEXT: s_waitcnt vmcnt(0)
2097 ; SI-NEXT: buffer_wbinvl1
2098 ; SI-NEXT: s_waitcnt expcnt(0)
2099 ; SI-NEXT: s_setpc_b64 s[30:31]
2101 ; VI-LABEL: global_atomic_and_i64_noret:
2103 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
2105 ; VI-NEXT: s_waitcnt vmcnt(0)
2106 ; VI-NEXT: buffer_wbinvl1_vol
2107 ; VI-NEXT: s_setpc_b64 s[30:31]
2109 ; GFX9-LABEL: global_atomic_and_i64_noret:
2111 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off
2113 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2114 ; GFX9-NEXT: buffer_wbinvl1_vol
2115 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
2116 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out. The function
; returns void, so the old value (%tmp0) is not returned. %out and %in are
; ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000 and buffer_atomic_and_x2
;    uses addr64 with v[0:1] and offset:32.
;  - VI checks - 32 is added to v[0:1] with v_add_u32/v_addc_u32 before
;    flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 uses `off offset:32`.
; None of the expected atomics carries `glc`.
2120 define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
2121 ; SI-LABEL: global_atomic_and_i64_noret_offset:
2123 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2124 ; SI-NEXT: s_mov_b32 s6, 0
2125 ; SI-NEXT: s_mov_b32 s7, 0xf000
2126 ; SI-NEXT: s_mov_b32 s4, s6
2127 ; SI-NEXT: s_mov_b32 s5, s6
2128 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2129 ; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
2130 ; SI-NEXT: s_waitcnt vmcnt(0)
2131 ; SI-NEXT: buffer_wbinvl1
2132 ; SI-NEXT: s_waitcnt expcnt(0)
2133 ; SI-NEXT: s_setpc_b64 s[30:31]
2135 ; VI-LABEL: global_atomic_and_i64_noret_offset:
2137 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2138 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2139 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2140 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2141 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
2142 ; VI-NEXT: s_waitcnt vmcnt(0)
2143 ; VI-NEXT: buffer_wbinvl1_vol
2144 ; VI-NEXT: s_setpc_b64 s[30:31]
2146 ; GFX9-LABEL: global_atomic_and_i64_noret_offset:
2148 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32
2150 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX9-NEXT: buffer_wbinvl1_vol
2152 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
2153 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2154 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)) whose old value is bound to %result; the function is declared
; to return i64. %ptr and %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000, buffer_atomic_and_x2 uses
;    addr64 with v[0:1] and operates on v[2:3], which is then copied into
;    v[0:1] with two v_mov_b32.
;  - VI checks - flat_atomic_and_x2 writes v[0:1] directly.
;  - GFX9 checks - global_atomic_and_x2 ... off writes v[0:1] directly.
; All three expected atomics carry `glc`.
2158 define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
2159 ; SI-LABEL: global_atomic_and_i64_ret:
2161 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2162 ; SI-NEXT: s_mov_b32 s6, 0
2163 ; SI-NEXT: s_mov_b32 s7, 0xf000
2164 ; SI-NEXT: s_mov_b32 s4, s6
2165 ; SI-NEXT: s_mov_b32 s5, s6
2166 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2167 ; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
2168 ; SI-NEXT: s_waitcnt vmcnt(0)
2169 ; SI-NEXT: buffer_wbinvl1
2170 ; SI-NEXT: v_mov_b32_e32 v0, v2
2171 ; SI-NEXT: v_mov_b32_e32 v1, v3
2172 ; SI-NEXT: s_waitcnt expcnt(0)
2173 ; SI-NEXT: s_setpc_b64 s[30:31]
2175 ; VI-LABEL: global_atomic_and_i64_ret:
2177 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2178 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
2179 ; VI-NEXT: s_waitcnt vmcnt(0)
2180 ; VI-NEXT: buffer_wbinvl1_vol
2181 ; VI-NEXT: s_setpc_b64 s[30:31]
2183 ; GFX9-LABEL: global_atomic_and_i64_ret:
2185 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2186 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc
2187 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2188 ; GFX9-NEXT: buffer_wbinvl1_vol
2189 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
2190 %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out, with the old
; value bound to %result; the function is declared to return i64. %out and
; %in are ordinary (non-inreg) arguments.
; What the autogenerated checks below expect:
;  - SI checks - s[4:7] is built from 0 / 0xf000, buffer_atomic_and_x2 uses
;    addr64 with v[0:1] and offset:32, and v[2:3] is copied into v[0:1]
;    afterwards.
;  - VI checks - 32 is added to v[0:1] with v_add_u32/v_addc_u32 before
;    flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 uses `off offset:32`.
; All three expected atomics carry `glc`.
2194 define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
2195 ; SI-LABEL: global_atomic_and_i64_ret_offset:
2197 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2198 ; SI-NEXT: s_mov_b32 s6, 0
2199 ; SI-NEXT: s_mov_b32 s7, 0xf000
2200 ; SI-NEXT: s_mov_b32 s4, s6
2201 ; SI-NEXT: s_mov_b32 s5, s6
2202 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2203 ; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
2204 ; SI-NEXT: s_waitcnt vmcnt(0)
2205 ; SI-NEXT: buffer_wbinvl1
2206 ; SI-NEXT: v_mov_b32_e32 v0, v2
2207 ; SI-NEXT: v_mov_b32_e32 v1, v3
2208 ; SI-NEXT: s_waitcnt expcnt(0)
2209 ; SI-NEXT: s_setpc_b64 s[30:31]
2211 ; VI-LABEL: global_atomic_and_i64_ret_offset:
2213 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2214 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2215 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2216 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2217 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
2218 ; VI-NEXT: s_waitcnt vmcnt(0)
2219 ; VI-NEXT: buffer_wbinvl1_vol
2220 ; VI-NEXT: s_setpc_b64 s[30:31]
2222 ; GFX9-LABEL: global_atomic_and_i64_ret_offset:
2224 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2225 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
2226 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2227 ; GFX9-NEXT: buffer_wbinvl1_vol
2228 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
2229 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2230 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)). The function uses the amdgpu_gfx calling convention, takes
; %ptr and %in as `inreg` arguments and returns void, so the old value (%tmp0)
; is not returned.
; What the autogenerated checks below expect:
;  - SI checks - v0 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; the atomic is buffer_atomic_and_x2 on v[1:2] with s[4:7].
;  - VI checks - s4/s5 are copied into v[2:3] and used by flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 takes s[4:5] plus a zero held in v2.
; None of the expected atomics carries `glc`.
2234 define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2235 ; SI-LABEL: global_atomic_and_i64_noret_scalar:
2237 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2238 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2239 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2240 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2241 ; SI-NEXT: s_waitcnt expcnt(0)
2242 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2243 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2244 ; SI-NEXT: s_mov_b32 s34, s7
2245 ; SI-NEXT: s_mov_b32 s35, s6
2246 ; SI-NEXT: s_mov_b32 s7, 0xf000
2247 ; SI-NEXT: s_mov_b32 s6, -1
2248 ; SI-NEXT: v_mov_b32_e32 v1, s35
2249 ; SI-NEXT: v_mov_b32_e32 v2, s34
2250 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2251 ; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0
2252 ; SI-NEXT: s_waitcnt vmcnt(0)
2253 ; SI-NEXT: buffer_wbinvl1
2254 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2255 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2256 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2257 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2258 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2259 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2260 ; SI-NEXT: s_setpc_b64 s[30:31]
2262 ; VI-LABEL: global_atomic_and_i64_noret_scalar:
2264 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2265 ; VI-NEXT: v_mov_b32_e32 v0, s6
2266 ; VI-NEXT: v_mov_b32_e32 v1, s7
2267 ; VI-NEXT: v_mov_b32_e32 v2, s4
2268 ; VI-NEXT: v_mov_b32_e32 v3, s5
2269 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2270 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
2271 ; VI-NEXT: s_waitcnt vmcnt(0)
2272 ; VI-NEXT: buffer_wbinvl1_vol
2273 ; VI-NEXT: s_setpc_b64 s[30:31]
2275 ; GFX9-LABEL: global_atomic_and_i64_noret_scalar:
2277 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2278 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2279 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2280 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2281 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2282 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
2283 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2284 ; GFX9-NEXT: buffer_wbinvl1_vol
2285 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
2286 %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out. The function
; uses the amdgpu_gfx calling convention, takes %out and %in as `inreg`
; arguments and returns void, so the old value (%tmp0) is not returned.
; What the autogenerated checks below expect:
;  - SI checks - v0 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; buffer_atomic_and_x2 on v[1:2] uses s[4:7] with offset:32.
;  - VI checks - 32 is added to s4/s5 with s_add_u32/s_addc_u32 and the sum is
;    copied into v[2:3] for flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 uses s[4:5] with offset:32 and a zero
;    held in v2.
; None of the expected atomics carries `glc`.
2290 define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2291 ; SI-LABEL: global_atomic_and_i64_noret_offset_scalar:
2293 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2294 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2295 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2296 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2297 ; SI-NEXT: s_waitcnt expcnt(0)
2298 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2299 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2300 ; SI-NEXT: v_mov_b32_e32 v1, s6
2301 ; SI-NEXT: v_mov_b32_e32 v2, s7
2302 ; SI-NEXT: s_mov_b32 s7, 0xf000
2303 ; SI-NEXT: s_mov_b32 s6, -1
2304 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2305 ; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32
2306 ; SI-NEXT: s_waitcnt vmcnt(0)
2307 ; SI-NEXT: buffer_wbinvl1
2308 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2309 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2310 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2311 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2312 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2313 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2314 ; SI-NEXT: s_setpc_b64 s[30:31]
2316 ; VI-LABEL: global_atomic_and_i64_noret_offset_scalar:
2318 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2319 ; VI-NEXT: s_add_u32 s34, s4, 32
2320 ; VI-NEXT: s_addc_u32 s35, s5, 0
2321 ; VI-NEXT: v_mov_b32_e32 v2, s34
2322 ; VI-NEXT: v_mov_b32_e32 v0, s6
2323 ; VI-NEXT: v_mov_b32_e32 v1, s7
2324 ; VI-NEXT: v_mov_b32_e32 v3, s35
2325 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2326 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
2327 ; VI-NEXT: s_waitcnt vmcnt(0)
2328 ; VI-NEXT: buffer_wbinvl1_vol
2329 ; VI-NEXT: s_setpc_b64 s[30:31]
2331 ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar:
2333 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2334 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2335 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2336 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2337 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2338 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
2339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2340 ; GFX9-NEXT: buffer_wbinvl1_vol
2341 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
2342 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2343 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)) whose old value is bound to %result. The function uses the
; amdgpu_gfx calling convention, takes %ptr and %in as `inreg` arguments and
; is declared to return i64.
; What the autogenerated checks below expect:
;  - SI checks - v2 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; the atomic is buffer_atomic_and_x2 on v[0:1] with s[4:7].
;  - VI checks - s4/s5 are copied into v[2:3] and used by flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 takes s[4:5] plus a zero held in v2.
; All three expected atomics carry `glc`.
2347 define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2348 ; SI-LABEL: global_atomic_and_i64_ret_scalar:
2350 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2351 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2352 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
2353 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2354 ; SI-NEXT: s_waitcnt expcnt(0)
2355 ; SI-NEXT: v_writelane_b32 v2, s6, 0
2356 ; SI-NEXT: v_writelane_b32 v2, s7, 1
2357 ; SI-NEXT: s_mov_b32 s34, s7
2358 ; SI-NEXT: s_mov_b32 s35, s6
2359 ; SI-NEXT: s_mov_b32 s7, 0xf000
2360 ; SI-NEXT: s_mov_b32 s6, -1
2361 ; SI-NEXT: v_mov_b32_e32 v0, s35
2362 ; SI-NEXT: v_mov_b32_e32 v1, s34
2363 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2364 ; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc
2365 ; SI-NEXT: s_waitcnt vmcnt(0)
2366 ; SI-NEXT: buffer_wbinvl1
2367 ; SI-NEXT: v_readlane_b32 s7, v2, 1
2368 ; SI-NEXT: v_readlane_b32 s6, v2, 0
2369 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2370 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
2371 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2372 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2373 ; SI-NEXT: s_setpc_b64 s[30:31]
2375 ; VI-LABEL: global_atomic_and_i64_ret_scalar:
2377 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378 ; VI-NEXT: v_mov_b32_e32 v0, s6
2379 ; VI-NEXT: v_mov_b32_e32 v1, s7
2380 ; VI-NEXT: v_mov_b32_e32 v2, s4
2381 ; VI-NEXT: v_mov_b32_e32 v3, s5
2382 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2383 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
2384 ; VI-NEXT: s_waitcnt vmcnt(0)
2385 ; VI-NEXT: buffer_wbinvl1_vol
2386 ; VI-NEXT: s_setpc_b64 s[30:31]
2388 ; GFX9-LABEL: global_atomic_and_i64_ret_scalar:
2390 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2391 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2392 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2393 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2394 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2395 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
2396 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2397 ; GFX9-NEXT: buffer_wbinvl1_vol
2398 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to %ptr with no offset.
2399 %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
; Codegen test for a seq_cst `atomicrmw and` on an i64 in global memory
; (addrspace(1)), addressed 4 i64 elements (32 bytes) past %out, with the old
; value bound to %result. The function uses the amdgpu_gfx calling convention,
; takes %out and %in as `inreg` arguments and is declared to return i64.
; What the autogenerated checks below expect:
;  - SI checks - v2 is spilled and reloaded under s_xor_saveexec_b64 and keeps
;    the incoming s6/s7 in lanes 0/1 while s6/s7 are overwritten with
;    -1 / 0xf000; buffer_atomic_and_x2 uses s[4:7] with offset:32.
;  - VI checks - 32 is added to s4/s5 with s_add_u32/s_addc_u32 and the sum is
;    copied into v[2:3] for flat_atomic_and_x2.
;  - GFX9 checks - global_atomic_and_x2 uses s[4:5] with offset:32 and a zero
;    held in v2.
; All three expected atomics carry `glc`.
2403 define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2404 ; SI-LABEL: global_atomic_and_i64_ret_offset_scalar:
2406 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2407 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2408 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
2409 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2410 ; SI-NEXT: s_waitcnt expcnt(0)
2411 ; SI-NEXT: v_writelane_b32 v2, s6, 0
2412 ; SI-NEXT: v_writelane_b32 v2, s7, 1
2413 ; SI-NEXT: v_mov_b32_e32 v0, s6
2414 ; SI-NEXT: v_mov_b32_e32 v1, s7
2415 ; SI-NEXT: s_mov_b32 s7, 0xf000
2416 ; SI-NEXT: s_mov_b32 s6, -1
2417 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2418 ; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2419 ; SI-NEXT: s_waitcnt vmcnt(0)
2420 ; SI-NEXT: buffer_wbinvl1
2421 ; SI-NEXT: v_readlane_b32 s7, v2, 1
2422 ; SI-NEXT: v_readlane_b32 s6, v2, 0
2423 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2424 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
2425 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2426 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2427 ; SI-NEXT: s_setpc_b64 s[30:31]
2429 ; VI-LABEL: global_atomic_and_i64_ret_offset_scalar:
2431 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2432 ; VI-NEXT: s_add_u32 s34, s4, 32
2433 ; VI-NEXT: s_addc_u32 s35, s5, 0
2434 ; VI-NEXT: v_mov_b32_e32 v2, s34
2435 ; VI-NEXT: v_mov_b32_e32 v0, s6
2436 ; VI-NEXT: v_mov_b32_e32 v1, s7
2437 ; VI-NEXT: v_mov_b32_e32 v3, s35
2438 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2439 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
2440 ; VI-NEXT: s_waitcnt vmcnt(0)
2441 ; VI-NEXT: buffer_wbinvl1_vol
2442 ; VI-NEXT: s_setpc_b64 s[30:31]
2444 ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar:
2446 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2447 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2448 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2449 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2450 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2451 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
2452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2453 ; GFX9-NEXT: buffer_wbinvl1_vol
2454 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte displacement, the 32 seen in the checks.
2455 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2456 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
2460 ; ---------------------------------------------------------------------
2462 ; ---------------------------------------------------------------------
; `atomicrmw nand` (seq_cst) on an i64 in addrspace(1) in a function that
; returns void. None of the checked targets use a single nand instruction; each
; expects a compare-and-swap loop (%atomicrmw.start): load the current value,
; compute ~(old & %in) on both 32-bit halves with v_and_b32/v_not_b32, then
; issue *_atomic_cmpswap_x2 with glc. Lanes whose cmpswap returned the value
; they expected are accumulated into a mask and removed from exec; the loop
; repeats while any lane remains, and exec is restored at %atomicrmw.end.
2464 define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
2465 ; SI-LABEL: global_atomic_nand_i64_noret:
2467 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2468 ; SI-NEXT: s_mov_b32 s6, 0
2469 ; SI-NEXT: s_mov_b32 s7, 0xf000
2470 ; SI-NEXT: s_mov_b32 s4, s6
2471 ; SI-NEXT: s_mov_b32 s5, s6
2472 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
2473 ; SI-NEXT: s_mov_b64 s[8:9], 0
2474 ; SI-NEXT: .LBB40_1: ; %atomicrmw.start
2475 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2476 ; SI-NEXT: s_waitcnt vmcnt(0)
2477 ; SI-NEXT: v_and_b32_e32 v4, v7, v3
2478 ; SI-NEXT: s_waitcnt expcnt(0)
2479 ; SI-NEXT: v_and_b32_e32 v8, v6, v2
2480 ; SI-NEXT: v_not_b32_e32 v5, v4
2481 ; SI-NEXT: v_not_b32_e32 v4, v8
2482 ; SI-NEXT: v_mov_b32_e32 v11, v7
2483 ; SI-NEXT: v_mov_b32_e32 v10, v6
2484 ; SI-NEXT: v_mov_b32_e32 v9, v5
2485 ; SI-NEXT: v_mov_b32_e32 v8, v4
2486 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2487 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
2488 ; SI-NEXT: s_waitcnt vmcnt(0)
2489 ; SI-NEXT: buffer_wbinvl1
2490 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
2491 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2492 ; SI-NEXT: v_mov_b32_e32 v6, v8
2493 ; SI-NEXT: v_mov_b32_e32 v7, v9
2494 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2495 ; SI-NEXT: s_cbranch_execnz .LBB40_1
2496 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2497 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2498 ; SI-NEXT: s_waitcnt expcnt(0)
2499 ; SI-NEXT: s_setpc_b64 s[30:31]
2501 ; VI-LABEL: global_atomic_nand_i64_noret:
2503 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2504 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
2505 ; VI-NEXT: s_mov_b64 s[4:5], 0
2506 ; VI-NEXT: .LBB40_1: ; %atomicrmw.start
2507 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2508 ; VI-NEXT: s_waitcnt vmcnt(0)
2509 ; VI-NEXT: v_and_b32_e32 v4, v7, v3
2510 ; VI-NEXT: v_and_b32_e32 v8, v6, v2
2511 ; VI-NEXT: v_not_b32_e32 v5, v4
2512 ; VI-NEXT: v_not_b32_e32 v4, v8
2513 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2514 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2515 ; VI-NEXT: s_waitcnt vmcnt(0)
2516 ; VI-NEXT: buffer_wbinvl1_vol
2517 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2518 ; VI-NEXT: v_mov_b32_e32 v7, v5
2519 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2520 ; VI-NEXT: v_mov_b32_e32 v6, v4
2521 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2522 ; VI-NEXT: s_cbranch_execnz .LBB40_1
2523 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2524 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2525 ; VI-NEXT: s_setpc_b64 s[30:31]
2527 ; GFX9-LABEL: global_atomic_nand_i64_noret:
2529 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2530 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
2531 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2532 ; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
2533 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2534 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2535 ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
2536 ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
2537 ; GFX9-NEXT: v_not_b32_e32 v5, v4
2538 ; GFX9-NEXT: v_not_b32_e32 v4, v8
2539 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2540 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
2541 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2542 ; GFX9-NEXT: buffer_wbinvl1_vol
2543 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2544 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
2545 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2546 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2547 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2548 ; GFX9-NEXT: s_cbranch_execnz .LBB40_1
2549 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2550 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2551 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2552 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
; `atomicrmw nand` (seq_cst) on an i64 in addrspace(1), addressed at
; %out + 4 i64 elements, in a function that returns void. All checked targets
; expect a compare-and-swap loop (%atomicrmw.start) that computes ~(old & %in)
; with v_and_b32/v_not_b32 and retries *_atomic_cmpswap_x2 until no lane
; remains in exec. The 32-byte offset is carried as an immediate on both the
; initial load and the cmpswap in the SI and GFX9 checks; the VI checks add 32
; to the address once, before the loop, with v_add_u32/v_addc_u32.
2556 define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
2557 ; SI-LABEL: global_atomic_nand_i64_noret_offset:
2559 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2560 ; SI-NEXT: s_mov_b32 s6, 0
2561 ; SI-NEXT: s_mov_b32 s7, 0xf000
2562 ; SI-NEXT: s_mov_b32 s4, s6
2563 ; SI-NEXT: s_mov_b32 s5, s6
2564 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
2565 ; SI-NEXT: s_mov_b64 s[8:9], 0
2566 ; SI-NEXT: .LBB41_1: ; %atomicrmw.start
2567 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2568 ; SI-NEXT: s_waitcnt vmcnt(0)
2569 ; SI-NEXT: v_and_b32_e32 v4, v7, v3
2570 ; SI-NEXT: s_waitcnt expcnt(0)
2571 ; SI-NEXT: v_and_b32_e32 v8, v6, v2
2572 ; SI-NEXT: v_not_b32_e32 v5, v4
2573 ; SI-NEXT: v_not_b32_e32 v4, v8
2574 ; SI-NEXT: v_mov_b32_e32 v11, v7
2575 ; SI-NEXT: v_mov_b32_e32 v10, v6
2576 ; SI-NEXT: v_mov_b32_e32 v9, v5
2577 ; SI-NEXT: v_mov_b32_e32 v8, v4
2578 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2579 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
2580 ; SI-NEXT: s_waitcnt vmcnt(0)
2581 ; SI-NEXT: buffer_wbinvl1
2582 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
2583 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2584 ; SI-NEXT: v_mov_b32_e32 v6, v8
2585 ; SI-NEXT: v_mov_b32_e32 v7, v9
2586 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2587 ; SI-NEXT: s_cbranch_execnz .LBB41_1
2588 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2589 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2590 ; SI-NEXT: s_waitcnt expcnt(0)
2591 ; SI-NEXT: s_setpc_b64 s[30:31]
2593 ; VI-LABEL: global_atomic_nand_i64_noret_offset:
2595 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2596 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2597 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2598 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
2599 ; VI-NEXT: s_mov_b64 s[4:5], 0
2600 ; VI-NEXT: .LBB41_1: ; %atomicrmw.start
2601 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2602 ; VI-NEXT: s_waitcnt vmcnt(0)
2603 ; VI-NEXT: v_and_b32_e32 v4, v7, v3
2604 ; VI-NEXT: v_and_b32_e32 v8, v6, v2
2605 ; VI-NEXT: v_not_b32_e32 v5, v4
2606 ; VI-NEXT: v_not_b32_e32 v4, v8
2607 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2608 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2609 ; VI-NEXT: s_waitcnt vmcnt(0)
2610 ; VI-NEXT: buffer_wbinvl1_vol
2611 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2612 ; VI-NEXT: v_mov_b32_e32 v7, v5
2613 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2614 ; VI-NEXT: v_mov_b32_e32 v6, v4
2615 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2616 ; VI-NEXT: s_cbranch_execnz .LBB41_1
2617 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2618 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2619 ; VI-NEXT: s_setpc_b64 s[30:31]
2621 ; GFX9-LABEL: global_atomic_nand_i64_noret_offset:
2623 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2624 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
2625 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2626 ; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
2627 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2628 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2629 ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
2630 ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
2631 ; GFX9-NEXT: v_not_b32_e32 v5, v4
2632 ; GFX9-NEXT: v_not_b32_e32 v4, v8
2633 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2634 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
2635 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2636 ; GFX9-NEXT: buffer_wbinvl1_vol
2637 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2638 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
2639 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2640 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2641 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2642 ; GFX9-NEXT: s_cbranch_execnz .LBB41_1
2643 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2644 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2645 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2646 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2647 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
; Value-producing `atomicrmw nand` (seq_cst) on an i64 in addrspace(1); the
; function is declared to return i64. All checked targets expect a
; compare-and-swap loop (%atomicrmw.start): each iteration first copies the
; last observed value into the compare half of the cmpswap data registers,
; computes ~(old & %in) with v_and_b32/v_not_b32, and retries
; *_atomic_cmpswap_x2 until no lane remains in exec. In the VI and GFX9 checks
; the cmpswap result in v[4:5] is copied to v[0:1] after the loop; in the SI
; checks the cmpswap already writes its result starting at v0.
2651 define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
2652 ; SI-LABEL: global_atomic_nand_i64_ret:
2654 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2655 ; SI-NEXT: v_mov_b32_e32 v4, v3
2656 ; SI-NEXT: v_mov_b32_e32 v5, v2
2657 ; SI-NEXT: v_mov_b32_e32 v7, v1
2658 ; SI-NEXT: v_mov_b32_e32 v6, v0
2659 ; SI-NEXT: s_mov_b32 s6, 0
2660 ; SI-NEXT: s_mov_b32 s7, 0xf000
2661 ; SI-NEXT: s_mov_b32 s4, s6
2662 ; SI-NEXT: s_mov_b32 s5, s6
2663 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
2664 ; SI-NEXT: s_mov_b64 s[8:9], 0
2665 ; SI-NEXT: .LBB42_1: ; %atomicrmw.start
2666 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2667 ; SI-NEXT: s_waitcnt vmcnt(0)
2668 ; SI-NEXT: v_mov_b32_e32 v11, v1
2669 ; SI-NEXT: v_mov_b32_e32 v10, v0
2670 ; SI-NEXT: s_waitcnt expcnt(0)
2671 ; SI-NEXT: v_and_b32_e32 v0, v11, v4
2672 ; SI-NEXT: v_and_b32_e32 v1, v10, v5
2673 ; SI-NEXT: v_not_b32_e32 v9, v0
2674 ; SI-NEXT: v_not_b32_e32 v8, v1
2675 ; SI-NEXT: v_mov_b32_e32 v0, v8
2676 ; SI-NEXT: v_mov_b32_e32 v1, v9
2677 ; SI-NEXT: v_mov_b32_e32 v2, v10
2678 ; SI-NEXT: v_mov_b32_e32 v3, v11
2679 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2680 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
2681 ; SI-NEXT: s_waitcnt vmcnt(0)
2682 ; SI-NEXT: buffer_wbinvl1
2683 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
2684 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2685 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2686 ; SI-NEXT: s_cbranch_execnz .LBB42_1
2687 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2688 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2689 ; SI-NEXT: s_waitcnt expcnt(0)
2690 ; SI-NEXT: s_setpc_b64 s[30:31]
2692 ; VI-LABEL: global_atomic_nand_i64_ret:
2694 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2695 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
2696 ; VI-NEXT: s_mov_b64 s[4:5], 0
2697 ; VI-NEXT: .LBB42_1: ; %atomicrmw.start
2698 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2699 ; VI-NEXT: s_waitcnt vmcnt(0)
2700 ; VI-NEXT: v_mov_b32_e32 v7, v5
2701 ; VI-NEXT: v_mov_b32_e32 v6, v4
2702 ; VI-NEXT: v_and_b32_e32 v4, v7, v3
2703 ; VI-NEXT: v_and_b32_e32 v8, v6, v2
2704 ; VI-NEXT: v_not_b32_e32 v5, v4
2705 ; VI-NEXT: v_not_b32_e32 v4, v8
2706 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2707 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
2708 ; VI-NEXT: s_waitcnt vmcnt(0)
2709 ; VI-NEXT: buffer_wbinvl1_vol
2710 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2711 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2712 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2713 ; VI-NEXT: s_cbranch_execnz .LBB42_1
2714 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2715 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2716 ; VI-NEXT: v_mov_b32_e32 v0, v4
2717 ; VI-NEXT: v_mov_b32_e32 v1, v5
2718 ; VI-NEXT: s_setpc_b64 s[30:31]
2720 ; GFX9-LABEL: global_atomic_nand_i64_ret:
2722 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2723 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
2724 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2725 ; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
2726 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2728 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
2729 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2730 ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
2731 ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
2732 ; GFX9-NEXT: v_not_b32_e32 v5, v4
2733 ; GFX9-NEXT: v_not_b32_e32 v4, v8
2734 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2735 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
2736 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2737 ; GFX9-NEXT: buffer_wbinvl1_vol
2738 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2739 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2740 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2741 ; GFX9-NEXT: s_cbranch_execnz .LBB42_1
2742 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2743 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2744 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
2745 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
2746 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2747 %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
; Value-producing `atomicrmw nand` (seq_cst) on an i64 in addrspace(1),
; addressed at %out + 4 i64 elements; the function is declared to return i64.
; All checked targets expect a compare-and-swap loop (%atomicrmw.start) that
; computes ~(old & %in) with v_and_b32/v_not_b32 and retries
; *_atomic_cmpswap_x2 until no lane remains in exec. The 32-byte offset is an
; immediate on the load and cmpswap in the SI and GFX9 checks; the VI checks
; compute the address into v[4:5] with v_add_u32/v_addc_u32 before the loop.
2751 define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
2752 ; SI-LABEL: global_atomic_nand_i64_ret_offset:
2754 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2755 ; SI-NEXT: v_mov_b32_e32 v4, v3
2756 ; SI-NEXT: v_mov_b32_e32 v5, v2
2757 ; SI-NEXT: v_mov_b32_e32 v7, v1
2758 ; SI-NEXT: v_mov_b32_e32 v6, v0
2759 ; SI-NEXT: s_mov_b32 s6, 0
2760 ; SI-NEXT: s_mov_b32 s7, 0xf000
2761 ; SI-NEXT: s_mov_b32 s4, s6
2762 ; SI-NEXT: s_mov_b32 s5, s6
2763 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
2764 ; SI-NEXT: s_mov_b64 s[8:9], 0
2765 ; SI-NEXT: .LBB43_1: ; %atomicrmw.start
2766 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2767 ; SI-NEXT: s_waitcnt vmcnt(0)
2768 ; SI-NEXT: v_mov_b32_e32 v11, v1
2769 ; SI-NEXT: v_mov_b32_e32 v10, v0
2770 ; SI-NEXT: s_waitcnt expcnt(0)
2771 ; SI-NEXT: v_and_b32_e32 v0, v11, v4
2772 ; SI-NEXT: v_and_b32_e32 v1, v10, v5
2773 ; SI-NEXT: v_not_b32_e32 v9, v0
2774 ; SI-NEXT: v_not_b32_e32 v8, v1
2775 ; SI-NEXT: v_mov_b32_e32 v0, v8
2776 ; SI-NEXT: v_mov_b32_e32 v1, v9
2777 ; SI-NEXT: v_mov_b32_e32 v2, v10
2778 ; SI-NEXT: v_mov_b32_e32 v3, v11
2779 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2780 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
2781 ; SI-NEXT: s_waitcnt vmcnt(0)
2782 ; SI-NEXT: buffer_wbinvl1
2783 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
2784 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
2785 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
2786 ; SI-NEXT: s_cbranch_execnz .LBB43_1
2787 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2788 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
2789 ; SI-NEXT: s_waitcnt expcnt(0)
2790 ; SI-NEXT: s_setpc_b64 s[30:31]
2792 ; VI-LABEL: global_atomic_nand_i64_ret_offset:
2794 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2795 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
2796 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2797 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
2798 ; VI-NEXT: s_mov_b64 s[4:5], 0
2799 ; VI-NEXT: .LBB43_1: ; %atomicrmw.start
2800 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2801 ; VI-NEXT: s_waitcnt vmcnt(0)
2802 ; VI-NEXT: v_mov_b32_e32 v9, v1
2803 ; VI-NEXT: v_mov_b32_e32 v8, v0
2804 ; VI-NEXT: v_and_b32_e32 v0, v9, v3
2805 ; VI-NEXT: v_and_b32_e32 v1, v8, v2
2806 ; VI-NEXT: v_not_b32_e32 v7, v0
2807 ; VI-NEXT: v_not_b32_e32 v6, v1
2808 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2809 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
2810 ; VI-NEXT: s_waitcnt vmcnt(0)
2811 ; VI-NEXT: buffer_wbinvl1_vol
2812 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
2813 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2814 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
2815 ; VI-NEXT: s_cbranch_execnz .LBB43_1
2816 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2817 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
2818 ; VI-NEXT: s_setpc_b64 s[30:31]
2820 ; GFX9-LABEL: global_atomic_nand_i64_ret_offset:
2822 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2823 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
2824 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
2825 ; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
2826 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2827 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2828 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
2829 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2830 ; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
2831 ; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
2832 ; GFX9-NEXT: v_not_b32_e32 v5, v4
2833 ; GFX9-NEXT: v_not_b32_e32 v4, v8
2834 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2835 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
2836 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2837 ; GFX9-NEXT: buffer_wbinvl1_vol
2838 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
2839 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2840 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
2841 ; GFX9-NEXT: s_cbranch_execnz .LBB43_1
2842 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2843 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
2844 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
2845 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
2846 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2847 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2848 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
; `atomicrmw nand` (seq_cst) on an i64 in addrspace(1) in a void function using
; the amdgpu_gfx calling convention, with the pointer and the operand both
; passed `inreg`. All checked targets expect a compare-and-swap loop
; (%atomicrmw.start) that computes ~(old & %in) with v_and_b32/v_not_b32, taking
; %in straight from SGPRs, and retries *_atomic_cmpswap_x2 until no lane remains
; in exec. On SI, s6/s7 are copied to s34/s35 and also saved into lanes of v0
; (restored after the loop) because they are overwritten to complete the s[4:7]
; operand of the buffer instructions.
2852 define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
2853 ; SI-LABEL: global_atomic_nand_i64_noret_scalar:
2855 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2856 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2857 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2858 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2859 ; SI-NEXT: s_waitcnt expcnt(0)
2860 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2861 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2862 ; SI-NEXT: s_mov_b32 s35, s7
2863 ; SI-NEXT: s_mov_b32 s34, s6
2864 ; SI-NEXT: s_mov_b32 s7, 0xf000
2865 ; SI-NEXT: s_mov_b32 s6, -1
2866 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
2867 ; SI-NEXT: s_mov_b64 s[36:37], 0
2868 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start
2869 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2870 ; SI-NEXT: s_waitcnt vmcnt(0)
2871 ; SI-NEXT: v_and_b32_e32 v1, s35, v4
2872 ; SI-NEXT: s_waitcnt expcnt(0)
2873 ; SI-NEXT: v_and_b32_e32 v5, s34, v3
2874 ; SI-NEXT: v_not_b32_e32 v2, v1
2875 ; SI-NEXT: v_not_b32_e32 v1, v5
2876 ; SI-NEXT: v_mov_b32_e32 v8, v4
2877 ; SI-NEXT: v_mov_b32_e32 v7, v3
2878 ; SI-NEXT: v_mov_b32_e32 v6, v2
2879 ; SI-NEXT: v_mov_b32_e32 v5, v1
2880 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2881 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
2882 ; SI-NEXT: s_waitcnt vmcnt(0)
2883 ; SI-NEXT: buffer_wbinvl1
2884 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
2885 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2886 ; SI-NEXT: v_mov_b32_e32 v3, v5
2887 ; SI-NEXT: v_mov_b32_e32 v4, v6
2888 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2889 ; SI-NEXT: s_cbranch_execnz .LBB44_1
2890 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2891 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
2892 ; SI-NEXT: v_readlane_b32 s7, v0, 1
2893 ; SI-NEXT: v_readlane_b32 s6, v0, 0
2894 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2895 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
2896 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2897 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
2898 ; SI-NEXT: s_setpc_b64 s[30:31]
2900 ; VI-LABEL: global_atomic_nand_i64_noret_scalar:
2902 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2903 ; VI-NEXT: v_mov_b32_e32 v0, s4
2904 ; VI-NEXT: v_mov_b32_e32 v1, s5
2905 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
2906 ; VI-NEXT: s_mov_b64 s[34:35], 0
2907 ; VI-NEXT: .LBB44_1: ; %atomicrmw.start
2908 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
2909 ; VI-NEXT: s_waitcnt vmcnt(0)
2910 ; VI-NEXT: v_and_b32_e32 v0, s7, v3
2911 ; VI-NEXT: v_and_b32_e32 v6, s6, v2
2912 ; VI-NEXT: v_mov_b32_e32 v4, s4
2913 ; VI-NEXT: v_mov_b32_e32 v5, s5
2914 ; VI-NEXT: v_not_b32_e32 v1, v0
2915 ; VI-NEXT: v_not_b32_e32 v0, v6
2916 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2917 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2918 ; VI-NEXT: s_waitcnt vmcnt(0)
2919 ; VI-NEXT: buffer_wbinvl1_vol
2920 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2921 ; VI-NEXT: v_mov_b32_e32 v3, v1
2922 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2923 ; VI-NEXT: v_mov_b32_e32 v2, v0
2924 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
2925 ; VI-NEXT: s_cbranch_execnz .LBB44_1
2926 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
2927 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
2928 ; VI-NEXT: s_setpc_b64 s[30:31]
2930 ; GFX9-LABEL: global_atomic_nand_i64_noret_scalar:
2932 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2933 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2934 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
2935 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
2936 ; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
2937 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2938 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2939 ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
2940 ; GFX9-NEXT: v_and_b32_e32 v5, s6, v2
2941 ; GFX9-NEXT: v_not_b32_e32 v1, v0
2942 ; GFX9-NEXT: v_not_b32_e32 v0, v5
2943 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2944 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
2945 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2946 ; GFX9-NEXT: buffer_wbinvl1_vol
2947 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2948 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
2949 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2950 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
2951 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
2952 ; GFX9-NEXT: s_cbranch_execnz .LBB44_1
2953 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2954 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
2955 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2956 %tmp0 = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
; `atomicrmw nand` (seq_cst) on an i64 in addrspace(1), addressed at
; %out + 4 i64 elements, in a void function using the amdgpu_gfx calling
; convention with the pointer and the operand both passed `inreg`. All checked
; targets expect a compare-and-swap loop (%atomicrmw.start) that computes
; ~(old & %in) with v_and_b32/v_not_b32 and retries *_atomic_cmpswap_x2 until no
; lane remains in exec. The 32-byte offset is an immediate on the load and
; cmpswap in the SI and GFX9 checks; the VI checks compute the address into
; s[34:35] with s_add_u32/s_addc_u32. On SI, s6/s7 are saved into lanes of v0
; and restored after the loop because they are overwritten to complete the
; s[4:7] operand of the buffer instructions.
2960 define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
2961 ; SI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
2963 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2964 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
2965 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
2966 ; SI-NEXT: s_mov_b64 exec, s[34:35]
2967 ; SI-NEXT: s_waitcnt expcnt(0)
2968 ; SI-NEXT: v_writelane_b32 v0, s6, 0
2969 ; SI-NEXT: v_writelane_b32 v0, s7, 1
2970 ; SI-NEXT: s_mov_b32 s35, s7
2971 ; SI-NEXT: s_mov_b32 s34, s6
2972 ; SI-NEXT: s_mov_b32 s7, 0xf000
2973 ; SI-NEXT: s_mov_b32 s6, -1
2974 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
2975 ; SI-NEXT: s_mov_b64 s[36:37], 0
2976 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start
2977 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
2978 ; SI-NEXT: s_waitcnt vmcnt(0)
2979 ; SI-NEXT: v_and_b32_e32 v1, s35, v4
2980 ; SI-NEXT: s_waitcnt expcnt(0)
2981 ; SI-NEXT: v_and_b32_e32 v5, s34, v3
2982 ; SI-NEXT: v_not_b32_e32 v2, v1
2983 ; SI-NEXT: v_not_b32_e32 v1, v5
2984 ; SI-NEXT: v_mov_b32_e32 v8, v4
2985 ; SI-NEXT: v_mov_b32_e32 v7, v3
2986 ; SI-NEXT: v_mov_b32_e32 v6, v2
2987 ; SI-NEXT: v_mov_b32_e32 v5, v1
2988 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2989 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
2990 ; SI-NEXT: s_waitcnt vmcnt(0)
2991 ; SI-NEXT: buffer_wbinvl1
2992 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
2993 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2994 ; SI-NEXT: v_mov_b32_e32 v3, v5
2995 ; SI-NEXT: v_mov_b32_e32 v4, v6
2996 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
2997 ; SI-NEXT: s_cbranch_execnz .LBB45_1
2998 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
2999 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
3000 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3001 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3002 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3003 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3004 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3005 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3006 ; SI-NEXT: s_setpc_b64 s[30:31]
3008 ; VI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
3010 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3011 ; VI-NEXT: s_add_u32 s34, s4, 32
3012 ; VI-NEXT: s_addc_u32 s35, s5, 0
3013 ; VI-NEXT: v_mov_b32_e32 v0, s34
3014 ; VI-NEXT: v_mov_b32_e32 v1, s35
3015 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
3016 ; VI-NEXT: s_mov_b64 s[36:37], 0
3017 ; VI-NEXT: .LBB45_1: ; %atomicrmw.start
3018 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3019 ; VI-NEXT: s_waitcnt vmcnt(0)
3020 ; VI-NEXT: v_and_b32_e32 v0, s7, v3
3021 ; VI-NEXT: v_and_b32_e32 v6, s6, v2
3022 ; VI-NEXT: v_mov_b32_e32 v4, s34
3023 ; VI-NEXT: v_mov_b32_e32 v5, s35
3024 ; VI-NEXT: v_not_b32_e32 v1, v0
3025 ; VI-NEXT: v_not_b32_e32 v0, v6
3026 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3027 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3028 ; VI-NEXT: s_waitcnt vmcnt(0)
3029 ; VI-NEXT: buffer_wbinvl1_vol
3030 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3031 ; VI-NEXT: v_mov_b32_e32 v3, v1
3032 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3033 ; VI-NEXT: v_mov_b32_e32 v2, v0
3034 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3035 ; VI-NEXT: s_cbranch_execnz .LBB45_1
3036 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3037 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
3038 ; VI-NEXT: s_setpc_b64 s[30:31]
3040 ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar:
3042 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3043 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3044 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
3045 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
3046 ; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
3047 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3048 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3049 ; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
3050 ; GFX9-NEXT: v_and_b32_e32 v5, s6, v2
3051 ; GFX9-NEXT: v_not_b32_e32 v1, v0
3052 ; GFX9-NEXT: v_not_b32_e32 v0, v5
3053 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3054 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
3055 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3056 ; GFX9-NEXT: buffer_wbinvl1_vol
3057 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3058 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
3059 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3060 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
3061 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
3062 ; GFX9-NEXT: s_cbranch_execnz .LBB45_1
3063 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3064 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
3065 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3066 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3067 %tmp0 = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
; Value-producing `atomicrmw nand` (seq_cst) on an i64 in addrspace(1) in a
; function declared to return i64 under the amdgpu_gfx calling convention, with
; the pointer and the operand both passed `inreg`. All checked targets expect a
; compare-and-swap loop (%atomicrmw.start) that computes ~(old & %in) with
; v_and_b32/v_not_b32 and retries *_atomic_cmpswap_x2 until no lane remains in
; exec. In the VI and GFX9 checks the cmpswap writes its result directly to
; v[0:1]; in the SI checks the result lands in v[3:4] and is copied to v[0:1]
; after the loop. On SI, s6/s7 are saved into lanes of v2 and restored after the
; loop because they are overwritten to complete the s[4:7] operand of the
; buffer instructions.
3071 define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3072 ; SI-LABEL: global_atomic_nand_i64_ret_scalar:
3074 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3075 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3076 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3077 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3078 ; SI-NEXT: s_waitcnt expcnt(0)
3079 ; SI-NEXT: v_writelane_b32 v2, s6, 0
3080 ; SI-NEXT: v_writelane_b32 v2, s7, 1
3081 ; SI-NEXT: s_mov_b32 s35, s7
3082 ; SI-NEXT: s_mov_b32 s34, s6
3083 ; SI-NEXT: s_mov_b32 s7, 0xf000
3084 ; SI-NEXT: s_mov_b32 s6, -1
3085 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
3086 ; SI-NEXT: s_mov_b64 s[36:37], 0
3087 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start
3088 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3089 ; SI-NEXT: s_waitcnt vmcnt(0)
3090 ; SI-NEXT: v_mov_b32_e32 v8, v4
3091 ; SI-NEXT: v_mov_b32_e32 v7, v3
3092 ; SI-NEXT: v_and_b32_e32 v0, s35, v8
3093 ; SI-NEXT: v_and_b32_e32 v1, s34, v7
3094 ; SI-NEXT: s_waitcnt expcnt(0)
3095 ; SI-NEXT: v_not_b32_e32 v6, v0
3096 ; SI-NEXT: v_not_b32_e32 v5, v1
3097 ; SI-NEXT: v_mov_b32_e32 v3, v5
3098 ; SI-NEXT: v_mov_b32_e32 v4, v6
3099 ; SI-NEXT: v_mov_b32_e32 v5, v7
3100 ; SI-NEXT: v_mov_b32_e32 v6, v8
3101 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3102 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
3103 ; SI-NEXT: s_waitcnt vmcnt(0)
3104 ; SI-NEXT: buffer_wbinvl1
3105 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
3106 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3107 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3108 ; SI-NEXT: s_cbranch_execnz .LBB46_1
3109 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3110 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
3111 ; SI-NEXT: v_mov_b32_e32 v0, v3
3112 ; SI-NEXT: v_mov_b32_e32 v1, v4
3113 ; SI-NEXT: v_readlane_b32 s7, v2, 1
3114 ; SI-NEXT: v_readlane_b32 s6, v2, 0
3115 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3116 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3117 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3118 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3119 ; SI-NEXT: s_setpc_b64 s[30:31]
3121 ; VI-LABEL: global_atomic_nand_i64_ret_scalar:
3123 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3124 ; VI-NEXT: v_mov_b32_e32 v0, s4
3125 ; VI-NEXT: v_mov_b32_e32 v1, s5
3126 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
3127 ; VI-NEXT: s_mov_b64 s[34:35], 0
3128 ; VI-NEXT: .LBB46_1: ; %atomicrmw.start
3129 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3130 ; VI-NEXT: s_waitcnt vmcnt(0)
3131 ; VI-NEXT: v_mov_b32_e32 v3, v1
3132 ; VI-NEXT: v_mov_b32_e32 v2, v0
3133 ; VI-NEXT: v_mov_b32_e32 v4, s4
3134 ; VI-NEXT: v_and_b32_e32 v0, s7, v3
3135 ; VI-NEXT: v_and_b32_e32 v6, s6, v2
3136 ; VI-NEXT: v_mov_b32_e32 v5, s5
3137 ; VI-NEXT: v_not_b32_e32 v1, v0
3138 ; VI-NEXT: v_not_b32_e32 v0, v6
3139 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3140 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3141 ; VI-NEXT: s_waitcnt vmcnt(0)
3142 ; VI-NEXT: buffer_wbinvl1_vol
3143 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3144 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3145 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
3146 ; VI-NEXT: s_cbranch_execnz .LBB46_1
3147 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3148 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
3149 ; VI-NEXT: s_setpc_b64 s[30:31]
3151 ; GFX9-LABEL: global_atomic_nand_i64_ret_scalar:
3153 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3154 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3155 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
3156 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
3157 ; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
3158 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3159 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3160 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
3161 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
3162 ; GFX9-NEXT: v_and_b32_e32 v0, s7, v6
3163 ; GFX9-NEXT: v_and_b32_e32 v1, s6, v5
3164 ; GFX9-NEXT: v_not_b32_e32 v4, v0
3165 ; GFX9-NEXT: v_not_b32_e32 v3, v1
3166 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3167 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
3168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3169 ; GFX9-NEXT: buffer_wbinvl1_vol
3170 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
3171 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3172 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
3173 ; GFX9-NEXT: s_cbranch_execnz .LBB46_1
3174 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3175 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
3176 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3177 %result = atomicrmw nand ptr addrspace(1) %ptr, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw nand` on a global (addrspace 1) pointer,
; with both the pointer and the operand passed `inreg` under the amdgpu_gfx
; calling convention. The access is at %out + 4 x i64, which the expected code
; addresses as a 32-byte offset (offset:32, or s_add_u32 ..., 32 for tonga).
; Every expected sequence is a retry loop (%atomicrmw.start): it ANDs the
; previously loaded value with the operand, inverts it (v_and / v_not), and
; commits it with a *_atomic_cmpswap_x2, looping until the value returned by
; the cmpswap equals the value that was expected.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3181 define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3182 ; SI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
3184 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3185 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3186 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3187 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3188 ; SI-NEXT: s_waitcnt expcnt(0)
3189 ; SI-NEXT: v_writelane_b32 v2, s6, 0
3190 ; SI-NEXT: v_writelane_b32 v2, s7, 1
3191 ; SI-NEXT: s_mov_b32 s35, s7
3192 ; SI-NEXT: s_mov_b32 s34, s6
3193 ; SI-NEXT: s_mov_b32 s7, 0xf000
3194 ; SI-NEXT: s_mov_b32 s6, -1
3195 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
3196 ; SI-NEXT: s_mov_b64 s[36:37], 0
3197 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start
3198 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
3199 ; SI-NEXT: s_waitcnt vmcnt(0)
3200 ; SI-NEXT: v_mov_b32_e32 v8, v4
3201 ; SI-NEXT: v_mov_b32_e32 v7, v3
3202 ; SI-NEXT: v_and_b32_e32 v0, s35, v8
3203 ; SI-NEXT: v_and_b32_e32 v1, s34, v7
3204 ; SI-NEXT: s_waitcnt expcnt(0)
3205 ; SI-NEXT: v_not_b32_e32 v6, v0
3206 ; SI-NEXT: v_not_b32_e32 v5, v1
3207 ; SI-NEXT: v_mov_b32_e32 v3, v5
3208 ; SI-NEXT: v_mov_b32_e32 v4, v6
3209 ; SI-NEXT: v_mov_b32_e32 v5, v7
3210 ; SI-NEXT: v_mov_b32_e32 v6, v8
3211 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3212 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
3213 ; SI-NEXT: s_waitcnt vmcnt(0)
3214 ; SI-NEXT: buffer_wbinvl1
3215 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
3216 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3217 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3218 ; SI-NEXT: s_cbranch_execnz .LBB47_1
3219 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
3220 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
3221 ; SI-NEXT: v_mov_b32_e32 v0, v3
3222 ; SI-NEXT: v_mov_b32_e32 v1, v4
3223 ; SI-NEXT: v_readlane_b32 s7, v2, 1
3224 ; SI-NEXT: v_readlane_b32 s6, v2, 0
3225 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3226 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3227 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3228 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3229 ; SI-NEXT: s_setpc_b64 s[30:31]
3231 ; VI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
3233 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3234 ; VI-NEXT: s_add_u32 s34, s4, 32
3235 ; VI-NEXT: s_addc_u32 s35, s5, 0
3236 ; VI-NEXT: v_mov_b32_e32 v0, s34
3237 ; VI-NEXT: v_mov_b32_e32 v1, s35
3238 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
3239 ; VI-NEXT: s_mov_b64 s[36:37], 0
3240 ; VI-NEXT: .LBB47_1: ; %atomicrmw.start
3241 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
3242 ; VI-NEXT: s_waitcnt vmcnt(0)
3243 ; VI-NEXT: v_mov_b32_e32 v3, v1
3244 ; VI-NEXT: v_mov_b32_e32 v2, v0
3245 ; VI-NEXT: v_mov_b32_e32 v4, s34
3246 ; VI-NEXT: v_and_b32_e32 v0, s7, v3
3247 ; VI-NEXT: v_and_b32_e32 v6, s6, v2
3248 ; VI-NEXT: v_mov_b32_e32 v5, s35
3249 ; VI-NEXT: v_not_b32_e32 v1, v0
3250 ; VI-NEXT: v_not_b32_e32 v0, v6
3251 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3252 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3253 ; VI-NEXT: s_waitcnt vmcnt(0)
3254 ; VI-NEXT: buffer_wbinvl1_vol
3255 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3256 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3257 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
3258 ; VI-NEXT: s_cbranch_execnz .LBB47_1
3259 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
3260 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
3261 ; VI-NEXT: s_setpc_b64 s[30:31]
3263 ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar:
3265 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3266 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3267 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
3268 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
3269 ; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
3270 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3271 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3272 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
3273 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
3274 ; GFX9-NEXT: v_and_b32_e32 v0, s7, v6
3275 ; GFX9-NEXT: v_and_b32_e32 v1, s6, v5
3276 ; GFX9-NEXT: v_not_b32_e32 v4, v0
3277 ; GFX9-NEXT: v_not_b32_e32 v3, v1
3278 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3279 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
3280 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3281 ; GFX9-NEXT: buffer_wbinvl1_vol
3282 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
3283 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3284 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
3285 ; GFX9-NEXT: s_cbranch_execnz .LBB47_1
3286 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3287 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
3288 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3289 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3290 %result = atomicrmw nand ptr addrspace(1) %gep, i64 %in seq_cst
3294 ; ---------------------------------------------------------------------
3296 ; ---------------------------------------------------------------------
; seq_cst i64 `atomicrmw or` directly on a global (addrspace 1) pointer, in a
; function that returns void. Each target is expected to select a single
; 64-bit OR atomic without the glc modifier: buffer_atomic_or_x2 (addr64) for
; the default amdgcn target, flat_atomic_or_x2 for tonga, and
; global_atomic_or_x2 for gfx900.
3298 define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
3299 ; SI-LABEL: global_atomic_or_i64_noret:
3301 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3302 ; SI-NEXT: s_mov_b32 s6, 0
3303 ; SI-NEXT: s_mov_b32 s7, 0xf000
3304 ; SI-NEXT: s_mov_b32 s4, s6
3305 ; SI-NEXT: s_mov_b32 s5, s6
3306 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3307 ; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64
3308 ; SI-NEXT: s_waitcnt vmcnt(0)
3309 ; SI-NEXT: buffer_wbinvl1
3310 ; SI-NEXT: s_waitcnt expcnt(0)
3311 ; SI-NEXT: s_setpc_b64 s[30:31]
3313 ; VI-LABEL: global_atomic_or_i64_noret:
3315 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3316 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
3317 ; VI-NEXT: s_waitcnt vmcnt(0)
3318 ; VI-NEXT: buffer_wbinvl1_vol
3319 ; VI-NEXT: s_setpc_b64 s[30:31]
3321 ; GFX9-LABEL: global_atomic_or_i64_noret:
3323 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3324 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off
3325 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3326 ; GFX9-NEXT: buffer_wbinvl1_vol
3327 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3328 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst i64 `atomicrmw or` at %out + 4 x i64 in global memory, in a function
; that returns void. The expected code folds the displacement as offset:32
; into the buffer atomic (default amdgcn target) and the global atomic
; (gfx900); the tonga checks instead expect an explicit 64-bit address add
; (v_add_u32 / v_addc_u32) ahead of the flat atomic.
3332 define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
3333 ; SI-LABEL: global_atomic_or_i64_noret_offset:
3335 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3336 ; SI-NEXT: s_mov_b32 s6, 0
3337 ; SI-NEXT: s_mov_b32 s7, 0xf000
3338 ; SI-NEXT: s_mov_b32 s4, s6
3339 ; SI-NEXT: s_mov_b32 s5, s6
3340 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3341 ; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3342 ; SI-NEXT: s_waitcnt vmcnt(0)
3343 ; SI-NEXT: buffer_wbinvl1
3344 ; SI-NEXT: s_waitcnt expcnt(0)
3345 ; SI-NEXT: s_setpc_b64 s[30:31]
3347 ; VI-LABEL: global_atomic_or_i64_noret_offset:
3349 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3350 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3351 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3352 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3353 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
3354 ; VI-NEXT: s_waitcnt vmcnt(0)
3355 ; VI-NEXT: buffer_wbinvl1_vol
3356 ; VI-NEXT: s_setpc_b64 s[30:31]
3358 ; GFX9-LABEL: global_atomic_or_i64_noret_offset:
3360 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3361 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32
3362 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3363 ; GFX9-NEXT: buffer_wbinvl1_vol
3364 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3365 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3366 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw or` directly on a global (addrspace 1)
; pointer. Every expected atomic carries the glc modifier. For the default
; amdgcn target the atomic operates on v[2:3], which is then copied to
; v[0:1]; the tonga and gfx900 atomics name v[0:1] as their destination
; directly.
3370 define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
3371 ; SI-LABEL: global_atomic_or_i64_ret:
3373 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3374 ; SI-NEXT: s_mov_b32 s6, 0
3375 ; SI-NEXT: s_mov_b32 s7, 0xf000
3376 ; SI-NEXT: s_mov_b32 s4, s6
3377 ; SI-NEXT: s_mov_b32 s5, s6
3378 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3379 ; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
3380 ; SI-NEXT: s_waitcnt vmcnt(0)
3381 ; SI-NEXT: buffer_wbinvl1
3382 ; SI-NEXT: v_mov_b32_e32 v0, v2
3383 ; SI-NEXT: v_mov_b32_e32 v1, v3
3384 ; SI-NEXT: s_waitcnt expcnt(0)
3385 ; SI-NEXT: s_setpc_b64 s[30:31]
3387 ; VI-LABEL: global_atomic_or_i64_ret:
3389 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
3391 ; VI-NEXT: s_waitcnt vmcnt(0)
3392 ; VI-NEXT: buffer_wbinvl1_vol
3393 ; VI-NEXT: s_setpc_b64 s[30:31]
3395 ; GFX9-LABEL: global_atomic_or_i64_ret:
3397 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3398 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc
3399 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3400 ; GFX9-NEXT: buffer_wbinvl1_vol
3401 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3402 %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw or` at %out + 4 x i64 in global memory.
; Combines the two variations covered separately elsewhere in this group:
; every expected atomic carries glc, and the 32-byte displacement is folded
; as offset:32 for the default amdgcn target and gfx900 but materialised
; with v_add_u32 / v_addc_u32 for tonga.
3406 define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
3407 ; SI-LABEL: global_atomic_or_i64_ret_offset:
3409 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3410 ; SI-NEXT: s_mov_b32 s6, 0
3411 ; SI-NEXT: s_mov_b32 s7, 0xf000
3412 ; SI-NEXT: s_mov_b32 s4, s6
3413 ; SI-NEXT: s_mov_b32 s5, s6
3414 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3415 ; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
3416 ; SI-NEXT: s_waitcnt vmcnt(0)
3417 ; SI-NEXT: buffer_wbinvl1
3418 ; SI-NEXT: v_mov_b32_e32 v0, v2
3419 ; SI-NEXT: v_mov_b32_e32 v1, v3
3420 ; SI-NEXT: s_waitcnt expcnt(0)
3421 ; SI-NEXT: s_setpc_b64 s[30:31]
3423 ; VI-LABEL: global_atomic_or_i64_ret_offset:
3425 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3426 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3427 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3428 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3429 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
3430 ; VI-NEXT: s_waitcnt vmcnt(0)
3431 ; VI-NEXT: buffer_wbinvl1_vol
3432 ; VI-NEXT: s_setpc_b64 s[30:31]
3434 ; GFX9-LABEL: global_atomic_or_i64_ret_offset:
3436 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3437 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
3438 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3439 ; GFX9-NEXT: buffer_wbinvl1_vol
3440 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3441 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3442 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
; seq_cst i64 `atomicrmw or` with the global pointer and the operand both
; passed `inreg` under the amdgpu_gfx calling convention; returns void.
; Expected code copies s6/s7 into VGPRs as the data operand and takes the
; address from s[4:5] (copied to v[2:3] for tonga, used as the scalar base
; with a zero VGPR offset for gfx900).
; For the default amdgcn target, s6/s7 are overwritten with -1 / 0xf000 to
; complete the s[4:7] operand of the buffer atomic, so the checks also expect
; their original values to be stashed in lanes of v0 (v_writelane) and
; restored afterwards (v_readlane), with v0 itself spilled to and reloaded
; from the stack.
3446 define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3447 ; SI-LABEL: global_atomic_or_i64_noret_scalar:
3449 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3450 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3451 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3452 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3453 ; SI-NEXT: s_waitcnt expcnt(0)
3454 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3455 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3456 ; SI-NEXT: s_mov_b32 s34, s7
3457 ; SI-NEXT: s_mov_b32 s35, s6
3458 ; SI-NEXT: s_mov_b32 s7, 0xf000
3459 ; SI-NEXT: s_mov_b32 s6, -1
3460 ; SI-NEXT: v_mov_b32_e32 v1, s35
3461 ; SI-NEXT: v_mov_b32_e32 v2, s34
3462 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3463 ; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0
3464 ; SI-NEXT: s_waitcnt vmcnt(0)
3465 ; SI-NEXT: buffer_wbinvl1
3466 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3467 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3468 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3469 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3470 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3471 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3472 ; SI-NEXT: s_setpc_b64 s[30:31]
3474 ; VI-LABEL: global_atomic_or_i64_noret_scalar:
3476 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3477 ; VI-NEXT: v_mov_b32_e32 v0, s6
3478 ; VI-NEXT: v_mov_b32_e32 v1, s7
3479 ; VI-NEXT: v_mov_b32_e32 v2, s4
3480 ; VI-NEXT: v_mov_b32_e32 v3, s5
3481 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3482 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
3483 ; VI-NEXT: s_waitcnt vmcnt(0)
3484 ; VI-NEXT: buffer_wbinvl1_vol
3485 ; VI-NEXT: s_setpc_b64 s[30:31]
3487 ; GFX9-LABEL: global_atomic_or_i64_noret_scalar:
3489 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3490 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3491 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3492 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3493 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3494 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5]
3495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3496 ; GFX9-NEXT: buffer_wbinvl1_vol
3497 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3498 %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst i64 `atomicrmw or` at %out + 4 x i64, with the global pointer and
; the operand both passed `inreg` under the amdgpu_gfx calling convention;
; returns void.
; The 32-byte displacement is expected as offset:32 on the buffer atomic
; (default amdgcn target) and on the scalar-base global atomic (gfx900); the
; tonga checks expect it to be added to s[4:5] with s_add_u32 / s_addc_u32
; before the address is copied to VGPRs.
; For the default amdgcn target the checks also expect s6/s7 to be saved in
; lanes of v0 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic.
3502 define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3503 ; SI-LABEL: global_atomic_or_i64_noret_offset_scalar:
3505 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3506 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3507 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3508 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3509 ; SI-NEXT: s_waitcnt expcnt(0)
3510 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3511 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3512 ; SI-NEXT: v_mov_b32_e32 v1, s6
3513 ; SI-NEXT: v_mov_b32_e32 v2, s7
3514 ; SI-NEXT: s_mov_b32 s7, 0xf000
3515 ; SI-NEXT: s_mov_b32 s6, -1
3516 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3517 ; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32
3518 ; SI-NEXT: s_waitcnt vmcnt(0)
3519 ; SI-NEXT: buffer_wbinvl1
3520 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3521 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3522 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3523 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3524 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3525 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3526 ; SI-NEXT: s_setpc_b64 s[30:31]
3528 ; VI-LABEL: global_atomic_or_i64_noret_offset_scalar:
3530 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3531 ; VI-NEXT: s_add_u32 s34, s4, 32
3532 ; VI-NEXT: s_addc_u32 s35, s5, 0
3533 ; VI-NEXT: v_mov_b32_e32 v2, s34
3534 ; VI-NEXT: v_mov_b32_e32 v0, s6
3535 ; VI-NEXT: v_mov_b32_e32 v1, s7
3536 ; VI-NEXT: v_mov_b32_e32 v3, s35
3537 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3538 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
3539 ; VI-NEXT: s_waitcnt vmcnt(0)
3540 ; VI-NEXT: buffer_wbinvl1_vol
3541 ; VI-NEXT: s_setpc_b64 s[30:31]
3543 ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar:
3545 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3546 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3547 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3548 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3549 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3550 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
3551 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3552 ; GFX9-NEXT: buffer_wbinvl1_vol
3553 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3554 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3555 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw or` with the global pointer and the
; operand both passed `inreg` under the amdgpu_gfx calling convention.
; Every expected atomic carries glc and uses v[0:1] for the data, which the
; checks populate from s6/s7 beforehand; no extra copy follows the atomic.
; For the default amdgcn target the checks expect s6/s7 to be saved in lanes
; of v2 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic; v2 itself
; is spilled to and reloaded from the stack.
3559 define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3560 ; SI-LABEL: global_atomic_or_i64_ret_scalar:
3562 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3563 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3564 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3565 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3566 ; SI-NEXT: s_waitcnt expcnt(0)
3567 ; SI-NEXT: v_writelane_b32 v2, s6, 0
3568 ; SI-NEXT: v_writelane_b32 v2, s7, 1
3569 ; SI-NEXT: s_mov_b32 s34, s7
3570 ; SI-NEXT: s_mov_b32 s35, s6
3571 ; SI-NEXT: s_mov_b32 s7, 0xf000
3572 ; SI-NEXT: s_mov_b32 s6, -1
3573 ; SI-NEXT: v_mov_b32_e32 v0, s35
3574 ; SI-NEXT: v_mov_b32_e32 v1, s34
3575 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3576 ; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc
3577 ; SI-NEXT: s_waitcnt vmcnt(0)
3578 ; SI-NEXT: buffer_wbinvl1
3579 ; SI-NEXT: v_readlane_b32 s7, v2, 1
3580 ; SI-NEXT: v_readlane_b32 s6, v2, 0
3581 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3582 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3583 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3584 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3585 ; SI-NEXT: s_setpc_b64 s[30:31]
3587 ; VI-LABEL: global_atomic_or_i64_ret_scalar:
3589 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3590 ; VI-NEXT: v_mov_b32_e32 v0, s6
3591 ; VI-NEXT: v_mov_b32_e32 v1, s7
3592 ; VI-NEXT: v_mov_b32_e32 v2, s4
3593 ; VI-NEXT: v_mov_b32_e32 v3, s5
3594 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3595 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3596 ; VI-NEXT: s_waitcnt vmcnt(0)
3597 ; VI-NEXT: buffer_wbinvl1_vol
3598 ; VI-NEXT: s_setpc_b64 s[30:31]
3600 ; GFX9-LABEL: global_atomic_or_i64_ret_scalar:
3602 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3603 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3604 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3605 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3606 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3607 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc
3608 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3609 ; GFX9-NEXT: buffer_wbinvl1_vol
3610 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3611 %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw or` at %out + 4 x i64, with the global
; pointer and the operand both passed `inreg` under the amdgpu_gfx calling
; convention.
; Every expected atomic carries glc and uses v[0:1] for the data. The 32-byte
; displacement is expected as offset:32 for the default amdgcn target and
; gfx900, and as an s_add_u32 / s_addc_u32 on s[4:5] for tonga.
; For the default amdgcn target the checks expect s6/s7 to be saved in lanes
; of v2 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic.
3615 define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3616 ; SI-LABEL: global_atomic_or_i64_ret_offset_scalar:
3618 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3619 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3620 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3621 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3622 ; SI-NEXT: s_waitcnt expcnt(0)
3623 ; SI-NEXT: v_writelane_b32 v2, s6, 0
3624 ; SI-NEXT: v_writelane_b32 v2, s7, 1
3625 ; SI-NEXT: v_mov_b32_e32 v0, s6
3626 ; SI-NEXT: v_mov_b32_e32 v1, s7
3627 ; SI-NEXT: s_mov_b32 s7, 0xf000
3628 ; SI-NEXT: s_mov_b32 s6, -1
3629 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3630 ; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3631 ; SI-NEXT: s_waitcnt vmcnt(0)
3632 ; SI-NEXT: buffer_wbinvl1
3633 ; SI-NEXT: v_readlane_b32 s7, v2, 1
3634 ; SI-NEXT: v_readlane_b32 s6, v2, 0
3635 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3636 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3637 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3638 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3639 ; SI-NEXT: s_setpc_b64 s[30:31]
3641 ; VI-LABEL: global_atomic_or_i64_ret_offset_scalar:
3643 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3644 ; VI-NEXT: s_add_u32 s34, s4, 32
3645 ; VI-NEXT: s_addc_u32 s35, s5, 0
3646 ; VI-NEXT: v_mov_b32_e32 v2, s34
3647 ; VI-NEXT: v_mov_b32_e32 v0, s6
3648 ; VI-NEXT: v_mov_b32_e32 v1, s7
3649 ; VI-NEXT: v_mov_b32_e32 v3, s35
3650 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3651 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3652 ; VI-NEXT: s_waitcnt vmcnt(0)
3653 ; VI-NEXT: buffer_wbinvl1_vol
3654 ; VI-NEXT: s_setpc_b64 s[30:31]
3656 ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar:
3658 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3659 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3660 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3661 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3662 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3663 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
3664 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3665 ; GFX9-NEXT: buffer_wbinvl1_vol
3666 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3667 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3668 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
3672 ; ---------------------------------------------------------------------
3674 ; ---------------------------------------------------------------------
; seq_cst i64 `atomicrmw xor` directly on a global (addrspace 1) pointer, in
; a function that returns void. Each target is expected to select a single
; 64-bit XOR atomic without the glc modifier: buffer_atomic_xor_x2 (addr64)
; for the default amdgcn target, flat_atomic_xor_x2 for tonga, and
; global_atomic_xor_x2 for gfx900.
3676 define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
3677 ; SI-LABEL: global_atomic_xor_i64_noret:
3679 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3680 ; SI-NEXT: s_mov_b32 s6, 0
3681 ; SI-NEXT: s_mov_b32 s7, 0xf000
3682 ; SI-NEXT: s_mov_b32 s4, s6
3683 ; SI-NEXT: s_mov_b32 s5, s6
3684 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3685 ; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64
3686 ; SI-NEXT: s_waitcnt vmcnt(0)
3687 ; SI-NEXT: buffer_wbinvl1
3688 ; SI-NEXT: s_waitcnt expcnt(0)
3689 ; SI-NEXT: s_setpc_b64 s[30:31]
3691 ; VI-LABEL: global_atomic_xor_i64_noret:
3693 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3694 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
3695 ; VI-NEXT: s_waitcnt vmcnt(0)
3696 ; VI-NEXT: buffer_wbinvl1_vol
3697 ; VI-NEXT: s_setpc_b64 s[30:31]
3699 ; GFX9-LABEL: global_atomic_xor_i64_noret:
3701 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3702 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off
3703 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3704 ; GFX9-NEXT: buffer_wbinvl1_vol
3705 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3706 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst i64 `atomicrmw xor` at %out + 4 x i64 in global memory, in a
; function that returns void. The expected code folds the displacement as
; offset:32 into the buffer atomic (default amdgcn target) and the global
; atomic (gfx900); the tonga checks instead expect an explicit 64-bit address
; add (v_add_u32 / v_addc_u32) ahead of the flat atomic.
3710 define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
3711 ; SI-LABEL: global_atomic_xor_i64_noret_offset:
3713 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3714 ; SI-NEXT: s_mov_b32 s6, 0
3715 ; SI-NEXT: s_mov_b32 s7, 0xf000
3716 ; SI-NEXT: s_mov_b32 s4, s6
3717 ; SI-NEXT: s_mov_b32 s5, s6
3718 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3719 ; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
3720 ; SI-NEXT: s_waitcnt vmcnt(0)
3721 ; SI-NEXT: buffer_wbinvl1
3722 ; SI-NEXT: s_waitcnt expcnt(0)
3723 ; SI-NEXT: s_setpc_b64 s[30:31]
3725 ; VI-LABEL: global_atomic_xor_i64_noret_offset:
3727 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3728 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3729 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3730 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3731 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
3732 ; VI-NEXT: s_waitcnt vmcnt(0)
3733 ; VI-NEXT: buffer_wbinvl1_vol
3734 ; VI-NEXT: s_setpc_b64 s[30:31]
3736 ; GFX9-LABEL: global_atomic_xor_i64_noret_offset:
3738 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3739 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
3740 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3741 ; GFX9-NEXT: buffer_wbinvl1_vol
3742 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3743 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3744 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw xor` directly on a global (addrspace 1)
; pointer. Every expected atomic carries the glc modifier. For the default
; amdgcn target the atomic operates on v[2:3], which is then copied to
; v[0:1]; the tonga and gfx900 atomics name v[0:1] as their destination
; directly.
3748 define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
3749 ; SI-LABEL: global_atomic_xor_i64_ret:
3751 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3752 ; SI-NEXT: s_mov_b32 s6, 0
3753 ; SI-NEXT: s_mov_b32 s7, 0xf000
3754 ; SI-NEXT: s_mov_b32 s4, s6
3755 ; SI-NEXT: s_mov_b32 s5, s6
3756 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3757 ; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
3758 ; SI-NEXT: s_waitcnt vmcnt(0)
3759 ; SI-NEXT: buffer_wbinvl1
3760 ; SI-NEXT: v_mov_b32_e32 v0, v2
3761 ; SI-NEXT: v_mov_b32_e32 v1, v3
3762 ; SI-NEXT: s_waitcnt expcnt(0)
3763 ; SI-NEXT: s_setpc_b64 s[30:31]
3765 ; VI-LABEL: global_atomic_xor_i64_ret:
3767 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3768 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
3769 ; VI-NEXT: s_waitcnt vmcnt(0)
3770 ; VI-NEXT: buffer_wbinvl1_vol
3771 ; VI-NEXT: s_setpc_b64 s[30:31]
3773 ; GFX9-LABEL: global_atomic_xor_i64_ret:
3775 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3776 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc
3777 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3778 ; GFX9-NEXT: buffer_wbinvl1_vol
3779 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3780 %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw xor` at %out + 4 x i64 in global memory.
; Every expected atomic carries glc, and the 32-byte displacement is folded
; as offset:32 for the default amdgcn target and gfx900 but materialised
; with v_add_u32 / v_addc_u32 for tonga. For the default amdgcn target the
; atomic operates on v[2:3], which is then copied to v[0:1].
3784 define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
3785 ; SI-LABEL: global_atomic_xor_i64_ret_offset:
3787 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3788 ; SI-NEXT: s_mov_b32 s6, 0
3789 ; SI-NEXT: s_mov_b32 s7, 0xf000
3790 ; SI-NEXT: s_mov_b32 s4, s6
3791 ; SI-NEXT: s_mov_b32 s5, s6
3792 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3793 ; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
3794 ; SI-NEXT: s_waitcnt vmcnt(0)
3795 ; SI-NEXT: buffer_wbinvl1
3796 ; SI-NEXT: v_mov_b32_e32 v0, v2
3797 ; SI-NEXT: v_mov_b32_e32 v1, v3
3798 ; SI-NEXT: s_waitcnt expcnt(0)
3799 ; SI-NEXT: s_setpc_b64 s[30:31]
3801 ; VI-LABEL: global_atomic_xor_i64_ret_offset:
3803 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3804 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3805 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3806 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3807 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
3808 ; VI-NEXT: s_waitcnt vmcnt(0)
3809 ; VI-NEXT: buffer_wbinvl1_vol
3810 ; VI-NEXT: s_setpc_b64 s[30:31]
3812 ; GFX9-LABEL: global_atomic_xor_i64_ret_offset:
3814 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3815 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
3816 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3817 ; GFX9-NEXT: buffer_wbinvl1_vol
3818 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3819 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3820 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
; seq_cst i64 `atomicrmw xor` with the global pointer and the operand both
; passed `inreg` under the amdgpu_gfx calling convention; returns void.
; Expected code copies s6/s7 into VGPRs as the data operand and takes the
; address from s[4:5] (copied to v[2:3] for tonga, used as the scalar base
; with a zero VGPR offset for gfx900).
; For the default amdgcn target, s6/s7 are overwritten with -1 / 0xf000 to
; complete the s[4:7] operand of the buffer atomic, so the checks also expect
; their original values to be stashed in lanes of v0 (v_writelane) and
; restored afterwards (v_readlane), with v0 itself spilled to and reloaded
; from the stack.
3824 define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3825 ; SI-LABEL: global_atomic_xor_i64_noret_scalar:
3827 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3828 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3829 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3830 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3831 ; SI-NEXT: s_waitcnt expcnt(0)
3832 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3833 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3834 ; SI-NEXT: s_mov_b32 s34, s7
3835 ; SI-NEXT: s_mov_b32 s35, s6
3836 ; SI-NEXT: s_mov_b32 s7, 0xf000
3837 ; SI-NEXT: s_mov_b32 s6, -1
3838 ; SI-NEXT: v_mov_b32_e32 v1, s35
3839 ; SI-NEXT: v_mov_b32_e32 v2, s34
3840 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3841 ; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0
3842 ; SI-NEXT: s_waitcnt vmcnt(0)
3843 ; SI-NEXT: buffer_wbinvl1
3844 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3845 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3846 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3847 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3848 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3849 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3850 ; SI-NEXT: s_setpc_b64 s[30:31]
3852 ; VI-LABEL: global_atomic_xor_i64_noret_scalar:
3854 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3855 ; VI-NEXT: v_mov_b32_e32 v0, s6
3856 ; VI-NEXT: v_mov_b32_e32 v1, s7
3857 ; VI-NEXT: v_mov_b32_e32 v2, s4
3858 ; VI-NEXT: v_mov_b32_e32 v3, s5
3859 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3860 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
3861 ; VI-NEXT: s_waitcnt vmcnt(0)
3862 ; VI-NEXT: buffer_wbinvl1_vol
3863 ; VI-NEXT: s_setpc_b64 s[30:31]
3865 ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar:
3867 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3868 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3869 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3870 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3871 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3872 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5]
3873 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3874 ; GFX9-NEXT: buffer_wbinvl1_vol
3875 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3876 %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst i64 `atomicrmw xor` at %out + 4 x i64, with the global pointer and
; the operand both passed `inreg` under the amdgpu_gfx calling convention;
; returns void.
; The 32-byte displacement is expected as offset:32 on the buffer atomic
; (default amdgcn target) and on the scalar-base global atomic (gfx900); the
; tonga checks expect it to be added to s[4:5] with s_add_u32 / s_addc_u32
; before the address is copied to VGPRs.
; For the default amdgcn target the checks also expect s6/s7 to be saved in
; lanes of v0 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic.
3880 define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3881 ; SI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3883 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3884 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3885 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
3886 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3887 ; SI-NEXT: s_waitcnt expcnt(0)
3888 ; SI-NEXT: v_writelane_b32 v0, s6, 0
3889 ; SI-NEXT: v_writelane_b32 v0, s7, 1
3890 ; SI-NEXT: v_mov_b32_e32 v1, s6
3891 ; SI-NEXT: v_mov_b32_e32 v2, s7
3892 ; SI-NEXT: s_mov_b32 s7, 0xf000
3893 ; SI-NEXT: s_mov_b32 s6, -1
3894 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3895 ; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32
3896 ; SI-NEXT: s_waitcnt vmcnt(0)
3897 ; SI-NEXT: buffer_wbinvl1
3898 ; SI-NEXT: v_readlane_b32 s7, v0, 1
3899 ; SI-NEXT: v_readlane_b32 s6, v0, 0
3900 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3901 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
3902 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3903 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3904 ; SI-NEXT: s_setpc_b64 s[30:31]
3906 ; VI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3908 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3909 ; VI-NEXT: s_add_u32 s34, s4, 32
3910 ; VI-NEXT: s_addc_u32 s35, s5, 0
3911 ; VI-NEXT: v_mov_b32_e32 v2, s34
3912 ; VI-NEXT: v_mov_b32_e32 v0, s6
3913 ; VI-NEXT: v_mov_b32_e32 v1, s7
3914 ; VI-NEXT: v_mov_b32_e32 v3, s35
3915 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3916 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
3917 ; VI-NEXT: s_waitcnt vmcnt(0)
3918 ; VI-NEXT: buffer_wbinvl1_vol
3919 ; VI-NEXT: s_setpc_b64 s[30:31]
3921 ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar:
3923 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3924 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3925 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3926 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3927 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3928 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
3929 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3930 ; GFX9-NEXT: buffer_wbinvl1_vol
3931 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3932 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3933 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw xor` with the global pointer and the
; operand both passed `inreg` under the amdgpu_gfx calling convention.
; Every expected atomic carries glc and uses v[0:1] for the data, which the
; checks populate from s6/s7 beforehand; no extra copy follows the atomic.
; For the default amdgcn target the checks expect s6/s7 to be saved in lanes
; of v2 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic; v2 itself
; is spilled to and reloaded from the stack.
3937 define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
3938 ; SI-LABEL: global_atomic_xor_i64_ret_scalar:
3940 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3941 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3942 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3943 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3944 ; SI-NEXT: s_waitcnt expcnt(0)
3945 ; SI-NEXT: v_writelane_b32 v2, s6, 0
3946 ; SI-NEXT: v_writelane_b32 v2, s7, 1
3947 ; SI-NEXT: s_mov_b32 s34, s7
3948 ; SI-NEXT: s_mov_b32 s35, s6
3949 ; SI-NEXT: s_mov_b32 s7, 0xf000
3950 ; SI-NEXT: s_mov_b32 s6, -1
3951 ; SI-NEXT: v_mov_b32_e32 v0, s35
3952 ; SI-NEXT: v_mov_b32_e32 v1, s34
3953 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3954 ; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc
3955 ; SI-NEXT: s_waitcnt vmcnt(0)
3956 ; SI-NEXT: buffer_wbinvl1
3957 ; SI-NEXT: v_readlane_b32 s7, v2, 1
3958 ; SI-NEXT: v_readlane_b32 s6, v2, 0
3959 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3960 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
3961 ; SI-NEXT: s_mov_b64 exec, s[34:35]
3962 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3963 ; SI-NEXT: s_setpc_b64 s[30:31]
3965 ; VI-LABEL: global_atomic_xor_i64_ret_scalar:
3967 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3968 ; VI-NEXT: v_mov_b32_e32 v0, s6
3969 ; VI-NEXT: v_mov_b32_e32 v1, s7
3970 ; VI-NEXT: v_mov_b32_e32 v2, s4
3971 ; VI-NEXT: v_mov_b32_e32 v3, s5
3972 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3973 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
3974 ; VI-NEXT: s_waitcnt vmcnt(0)
3975 ; VI-NEXT: buffer_wbinvl1_vol
3976 ; VI-NEXT: s_setpc_b64 s[30:31]
3978 ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar:
3980 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3981 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3982 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3983 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3984 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3985 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc
3986 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3987 ; GFX9-NEXT: buffer_wbinvl1_vol
3988 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3989 %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
; i64-returning seq_cst `atomicrmw xor` at %out + 4 x i64, with the global
; pointer and the operand both passed `inreg` under the amdgpu_gfx calling
; convention.
; Every expected atomic carries glc and uses v[0:1] for the data. The 32-byte
; displacement is expected as offset:32 for the default amdgcn target and
; gfx900, and as an s_add_u32 / s_addc_u32 on s[4:5] for tonga.
; For the default amdgcn target the checks expect s6/s7 to be saved in lanes
; of v2 and restored afterwards, because they are overwritten with
; -1 / 0xf000 to complete the s[4:7] operand of the buffer atomic.
3993 define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
3994 ; SI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
3996 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3997 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
3998 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
3999 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4000 ; SI-NEXT: s_waitcnt expcnt(0)
4001 ; SI-NEXT: v_writelane_b32 v2, s6, 0
4002 ; SI-NEXT: v_writelane_b32 v2, s7, 1
4003 ; SI-NEXT: v_mov_b32_e32 v0, s6
4004 ; SI-NEXT: v_mov_b32_e32 v1, s7
4005 ; SI-NEXT: s_mov_b32 s7, 0xf000
4006 ; SI-NEXT: s_mov_b32 s6, -1
4007 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4008 ; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc
4009 ; SI-NEXT: s_waitcnt vmcnt(0)
4010 ; SI-NEXT: buffer_wbinvl1
4011 ; SI-NEXT: v_readlane_b32 s7, v2, 1
4012 ; SI-NEXT: v_readlane_b32 s6, v2, 0
4013 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4014 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
4015 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4016 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4017 ; SI-NEXT: s_setpc_b64 s[30:31]
4019 ; VI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
4021 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4022 ; VI-NEXT: s_add_u32 s34, s4, 32
4023 ; VI-NEXT: s_addc_u32 s35, s5, 0
4024 ; VI-NEXT: v_mov_b32_e32 v2, s34
4025 ; VI-NEXT: v_mov_b32_e32 v0, s6
4026 ; VI-NEXT: v_mov_b32_e32 v1, s7
4027 ; VI-NEXT: v_mov_b32_e32 v3, s35
4028 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4029 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
4030 ; VI-NEXT: s_waitcnt vmcnt(0)
4031 ; VI-NEXT: buffer_wbinvl1_vol
4032 ; VI-NEXT: s_setpc_b64 s[30:31]
4034 ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar:
4036 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4037 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4038 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4039 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4040 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4041 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
4042 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4043 ; GFX9-NEXT: buffer_wbinvl1_vol
4044 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4045 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4046 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
4050 ; ---------------------------------------------------------------------
4052 ; ---------------------------------------------------------------------
; Non-returning i64 `atomicrmw max` (seq_cst) on a global (addrspace(1))
; pointer; the result %tmp0 is unused.
; All three targets are expected to expand the operation into a
; compare-and-swap retry loop (blocks %atomicrmw.start / %atomicrmw.end):
; load the current value once, pick the signed maximum of it and %in with
; v_cmp_gt_i64 + v_cndmask, issue a 64-bit cmpswap with glc, and repeat with
; the returned value until it equals the value that was expected.
4054 define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
4055 ; SI-LABEL: global_atomic_max_i64_noret:
4057 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4058 ; SI-NEXT: s_mov_b32 s6, 0
4059 ; SI-NEXT: s_mov_b32 s7, 0xf000
4060 ; SI-NEXT: s_mov_b32 s4, s6
4061 ; SI-NEXT: s_mov_b32 s5, s6
4062 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
4063 ; SI-NEXT: s_mov_b64 s[8:9], 0
4064 ; SI-NEXT: .LBB64_1: ; %atomicrmw.start
4065 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4066 ; SI-NEXT: s_waitcnt vmcnt(0)
4067 ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4068 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4069 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4070 ; SI-NEXT: s_waitcnt expcnt(0)
4071 ; SI-NEXT: v_mov_b32_e32 v11, v7
4072 ; SI-NEXT: v_mov_b32_e32 v10, v6
4073 ; SI-NEXT: v_mov_b32_e32 v9, v5
4074 ; SI-NEXT: v_mov_b32_e32 v8, v4
4075 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4076 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
4077 ; SI-NEXT: s_waitcnt vmcnt(0)
4078 ; SI-NEXT: buffer_wbinvl1
4079 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
4080 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4081 ; SI-NEXT: v_mov_b32_e32 v6, v8
4082 ; SI-NEXT: v_mov_b32_e32 v7, v9
4083 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4084 ; SI-NEXT: s_cbranch_execnz .LBB64_1
4085 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4086 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4087 ; SI-NEXT: s_waitcnt expcnt(0)
4088 ; SI-NEXT: s_setpc_b64 s[30:31]
4090 ; VI-LABEL: global_atomic_max_i64_noret:
4092 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4093 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
4094 ; VI-NEXT: s_mov_b64 s[4:5], 0
4095 ; VI-NEXT: .LBB64_1: ; %atomicrmw.start
4096 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4097 ; VI-NEXT: s_waitcnt vmcnt(0)
4098 ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4099 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4100 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4102 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4103 ; VI-NEXT: s_waitcnt vmcnt(0)
4104 ; VI-NEXT: buffer_wbinvl1_vol
4105 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4106 ; VI-NEXT: v_mov_b32_e32 v7, v5
4107 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4108 ; VI-NEXT: v_mov_b32_e32 v6, v4
4109 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4110 ; VI-NEXT: s_cbranch_execnz .LBB64_1
4111 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4112 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4113 ; VI-NEXT: s_setpc_b64 s[30:31]
4115 ; GFX9-LABEL: global_atomic_max_i64_noret:
4117 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4118 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
4119 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4120 ; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
4121 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4122 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4123 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4124 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4125 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4126 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4127 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
4128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4129 ; GFX9-NEXT: buffer_wbinvl1_vol
4130 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4131 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
4132 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4133 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
4134 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4135 ; GFX9-NEXT: s_cbranch_execnz .LBB64_1
4136 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4137 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4138 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4139 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
; Non-returning i64 `atomicrmw max` (seq_cst) on a global pointer, 4 i64
; elements (32 bytes) past %out.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end) that selects the signed maximum with v_cmp_gt_i64 +
; v_cndmask.
; SI and GFX9 fold the 32-byte offset into both the initial load and the
; cmpswap (offset:32), while VI adds 32 to the address in v[0:1] once, before
; the loop, with v_add_u32/v_addc_u32.
4143 define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
4144 ; SI-LABEL: global_atomic_max_i64_noret_offset:
4146 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4147 ; SI-NEXT: s_mov_b32 s6, 0
4148 ; SI-NEXT: s_mov_b32 s7, 0xf000
4149 ; SI-NEXT: s_mov_b32 s4, s6
4150 ; SI-NEXT: s_mov_b32 s5, s6
4151 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
4152 ; SI-NEXT: s_mov_b64 s[8:9], 0
4153 ; SI-NEXT: .LBB65_1: ; %atomicrmw.start
4154 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4155 ; SI-NEXT: s_waitcnt vmcnt(0)
4156 ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4157 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4158 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4159 ; SI-NEXT: s_waitcnt expcnt(0)
4160 ; SI-NEXT: v_mov_b32_e32 v11, v7
4161 ; SI-NEXT: v_mov_b32_e32 v10, v6
4162 ; SI-NEXT: v_mov_b32_e32 v9, v5
4163 ; SI-NEXT: v_mov_b32_e32 v8, v4
4164 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4165 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
4166 ; SI-NEXT: s_waitcnt vmcnt(0)
4167 ; SI-NEXT: buffer_wbinvl1
4168 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
4169 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4170 ; SI-NEXT: v_mov_b32_e32 v6, v8
4171 ; SI-NEXT: v_mov_b32_e32 v7, v9
4172 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4173 ; SI-NEXT: s_cbranch_execnz .LBB65_1
4174 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4175 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4176 ; SI-NEXT: s_waitcnt expcnt(0)
4177 ; SI-NEXT: s_setpc_b64 s[30:31]
4179 ; VI-LABEL: global_atomic_max_i64_noret_offset:
4181 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4182 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
4183 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4184 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
4185 ; VI-NEXT: s_mov_b64 s[4:5], 0
4186 ; VI-NEXT: .LBB65_1: ; %atomicrmw.start
4187 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4188 ; VI-NEXT: s_waitcnt vmcnt(0)
4189 ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4190 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4191 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4192 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4193 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4194 ; VI-NEXT: s_waitcnt vmcnt(0)
4195 ; VI-NEXT: buffer_wbinvl1_vol
4196 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4197 ; VI-NEXT: v_mov_b32_e32 v7, v5
4198 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4199 ; VI-NEXT: v_mov_b32_e32 v6, v4
4200 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4201 ; VI-NEXT: s_cbranch_execnz .LBB65_1
4202 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4203 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4204 ; VI-NEXT: s_setpc_b64 s[30:31]
4206 ; GFX9-LABEL: global_atomic_max_i64_noret_offset:
4208 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4209 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
4210 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4211 ; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
4212 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4213 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4214 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4215 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4216 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4217 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4218 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
4219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4220 ; GFX9-NEXT: buffer_wbinvl1_vol
4221 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4222 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
4223 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4224 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
4225 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4226 ; GFX9-NEXT: s_cbranch_execnz .LBB65_1
4227 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4228 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4229 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4230 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4231 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
; Returning i64 `atomicrmw max` (seq_cst) on a global pointer.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end) that selects the signed maximum with v_cmp_gt_i64 +
; v_cndmask.
; On SI the pointer and %in are first copied out of v[0:3], and the cmpswap
; then writes its result to v[0:1] directly.
; On VI and GFX9 the cmpswap result lands in v[4:5] and is copied to v[0:1]
; after the loop.
4235 define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
4236 ; SI-LABEL: global_atomic_max_i64_ret:
4238 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4239 ; SI-NEXT: v_mov_b32_e32 v5, v3
4240 ; SI-NEXT: v_mov_b32_e32 v4, v2
4241 ; SI-NEXT: v_mov_b32_e32 v7, v1
4242 ; SI-NEXT: v_mov_b32_e32 v6, v0
4243 ; SI-NEXT: s_mov_b32 s6, 0
4244 ; SI-NEXT: s_mov_b32 s7, 0xf000
4245 ; SI-NEXT: s_mov_b32 s4, s6
4246 ; SI-NEXT: s_mov_b32 s5, s6
4247 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
4248 ; SI-NEXT: s_mov_b64 s[8:9], 0
4249 ; SI-NEXT: .LBB66_1: ; %atomicrmw.start
4250 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4251 ; SI-NEXT: s_waitcnt vmcnt(0)
4252 ; SI-NEXT: v_mov_b32_e32 v11, v1
4253 ; SI-NEXT: v_mov_b32_e32 v10, v0
4254 ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
4255 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
4256 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
4257 ; SI-NEXT: s_waitcnt expcnt(0)
4258 ; SI-NEXT: v_mov_b32_e32 v0, v8
4259 ; SI-NEXT: v_mov_b32_e32 v1, v9
4260 ; SI-NEXT: v_mov_b32_e32 v2, v10
4261 ; SI-NEXT: v_mov_b32_e32 v3, v11
4262 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4263 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
4264 ; SI-NEXT: s_waitcnt vmcnt(0)
4265 ; SI-NEXT: buffer_wbinvl1
4266 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
4267 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4268 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4269 ; SI-NEXT: s_cbranch_execnz .LBB66_1
4270 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4271 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4272 ; SI-NEXT: s_waitcnt expcnt(0)
4273 ; SI-NEXT: s_setpc_b64 s[30:31]
4275 ; VI-LABEL: global_atomic_max_i64_ret:
4277 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4278 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
4279 ; VI-NEXT: s_mov_b64 s[4:5], 0
4280 ; VI-NEXT: .LBB66_1: ; %atomicrmw.start
4281 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4282 ; VI-NEXT: s_waitcnt vmcnt(0)
4283 ; VI-NEXT: v_mov_b32_e32 v7, v5
4284 ; VI-NEXT: v_mov_b32_e32 v6, v4
4285 ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4286 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4287 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4288 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4289 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4290 ; VI-NEXT: s_waitcnt vmcnt(0)
4291 ; VI-NEXT: buffer_wbinvl1_vol
4292 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4293 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4294 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4295 ; VI-NEXT: s_cbranch_execnz .LBB66_1
4296 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4297 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4298 ; VI-NEXT: v_mov_b32_e32 v0, v4
4299 ; VI-NEXT: v_mov_b32_e32 v1, v5
4300 ; VI-NEXT: s_setpc_b64 s[30:31]
4302 ; GFX9-LABEL: global_atomic_max_i64_ret:
4304 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4305 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
4306 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4307 ; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
4308 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4309 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4310 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
4311 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
4312 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4313 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4314 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4315 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4316 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
4317 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4318 ; GFX9-NEXT: buffer_wbinvl1_vol
4319 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4320 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4321 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4322 ; GFX9-NEXT: s_cbranch_execnz .LBB66_1
4323 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4324 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4325 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
4326 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
4327 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4328 %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning i64 `atomicrmw max` (seq_cst) on a global pointer, 4 i64 elements
; (32 bytes) past %out.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end) that selects the signed maximum with v_cmp_gt_i64 +
; v_cndmask.
; SI and GFX9 fold the offset into the load and the cmpswap (offset:32).
; VI computes the offset address into v[4:5] before the loop, and its cmpswap
; writes to v[0:1] directly, so no copy is expected after the loop there.
4332 define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
4333 ; SI-LABEL: global_atomic_max_i64_ret_offset:
4335 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4336 ; SI-NEXT: v_mov_b32_e32 v5, v3
4337 ; SI-NEXT: v_mov_b32_e32 v4, v2
4338 ; SI-NEXT: v_mov_b32_e32 v7, v1
4339 ; SI-NEXT: v_mov_b32_e32 v6, v0
4340 ; SI-NEXT: s_mov_b32 s6, 0
4341 ; SI-NEXT: s_mov_b32 s7, 0xf000
4342 ; SI-NEXT: s_mov_b32 s4, s6
4343 ; SI-NEXT: s_mov_b32 s5, s6
4344 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
4345 ; SI-NEXT: s_mov_b64 s[8:9], 0
4346 ; SI-NEXT: .LBB67_1: ; %atomicrmw.start
4347 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4348 ; SI-NEXT: s_waitcnt vmcnt(0)
4349 ; SI-NEXT: v_mov_b32_e32 v11, v1
4350 ; SI-NEXT: v_mov_b32_e32 v10, v0
4351 ; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
4352 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
4353 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
4354 ; SI-NEXT: s_waitcnt expcnt(0)
4355 ; SI-NEXT: v_mov_b32_e32 v0, v8
4356 ; SI-NEXT: v_mov_b32_e32 v1, v9
4357 ; SI-NEXT: v_mov_b32_e32 v2, v10
4358 ; SI-NEXT: v_mov_b32_e32 v3, v11
4359 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4360 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
4361 ; SI-NEXT: s_waitcnt vmcnt(0)
4362 ; SI-NEXT: buffer_wbinvl1
4363 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
4364 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4365 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
4366 ; SI-NEXT: s_cbranch_execnz .LBB67_1
4367 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4368 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
4369 ; SI-NEXT: s_waitcnt expcnt(0)
4370 ; SI-NEXT: s_setpc_b64 s[30:31]
4372 ; VI-LABEL: global_atomic_max_i64_ret_offset:
4374 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4375 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
4376 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4377 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
4378 ; VI-NEXT: s_mov_b64 s[4:5], 0
4379 ; VI-NEXT: .LBB67_1: ; %atomicrmw.start
4380 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4381 ; VI-NEXT: s_waitcnt vmcnt(0)
4382 ; VI-NEXT: v_mov_b32_e32 v9, v1
4383 ; VI-NEXT: v_mov_b32_e32 v8, v0
4384 ; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
4385 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
4386 ; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
4387 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4388 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4389 ; VI-NEXT: s_waitcnt vmcnt(0)
4390 ; VI-NEXT: buffer_wbinvl1_vol
4391 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4392 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4393 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4394 ; VI-NEXT: s_cbranch_execnz .LBB67_1
4395 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4396 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4397 ; VI-NEXT: s_setpc_b64 s[30:31]
4399 ; GFX9-LABEL: global_atomic_max_i64_ret_offset:
4401 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4402 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
4403 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
4404 ; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
4405 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4407 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
4408 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
4409 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
4410 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4411 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4412 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4413 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
4414 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4415 ; GFX9-NEXT: buffer_wbinvl1_vol
4416 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4417 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4418 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
4419 ; GFX9-NEXT: s_cbranch_execnz .LBB67_1
4420 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4421 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
4422 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
4423 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
4424 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4425 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4426 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
; Non-returning i64 `atomicrmw max` (seq_cst) on a global pointer, with the
; pointer and the operand both passed `inreg` under the amdgpu_gfx calling
; convention.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end); the operand stays in SGPRs and is compared with
; v_cmp_lt_i64 against the loaded value.
; SI overwrites s6/s7 to build the s[4:7] buffer resource: it copies the
; operand to s[34:35] first, and saves and restores s6/s7 through lanes of v0.
4430 define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
4431 ; SI-LABEL: global_atomic_max_i64_noret_scalar:
4433 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4434 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4435 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
4436 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4437 ; SI-NEXT: s_waitcnt expcnt(0)
4438 ; SI-NEXT: v_writelane_b32 v0, s6, 0
4439 ; SI-NEXT: v_writelane_b32 v0, s7, 1
4440 ; SI-NEXT: s_mov_b32 s35, s7
4441 ; SI-NEXT: s_mov_b32 s34, s6
4442 ; SI-NEXT: s_mov_b32 s7, 0xf000
4443 ; SI-NEXT: s_mov_b32 s6, -1
4444 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
4445 ; SI-NEXT: s_mov_b64 s[36:37], 0
4446 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start
4447 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4448 ; SI-NEXT: v_mov_b32_e32 v1, s35
4449 ; SI-NEXT: s_waitcnt vmcnt(0)
4450 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4]
4451 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
4452 ; SI-NEXT: v_mov_b32_e32 v1, s34
4453 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
4454 ; SI-NEXT: s_waitcnt expcnt(0)
4455 ; SI-NEXT: v_mov_b32_e32 v8, v4
4456 ; SI-NEXT: v_mov_b32_e32 v7, v3
4457 ; SI-NEXT: v_mov_b32_e32 v6, v2
4458 ; SI-NEXT: v_mov_b32_e32 v5, v1
4459 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4460 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
4461 ; SI-NEXT: s_waitcnt vmcnt(0)
4462 ; SI-NEXT: buffer_wbinvl1
4463 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
4464 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4465 ; SI-NEXT: v_mov_b32_e32 v3, v5
4466 ; SI-NEXT: v_mov_b32_e32 v4, v6
4467 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4468 ; SI-NEXT: s_cbranch_execnz .LBB68_1
4469 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4470 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4471 ; SI-NEXT: v_readlane_b32 s7, v0, 1
4472 ; SI-NEXT: v_readlane_b32 s6, v0, 0
4473 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4474 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
4475 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4476 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4477 ; SI-NEXT: s_setpc_b64 s[30:31]
4479 ; VI-LABEL: global_atomic_max_i64_noret_scalar:
4481 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4482 ; VI-NEXT: v_mov_b32_e32 v0, s4
4483 ; VI-NEXT: v_mov_b32_e32 v1, s5
4484 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4485 ; VI-NEXT: s_mov_b64 s[34:35], 0
4486 ; VI-NEXT: .LBB68_1: ; %atomicrmw.start
4487 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4488 ; VI-NEXT: s_waitcnt vmcnt(0)
4489 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4490 ; VI-NEXT: v_mov_b32_e32 v0, s7
4491 ; VI-NEXT: v_mov_b32_e32 v6, s6
4492 ; VI-NEXT: v_mov_b32_e32 v4, s4
4493 ; VI-NEXT: v_mov_b32_e32 v5, s5
4494 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4495 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4496 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4497 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4498 ; VI-NEXT: s_waitcnt vmcnt(0)
4499 ; VI-NEXT: buffer_wbinvl1_vol
4500 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4501 ; VI-NEXT: v_mov_b32_e32 v3, v1
4502 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4503 ; VI-NEXT: v_mov_b32_e32 v2, v0
4504 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4505 ; VI-NEXT: s_cbranch_execnz .LBB68_1
4506 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4507 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4508 ; VI-NEXT: s_setpc_b64 s[30:31]
4510 ; GFX9-LABEL: global_atomic_max_i64_noret_scalar:
4512 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4513 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4514 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
4515 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4516 ; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start
4517 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4518 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4519 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4520 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
4521 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
4522 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4523 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
4524 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4525 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
4526 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4527 ; GFX9-NEXT: buffer_wbinvl1_vol
4528 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4529 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
4530 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4531 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
4532 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4533 ; GFX9-NEXT: s_cbranch_execnz .LBB68_1
4534 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4535 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4536 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4537 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
; Non-returning i64 `atomicrmw max` (seq_cst) on a global pointer, 4 i64
; elements (32 bytes) past %out, with the pointer and the operand both passed
; `inreg` under the amdgpu_gfx calling convention.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end) using v_cmp_lt_i64 against the SGPR operand.
; SI and GFX9 fold the offset into the load and the cmpswap (offset:32),
; while VI forms the offset address in s[34:35] with s_add_u32/s_addc_u32.
; SI also saves and restores s6/s7 through lanes of v0, because it overwrites
; them to build the s[4:7] buffer resource.
4541 define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
4542 ; SI-LABEL: global_atomic_max_i64_noret_offset_scalar:
4544 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4545 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4546 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
4547 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4548 ; SI-NEXT: s_waitcnt expcnt(0)
4549 ; SI-NEXT: v_writelane_b32 v0, s6, 0
4550 ; SI-NEXT: v_writelane_b32 v0, s7, 1
4551 ; SI-NEXT: s_mov_b32 s35, s7
4552 ; SI-NEXT: s_mov_b32 s34, s6
4553 ; SI-NEXT: s_mov_b32 s7, 0xf000
4554 ; SI-NEXT: s_mov_b32 s6, -1
4555 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
4556 ; SI-NEXT: s_mov_b64 s[36:37], 0
4557 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start
4558 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4559 ; SI-NEXT: v_mov_b32_e32 v1, s35
4560 ; SI-NEXT: s_waitcnt vmcnt(0)
4561 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4]
4562 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
4563 ; SI-NEXT: v_mov_b32_e32 v1, s34
4564 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
4565 ; SI-NEXT: s_waitcnt expcnt(0)
4566 ; SI-NEXT: v_mov_b32_e32 v8, v4
4567 ; SI-NEXT: v_mov_b32_e32 v7, v3
4568 ; SI-NEXT: v_mov_b32_e32 v6, v2
4569 ; SI-NEXT: v_mov_b32_e32 v5, v1
4570 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4571 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
4572 ; SI-NEXT: s_waitcnt vmcnt(0)
4573 ; SI-NEXT: buffer_wbinvl1
4574 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
4575 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4576 ; SI-NEXT: v_mov_b32_e32 v3, v5
4577 ; SI-NEXT: v_mov_b32_e32 v4, v6
4578 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4579 ; SI-NEXT: s_cbranch_execnz .LBB69_1
4580 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4581 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4582 ; SI-NEXT: v_readlane_b32 s7, v0, 1
4583 ; SI-NEXT: v_readlane_b32 s6, v0, 0
4584 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4585 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
4586 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4587 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4588 ; SI-NEXT: s_setpc_b64 s[30:31]
4590 ; VI-LABEL: global_atomic_max_i64_noret_offset_scalar:
4592 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4593 ; VI-NEXT: s_add_u32 s34, s4, 32
4594 ; VI-NEXT: s_addc_u32 s35, s5, 0
4595 ; VI-NEXT: v_mov_b32_e32 v0, s34
4596 ; VI-NEXT: v_mov_b32_e32 v1, s35
4597 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4598 ; VI-NEXT: s_mov_b64 s[36:37], 0
4599 ; VI-NEXT: .LBB69_1: ; %atomicrmw.start
4600 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4601 ; VI-NEXT: s_waitcnt vmcnt(0)
4602 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4603 ; VI-NEXT: v_mov_b32_e32 v0, s7
4604 ; VI-NEXT: v_mov_b32_e32 v6, s6
4605 ; VI-NEXT: v_mov_b32_e32 v4, s34
4606 ; VI-NEXT: v_mov_b32_e32 v5, s35
4607 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4608 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4609 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4610 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4611 ; VI-NEXT: s_waitcnt vmcnt(0)
4612 ; VI-NEXT: buffer_wbinvl1_vol
4613 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4614 ; VI-NEXT: v_mov_b32_e32 v3, v1
4615 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4616 ; VI-NEXT: v_mov_b32_e32 v2, v0
4617 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4618 ; VI-NEXT: s_cbranch_execnz .LBB69_1
4619 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4620 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
4621 ; VI-NEXT: s_setpc_b64 s[30:31]
4623 ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar:
4625 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4626 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4627 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
4628 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4629 ; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start
4630 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4631 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4632 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4633 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
4634 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
4635 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4636 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
4637 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4638 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
4639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4640 ; GFX9-NEXT: buffer_wbinvl1_vol
4641 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4642 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
4643 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4644 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
4645 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4646 ; GFX9-NEXT: s_cbranch_execnz .LBB69_1
4647 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4648 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4649 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4650 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4651 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
; Returning i64 `atomicrmw max` (seq_cst) on a global pointer, with the
; pointer and the operand both passed `inreg` under the amdgpu_gfx calling
; convention.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start /
; %atomicrmw.end) using v_cmp_lt_i64 against the SGPR operand.
; On VI and GFX9 the cmpswap writes to v[0:1] directly, so no copy follows the
; loop; on SI the result lands in v[3:4] and is copied to v[0:1] afterwards.
; SI also saves and restores s6/s7 through lanes of v2, because it overwrites
; them to build the s[4:7] buffer resource.
4655 define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
4656 ; SI-LABEL: global_atomic_max_i64_ret_scalar:
4658 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4659 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4660 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
4661 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4662 ; SI-NEXT: s_waitcnt expcnt(0)
4663 ; SI-NEXT: v_writelane_b32 v2, s6, 0
4664 ; SI-NEXT: v_writelane_b32 v2, s7, 1
4665 ; SI-NEXT: s_mov_b32 s35, s7
4666 ; SI-NEXT: s_mov_b32 s34, s6
4667 ; SI-NEXT: s_mov_b32 s7, 0xf000
4668 ; SI-NEXT: s_mov_b32 s6, -1
4669 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
4670 ; SI-NEXT: s_mov_b64 s[36:37], 0
4671 ; SI-NEXT: .LBB70_1: ; %atomicrmw.start
4672 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4673 ; SI-NEXT: s_waitcnt vmcnt(0)
4674 ; SI-NEXT: v_mov_b32_e32 v8, v4
4675 ; SI-NEXT: v_mov_b32_e32 v7, v3
4676 ; SI-NEXT: v_mov_b32_e32 v0, s35
4677 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8]
4678 ; SI-NEXT: s_waitcnt expcnt(0)
4679 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
4680 ; SI-NEXT: v_mov_b32_e32 v0, s34
4681 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
4682 ; SI-NEXT: v_mov_b32_e32 v3, v5
4683 ; SI-NEXT: v_mov_b32_e32 v4, v6
4684 ; SI-NEXT: v_mov_b32_e32 v5, v7
4685 ; SI-NEXT: v_mov_b32_e32 v6, v8
4686 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4687 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
4688 ; SI-NEXT: s_waitcnt vmcnt(0)
4689 ; SI-NEXT: buffer_wbinvl1
4690 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
4691 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4692 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4693 ; SI-NEXT: s_cbranch_execnz .LBB70_1
4694 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4695 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4696 ; SI-NEXT: v_mov_b32_e32 v0, v3
4697 ; SI-NEXT: v_mov_b32_e32 v1, v4
4698 ; SI-NEXT: v_readlane_b32 s7, v2, 1
4699 ; SI-NEXT: v_readlane_b32 s6, v2, 0
4700 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4701 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
4702 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4703 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4704 ; SI-NEXT: s_setpc_b64 s[30:31]
4706 ; VI-LABEL: global_atomic_max_i64_ret_scalar:
4708 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4709 ; VI-NEXT: v_mov_b32_e32 v0, s4
4710 ; VI-NEXT: v_mov_b32_e32 v1, s5
4711 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4712 ; VI-NEXT: s_mov_b64 s[34:35], 0
4713 ; VI-NEXT: .LBB70_1: ; %atomicrmw.start
4714 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4715 ; VI-NEXT: s_waitcnt vmcnt(0)
4716 ; VI-NEXT: v_mov_b32_e32 v3, v1
4717 ; VI-NEXT: v_mov_b32_e32 v2, v0
4718 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4719 ; VI-NEXT: v_mov_b32_e32 v0, s7
4720 ; VI-NEXT: v_mov_b32_e32 v6, s6
4721 ; VI-NEXT: v_mov_b32_e32 v4, s4
4722 ; VI-NEXT: v_mov_b32_e32 v5, s5
4723 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4724 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4725 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4726 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4727 ; VI-NEXT: s_waitcnt vmcnt(0)
4728 ; VI-NEXT: buffer_wbinvl1_vol
4729 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4730 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4731 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
4732 ; VI-NEXT: s_cbranch_execnz .LBB70_1
4733 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4734 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
4735 ; VI-NEXT: s_setpc_b64 s[30:31]
4737 ; GFX9-LABEL: global_atomic_max_i64_ret_scalar:
4739 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4740 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4741 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
4742 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4743 ; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start
4744 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4745 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4746 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
4747 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
4748 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6]
4749 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
4750 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
4751 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
4752 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
4753 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4754 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
4755 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4756 ; GFX9-NEXT: buffer_wbinvl1_vol
4757 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
4758 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4759 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4760 ; GFX9-NEXT: s_cbranch_execnz .LBB70_1
4761 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4762 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4763 ; GFX9-NEXT: s_setpc_b64 s[30:31]
4764 %result = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
; Signed 64-bit atomicrmw max, seq_cst, on a global (addrspace 1) pointer.
; Uses the amdgpu_gfx calling convention with both the pointer and the i64
; operand marked inreg; the function is declared to return i64 and the value
; produced by the atomicrmw is named %result.
; The expected code for all three targets is a compare-and-swap retry loop
; (%atomicrmw.start) that selects the new value with a signed v_cmp_lt_i64,
; not a single atomic max instruction.
4768 define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
4769 ; SI-LABEL: global_atomic_max_i64_ret_offset_scalar:
4771 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4772 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4773 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
4774 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4775 ; SI-NEXT: s_waitcnt expcnt(0)
4776 ; SI-NEXT: v_writelane_b32 v2, s6, 0
4777 ; SI-NEXT: v_writelane_b32 v2, s7, 1
4778 ; SI-NEXT: s_mov_b32 s35, s7
4779 ; SI-NEXT: s_mov_b32 s34, s6
4780 ; SI-NEXT: s_mov_b32 s7, 0xf000
4781 ; SI-NEXT: s_mov_b32 s6, -1
4782 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
4783 ; SI-NEXT: s_mov_b64 s[36:37], 0
4784 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start
4785 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4786 ; SI-NEXT: s_waitcnt vmcnt(0)
4787 ; SI-NEXT: v_mov_b32_e32 v8, v4
4788 ; SI-NEXT: v_mov_b32_e32 v7, v3
4789 ; SI-NEXT: v_mov_b32_e32 v0, s35
4790 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8]
4791 ; SI-NEXT: s_waitcnt expcnt(0)
4792 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
4793 ; SI-NEXT: v_mov_b32_e32 v0, s34
4794 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
4795 ; SI-NEXT: v_mov_b32_e32 v3, v5
4796 ; SI-NEXT: v_mov_b32_e32 v4, v6
4797 ; SI-NEXT: v_mov_b32_e32 v5, v7
4798 ; SI-NEXT: v_mov_b32_e32 v6, v8
4799 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4800 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
4801 ; SI-NEXT: s_waitcnt vmcnt(0)
4802 ; SI-NEXT: buffer_wbinvl1
4803 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
4804 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4805 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4806 ; SI-NEXT: s_cbranch_execnz .LBB71_1
4807 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4808 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
4809 ; SI-NEXT: v_mov_b32_e32 v0, v3
4810 ; SI-NEXT: v_mov_b32_e32 v1, v4
4811 ; SI-NEXT: v_readlane_b32 s7, v2, 1
4812 ; SI-NEXT: v_readlane_b32 s6, v2, 0
4813 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
4814 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
4815 ; SI-NEXT: s_mov_b64 exec, s[34:35]
4816 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4817 ; SI-NEXT: s_setpc_b64 s[30:31]
4819 ; VI-LABEL: global_atomic_max_i64_ret_offset_scalar:
4821 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4822 ; VI-NEXT: s_add_u32 s34, s4, 32
4823 ; VI-NEXT: s_addc_u32 s35, s5, 0
4824 ; VI-NEXT: v_mov_b32_e32 v0, s34
4825 ; VI-NEXT: v_mov_b32_e32 v1, s35
4826 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4827 ; VI-NEXT: s_mov_b64 s[36:37], 0
4828 ; VI-NEXT: .LBB71_1: ; %atomicrmw.start
4829 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4830 ; VI-NEXT: s_waitcnt vmcnt(0)
4831 ; VI-NEXT: v_mov_b32_e32 v3, v1
4832 ; VI-NEXT: v_mov_b32_e32 v2, v0
4833 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4834 ; VI-NEXT: v_mov_b32_e32 v0, s7
4835 ; VI-NEXT: v_mov_b32_e32 v6, s6
4836 ; VI-NEXT: v_mov_b32_e32 v4, s34
4837 ; VI-NEXT: v_mov_b32_e32 v5, s35
4838 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4839 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4840 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4841 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4842 ; VI-NEXT: s_waitcnt vmcnt(0)
4843 ; VI-NEXT: buffer_wbinvl1_vol
4844 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4845 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4846 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
4847 ; VI-NEXT: s_cbranch_execnz .LBB71_1
4848 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4849 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
4850 ; VI-NEXT: s_setpc_b64 s[30:31]
4852 ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar:
4854 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4855 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4856 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
4857 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
4858 ; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start
4859 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4860 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4861 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
4862 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
4863 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[5:6]
4864 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
4865 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
4866 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
4867 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
4868 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4869 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
4870 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4871 ; GFX9-NEXT: buffer_wbinvl1_vol
4872 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
4873 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4874 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
4875 ; GFX9-NEXT: s_cbranch_execnz .LBB71_1
4876 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4877 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
4878 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %gep advances %out by 4 x i64 = 32 bytes, which is the 32-byte offset that
; appears in the expected memory instructions above.
4879 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4880 %result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
; Kernel form of signed 64-bit atomicrmw max, seq_cst, whose result (%tmp0) is
; not used by any instruction in this body. The target address is computed
; from the kernel arguments as %out indexed by %index, plus four more i64
; elements.
; The expected code reads the initial memory value with a scalar load
; (s_load_dwordx2) ahead of the loop and then retries a 64-bit compare-and-swap
; in %atomicrmw.start, selecting the new value with a signed v_cmp_lt_i64.
4884 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
4885 ; SI-LABEL: atomic_max_i64_addr64_offset:
4886 ; SI: ; %bb.0: ; %entry
4887 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4888 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4889 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4890 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4891 ; SI-NEXT: s_add_u32 s4, s0, s4
4892 ; SI-NEXT: s_addc_u32 s5, s1, s5
4893 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
4894 ; SI-NEXT: s_mov_b64 s[0:1], 0
4895 ; SI-NEXT: s_mov_b32 s7, 0xf000
4896 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4897 ; SI-NEXT: v_mov_b32_e32 v2, s8
4898 ; SI-NEXT: v_mov_b32_e32 v3, s9
4899 ; SI-NEXT: s_mov_b32 s6, -1
4900 ; SI-NEXT: .LBB72_1: ; %atomicrmw.start
4901 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
4902 ; SI-NEXT: v_mov_b32_e32 v0, s3
4903 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4904 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4905 ; SI-NEXT: v_mov_b32_e32 v0, s2
4906 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
4907 ; SI-NEXT: s_waitcnt expcnt(0)
4908 ; SI-NEXT: v_mov_b32_e32 v7, v3
4909 ; SI-NEXT: v_mov_b32_e32 v6, v2
4910 ; SI-NEXT: v_mov_b32_e32 v5, v1
4911 ; SI-NEXT: v_mov_b32_e32 v4, v0
4912 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4913 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
4914 ; SI-NEXT: s_waitcnt vmcnt(0)
4915 ; SI-NEXT: buffer_wbinvl1
4916 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
4917 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4918 ; SI-NEXT: v_mov_b32_e32 v2, v4
4919 ; SI-NEXT: v_mov_b32_e32 v3, v5
4920 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
4921 ; SI-NEXT: s_cbranch_execnz .LBB72_1
4922 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
4925 ; VI-LABEL: atomic_max_i64_addr64_offset:
4926 ; VI: ; %bb.0: ; %entry
4927 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
4928 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4929 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4930 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4931 ; VI-NEXT: s_add_u32 s0, s0, s4
4932 ; VI-NEXT: s_addc_u32 s1, s1, s5
4933 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
4934 ; VI-NEXT: s_add_u32 s0, s0, 32
4935 ; VI-NEXT: s_addc_u32 s1, s1, 0
4936 ; VI-NEXT: s_mov_b64 s[4:5], 0
4937 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4938 ; VI-NEXT: v_mov_b32_e32 v2, s6
4939 ; VI-NEXT: v_mov_b32_e32 v3, s7
4940 ; VI-NEXT: .LBB72_1: ; %atomicrmw.start
4941 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
4942 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4943 ; VI-NEXT: v_mov_b32_e32 v0, s3
4944 ; VI-NEXT: v_mov_b32_e32 v6, s2
4945 ; VI-NEXT: v_mov_b32_e32 v5, s1
4946 ; VI-NEXT: v_mov_b32_e32 v4, s0
4947 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4948 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4949 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4950 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4951 ; VI-NEXT: s_waitcnt vmcnt(0)
4952 ; VI-NEXT: buffer_wbinvl1_vol
4953 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4954 ; VI-NEXT: v_mov_b32_e32 v3, v1
4955 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4956 ; VI-NEXT: v_mov_b32_e32 v2, v0
4957 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
4958 ; VI-NEXT: s_cbranch_execnz .LBB72_1
4959 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
4962 ; GFX9-LABEL: atomic_max_i64_addr64_offset:
4963 ; GFX9: ; %bb.0: ; %entry
4964 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4965 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4966 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4967 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4968 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
4969 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4970 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4971 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
4972 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
4973 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4974 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
4975 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
4976 ; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
4977 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4978 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4979 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
4980 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
4981 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4982 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
4983 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4984 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
4985 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4986 ; GFX9-NEXT: buffer_wbinvl1_vol
4987 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4988 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
4989 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4990 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
4991 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
4992 ; GFX9-NEXT: s_cbranch_execnz .LBB72_1
4993 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
4994 ; GFX9-NEXT: s_endpgm
; Address = %out + %index * 8 bytes (the s_lshl_b64 by 3 above), then a
; further 4 x i64 = 32 bytes.
4996 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4997 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
4998 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
; Kernel form of signed 64-bit atomicrmw max, seq_cst, that also consumes the
; result: the value produced by the atomicrmw (%tmp0) is stored to %out2.
; The target address is %out indexed by %index, plus four more i64 elements.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start) using a
; signed v_cmp_lt_i64 select, followed after %atomicrmw.end by a 64-bit store
; of the loop's result.
5002 define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
5003 ; SI-LABEL: atomic_max_i64_ret_addr64_offset:
5004 ; SI: ; %bb.0: ; %entry
5005 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5006 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5007 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5008 ; SI-NEXT: s_add_u32 s8, s0, s6
5009 ; SI-NEXT: s_addc_u32 s9, s1, s7
5010 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
5011 ; SI-NEXT: s_mov_b64 s[0:1], 0
5012 ; SI-NEXT: s_mov_b32 s11, 0xf000
5013 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5014 ; SI-NEXT: v_mov_b32_e32 v2, s6
5015 ; SI-NEXT: v_mov_b32_e32 v3, s7
5016 ; SI-NEXT: s_mov_b32 s10, -1
5017 ; SI-NEXT: .LBB73_1: ; %atomicrmw.start
5018 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5019 ; SI-NEXT: v_mov_b32_e32 v0, s5
5020 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5021 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5022 ; SI-NEXT: v_mov_b32_e32 v0, s4
5023 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5024 ; SI-NEXT: s_waitcnt expcnt(0)
5025 ; SI-NEXT: v_mov_b32_e32 v7, v3
5026 ; SI-NEXT: v_mov_b32_e32 v6, v2
5027 ; SI-NEXT: v_mov_b32_e32 v5, v1
5028 ; SI-NEXT: v_mov_b32_e32 v4, v0
5029 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5030 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
5031 ; SI-NEXT: s_waitcnt vmcnt(0)
5032 ; SI-NEXT: buffer_wbinvl1
5033 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
5034 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5035 ; SI-NEXT: v_mov_b32_e32 v2, v4
5036 ; SI-NEXT: v_mov_b32_e32 v3, v5
5037 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5038 ; SI-NEXT: s_cbranch_execnz .LBB73_1
5039 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5040 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
5041 ; SI-NEXT: s_mov_b32 s7, 0xf000
5042 ; SI-NEXT: s_mov_b32 s6, -1
5043 ; SI-NEXT: s_mov_b32 s4, s2
5044 ; SI-NEXT: s_mov_b32 s5, s3
5045 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
5048 ; VI-LABEL: atomic_max_i64_ret_addr64_offset:
5049 ; VI: ; %bb.0: ; %entry
5050 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5051 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5052 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5053 ; VI-NEXT: s_add_u32 s0, s0, s6
5054 ; VI-NEXT: s_addc_u32 s1, s1, s7
5055 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
5056 ; VI-NEXT: s_add_u32 s0, s0, 32
5057 ; VI-NEXT: s_addc_u32 s1, s1, 0
5058 ; VI-NEXT: s_mov_b64 s[6:7], 0
5059 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5060 ; VI-NEXT: v_mov_b32_e32 v0, s8
5061 ; VI-NEXT: v_mov_b32_e32 v1, s9
5062 ; VI-NEXT: .LBB73_1: ; %atomicrmw.start
5063 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5064 ; VI-NEXT: v_mov_b32_e32 v3, v1
5065 ; VI-NEXT: v_mov_b32_e32 v2, v0
5066 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5067 ; VI-NEXT: v_mov_b32_e32 v0, s5
5068 ; VI-NEXT: v_mov_b32_e32 v6, s4
5069 ; VI-NEXT: v_mov_b32_e32 v5, s1
5070 ; VI-NEXT: v_mov_b32_e32 v4, s0
5071 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5072 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5073 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5074 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5075 ; VI-NEXT: s_waitcnt vmcnt(0)
5076 ; VI-NEXT: buffer_wbinvl1_vol
5077 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5078 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5079 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
5080 ; VI-NEXT: s_cbranch_execnz .LBB73_1
5081 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5082 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
5083 ; VI-NEXT: v_mov_b32_e32 v2, s2
5084 ; VI-NEXT: v_mov_b32_e32 v3, s3
5085 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5088 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
5089 ; GFX9: ; %bb.0: ; %entry
5090 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5091 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5092 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5093 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5094 ; GFX9-NEXT: s_add_u32 s0, s0, s6
5095 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
5096 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
5097 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
5098 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5099 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
5100 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
5101 ; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
5102 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5103 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
5104 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
5105 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[5:6]
5106 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
5107 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5108 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
5109 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
5110 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5111 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
5112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5113 ; GFX9-NEXT: buffer_wbinvl1_vol
5114 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
5115 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5116 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
5117 ; GFX9-NEXT: s_cbranch_execnz .LBB73_1
5118 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5119 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
5120 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5121 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5122 ; GFX9-NEXT: s_endpgm
; Address = %out + %index * 8 bytes, then a further 4 x i64 = 32 bytes.
; Storing %tmp0 to %out2 keeps the atomic's result live.
5124 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5125 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
5126 %tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst
5127 store i64 %tmp0, ptr addrspace(1) %out2
; Kernel form of signed 64-bit atomicrmw max, seq_cst, whose result (%tmp0) is
; not used by any instruction in this body. The target address is %out indexed
; by %index with no additional constant offset, so the expected memory
; instructions carry no offset operand.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start) that
; selects the new value with a signed v_cmp_lt_i64.
5131 define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
5132 ; SI-LABEL: atomic_max_i64_addr64:
5133 ; SI: ; %bb.0: ; %entry
5134 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5135 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5136 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5137 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5138 ; SI-NEXT: s_add_u32 s4, s0, s4
5139 ; SI-NEXT: s_addc_u32 s5, s1, s5
5140 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
5141 ; SI-NEXT: s_mov_b64 s[0:1], 0
5142 ; SI-NEXT: s_mov_b32 s7, 0xf000
5143 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5144 ; SI-NEXT: v_mov_b32_e32 v2, s8
5145 ; SI-NEXT: v_mov_b32_e32 v3, s9
5146 ; SI-NEXT: s_mov_b32 s6, -1
5147 ; SI-NEXT: .LBB74_1: ; %atomicrmw.start
5148 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5149 ; SI-NEXT: v_mov_b32_e32 v0, s3
5150 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5151 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5152 ; SI-NEXT: v_mov_b32_e32 v0, s2
5153 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5154 ; SI-NEXT: s_waitcnt expcnt(0)
5155 ; SI-NEXT: v_mov_b32_e32 v7, v3
5156 ; SI-NEXT: v_mov_b32_e32 v6, v2
5157 ; SI-NEXT: v_mov_b32_e32 v5, v1
5158 ; SI-NEXT: v_mov_b32_e32 v4, v0
5159 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5160 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
5161 ; SI-NEXT: s_waitcnt vmcnt(0)
5162 ; SI-NEXT: buffer_wbinvl1
5163 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
5164 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5165 ; SI-NEXT: v_mov_b32_e32 v2, v4
5166 ; SI-NEXT: v_mov_b32_e32 v3, v5
5167 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5168 ; SI-NEXT: s_cbranch_execnz .LBB74_1
5169 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5172 ; VI-LABEL: atomic_max_i64_addr64:
5173 ; VI: ; %bb.0: ; %entry
5174 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5175 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5176 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5177 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5178 ; VI-NEXT: s_add_u32 s0, s0, s4
5179 ; VI-NEXT: s_addc_u32 s1, s1, s5
5180 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
5181 ; VI-NEXT: s_mov_b64 s[4:5], 0
5182 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5183 ; VI-NEXT: v_mov_b32_e32 v2, s6
5184 ; VI-NEXT: v_mov_b32_e32 v3, s7
5185 ; VI-NEXT: .LBB74_1: ; %atomicrmw.start
5186 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5187 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
5188 ; VI-NEXT: v_mov_b32_e32 v0, s3
5189 ; VI-NEXT: v_mov_b32_e32 v6, s2
5190 ; VI-NEXT: v_mov_b32_e32 v5, s1
5191 ; VI-NEXT: v_mov_b32_e32 v4, s0
5192 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5193 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5194 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5195 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5196 ; VI-NEXT: s_waitcnt vmcnt(0)
5197 ; VI-NEXT: buffer_wbinvl1_vol
5198 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5199 ; VI-NEXT: v_mov_b32_e32 v3, v1
5200 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5201 ; VI-NEXT: v_mov_b32_e32 v2, v0
5202 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5203 ; VI-NEXT: s_cbranch_execnz .LBB74_1
5204 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5207 ; GFX9-LABEL: atomic_max_i64_addr64:
5208 ; GFX9: ; %bb.0: ; %entry
5209 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5210 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5211 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5212 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5213 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5214 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5215 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5216 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
5217 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
5218 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5219 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
5220 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
5221 ; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
5222 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5223 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
5224 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
5225 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
5226 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5227 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
5228 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5229 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
5230 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5231 ; GFX9-NEXT: buffer_wbinvl1_vol
5232 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5233 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
5234 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5235 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
5236 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
5237 ; GFX9-NEXT: s_cbranch_execnz .LBB74_1
5238 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5239 ; GFX9-NEXT: s_endpgm
; Address = %out + %index * 8 bytes (the s_lshl_b64 by 3 above).
5241 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5242 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
; Kernel form of signed 64-bit atomicrmw max, seq_cst, that also consumes the
; result: the value produced by the atomicrmw (%tmp0) is stored to %out2.
; The target address is %out indexed by %index with no additional constant
; offset.
; The expected code is a compare-and-swap retry loop (%atomicrmw.start) using a
; signed v_cmp_lt_i64 select, followed after %atomicrmw.end by a 64-bit store
; of the loop's result.
5246 define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
5247 ; SI-LABEL: atomic_max_i64_ret_addr64:
5248 ; SI: ; %bb.0: ; %entry
5249 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5251 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5252 ; SI-NEXT: s_add_u32 s8, s0, s6
5253 ; SI-NEXT: s_addc_u32 s9, s1, s7
5254 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
5255 ; SI-NEXT: s_mov_b64 s[0:1], 0
5256 ; SI-NEXT: s_mov_b32 s11, 0xf000
5257 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5258 ; SI-NEXT: v_mov_b32_e32 v2, s6
5259 ; SI-NEXT: v_mov_b32_e32 v3, s7
5260 ; SI-NEXT: s_mov_b32 s10, -1
5261 ; SI-NEXT: .LBB75_1: ; %atomicrmw.start
5262 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5263 ; SI-NEXT: v_mov_b32_e32 v0, s5
5264 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5265 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5266 ; SI-NEXT: v_mov_b32_e32 v0, s4
5267 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
5268 ; SI-NEXT: s_waitcnt expcnt(0)
5269 ; SI-NEXT: v_mov_b32_e32 v7, v3
5270 ; SI-NEXT: v_mov_b32_e32 v6, v2
5271 ; SI-NEXT: v_mov_b32_e32 v5, v1
5272 ; SI-NEXT: v_mov_b32_e32 v4, v0
5273 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5274 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
5275 ; SI-NEXT: s_waitcnt vmcnt(0)
5276 ; SI-NEXT: buffer_wbinvl1
5277 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
5278 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5279 ; SI-NEXT: v_mov_b32_e32 v2, v4
5280 ; SI-NEXT: v_mov_b32_e32 v3, v5
5281 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
5282 ; SI-NEXT: s_cbranch_execnz .LBB75_1
5283 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5284 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
5285 ; SI-NEXT: s_mov_b32 s7, 0xf000
5286 ; SI-NEXT: s_mov_b32 s6, -1
5287 ; SI-NEXT: s_mov_b32 s4, s2
5288 ; SI-NEXT: s_mov_b32 s5, s3
5289 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
5292 ; VI-LABEL: atomic_max_i64_ret_addr64:
5293 ; VI: ; %bb.0: ; %entry
5294 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5296 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5297 ; VI-NEXT: s_add_u32 s0, s0, s6
5298 ; VI-NEXT: s_addc_u32 s1, s1, s7
5299 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
5300 ; VI-NEXT: s_mov_b64 s[6:7], 0
5301 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5302 ; VI-NEXT: v_mov_b32_e32 v0, s8
5303 ; VI-NEXT: v_mov_b32_e32 v1, s9
5304 ; VI-NEXT: .LBB75_1: ; %atomicrmw.start
5305 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5306 ; VI-NEXT: v_mov_b32_e32 v3, v1
5307 ; VI-NEXT: v_mov_b32_e32 v2, v0
5308 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
5309 ; VI-NEXT: v_mov_b32_e32 v0, s5
5310 ; VI-NEXT: v_mov_b32_e32 v6, s4
5311 ; VI-NEXT: v_mov_b32_e32 v5, s1
5312 ; VI-NEXT: v_mov_b32_e32 v4, s0
5313 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5314 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5315 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5316 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5317 ; VI-NEXT: s_waitcnt vmcnt(0)
5318 ; VI-NEXT: buffer_wbinvl1_vol
5319 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5320 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5321 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
5322 ; VI-NEXT: s_cbranch_execnz .LBB75_1
5323 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5324 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
5325 ; VI-NEXT: v_mov_b32_e32 v2, s2
5326 ; VI-NEXT: v_mov_b32_e32 v3, s3
5327 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5330 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
5331 ; GFX9: ; %bb.0: ; %entry
5332 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5333 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5334 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5335 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5336 ; GFX9-NEXT: s_add_u32 s0, s0, s6
5337 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
5338 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
5339 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
5340 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5341 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
5342 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
5343 ; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
5344 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5345 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
5346 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
5347 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[5:6]
5348 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
5349 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5350 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
5351 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
5352 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5353 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
5354 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5355 ; GFX9-NEXT: buffer_wbinvl1_vol
5356 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
5357 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5358 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
5359 ; GFX9-NEXT: s_cbranch_execnz .LBB75_1
5360 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5361 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
5362 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5363 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5364 ; GFX9-NEXT: s_endpgm
; Address = %out + %index * 8 bytes. Storing %tmp0 to %out2 keeps the atomic's
; result live.
5366 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5367 %tmp0 = atomicrmw max ptr addrspace(1) %ptr, i64 %in seq_cst
5368 store i64 %tmp0, ptr addrspace(1) %out2
5372 ; ---------------------------------------------------------------------
5374 ; ---------------------------------------------------------------------
; Unsigned 64-bit atomicrmw umax, seq_cst, directly on the incoming global
; (addrspace 1) pointer, in a function declared void; the atomic's result
; (%tmp0) has no use in this body.
; The expected code for all three targets loads the current value once, then
; retries a 64-bit compare-and-swap in %atomicrmw.start. The new value is
; chosen with an unsigned v_cmp_gt_u64, and the value returned by the
; compare-and-swap is copied back as the next iteration's expected value.
5376 define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
5377 ; SI-LABEL: global_atomic_umax_i64_noret:
5379 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5380 ; SI-NEXT: s_mov_b32 s6, 0
5381 ; SI-NEXT: s_mov_b32 s7, 0xf000
5382 ; SI-NEXT: s_mov_b32 s4, s6
5383 ; SI-NEXT: s_mov_b32 s5, s6
5384 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
5385 ; SI-NEXT: s_mov_b64 s[8:9], 0
5386 ; SI-NEXT: .LBB76_1: ; %atomicrmw.start
5387 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5388 ; SI-NEXT: s_waitcnt vmcnt(0)
5389 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5390 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5391 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5392 ; SI-NEXT: s_waitcnt expcnt(0)
5393 ; SI-NEXT: v_mov_b32_e32 v11, v7
5394 ; SI-NEXT: v_mov_b32_e32 v10, v6
5395 ; SI-NEXT: v_mov_b32_e32 v9, v5
5396 ; SI-NEXT: v_mov_b32_e32 v8, v4
5397 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5398 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
5399 ; SI-NEXT: s_waitcnt vmcnt(0)
5400 ; SI-NEXT: buffer_wbinvl1
5401 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
5402 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5403 ; SI-NEXT: v_mov_b32_e32 v6, v8
5404 ; SI-NEXT: v_mov_b32_e32 v7, v9
5405 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5406 ; SI-NEXT: s_cbranch_execnz .LBB76_1
5407 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5408 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5409 ; SI-NEXT: s_waitcnt expcnt(0)
5410 ; SI-NEXT: s_setpc_b64 s[30:31]
5412 ; VI-LABEL: global_atomic_umax_i64_noret:
5414 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5415 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
5416 ; VI-NEXT: s_mov_b64 s[4:5], 0
5417 ; VI-NEXT: .LBB76_1: ; %atomicrmw.start
5418 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5419 ; VI-NEXT: s_waitcnt vmcnt(0)
5420 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5421 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5422 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5423 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5424 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5425 ; VI-NEXT: s_waitcnt vmcnt(0)
5426 ; VI-NEXT: buffer_wbinvl1_vol
5427 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5428 ; VI-NEXT: v_mov_b32_e32 v7, v5
5429 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5430 ; VI-NEXT: v_mov_b32_e32 v6, v4
5431 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5432 ; VI-NEXT: s_cbranch_execnz .LBB76_1
5433 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5434 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5435 ; VI-NEXT: s_setpc_b64 s[30:31]
5437 ; GFX9-LABEL: global_atomic_umax_i64_noret:
5439 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5440 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
5441 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5442 ; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
5443 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5444 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5445 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5446 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5447 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5448 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5449 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
5450 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5451 ; GFX9-NEXT: buffer_wbinvl1_vol
5452 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5453 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
5454 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5455 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
5456 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5457 ; GFX9-NEXT: s_cbranch_execnz .LBB76_1
5458 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5459 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5460 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; The atomic is applied to %ptr as passed in, with no address arithmetic.
5461 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
; Unsigned 64-bit atomicrmw umax, seq_cst, at %out plus four i64 elements, in
; a function declared void; the atomic's result (%tmp0) has no use in this
; body.
; The expected SI and GFX9 code carries the 32-byte displacement as an
; offset:32 operand on the load and the compare-and-swap, while the expected
; VI code adds 32 to the address registers (v_add_u32 / v_addc_u32) before the
; loop. All three targets retry a 64-bit compare-and-swap in %atomicrmw.start,
; selecting the new value with an unsigned v_cmp_gt_u64.
5465 define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
5466 ; SI-LABEL: global_atomic_umax_i64_noret_offset:
5468 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5469 ; SI-NEXT: s_mov_b32 s6, 0
5470 ; SI-NEXT: s_mov_b32 s7, 0xf000
5471 ; SI-NEXT: s_mov_b32 s4, s6
5472 ; SI-NEXT: s_mov_b32 s5, s6
5473 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
5474 ; SI-NEXT: s_mov_b64 s[8:9], 0
5475 ; SI-NEXT: .LBB77_1: ; %atomicrmw.start
5476 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5477 ; SI-NEXT: s_waitcnt vmcnt(0)
5478 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5479 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5480 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5481 ; SI-NEXT: s_waitcnt expcnt(0)
5482 ; SI-NEXT: v_mov_b32_e32 v11, v7
5483 ; SI-NEXT: v_mov_b32_e32 v10, v6
5484 ; SI-NEXT: v_mov_b32_e32 v9, v5
5485 ; SI-NEXT: v_mov_b32_e32 v8, v4
5486 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5487 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
5488 ; SI-NEXT: s_waitcnt vmcnt(0)
5489 ; SI-NEXT: buffer_wbinvl1
5490 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
5491 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5492 ; SI-NEXT: v_mov_b32_e32 v6, v8
5493 ; SI-NEXT: v_mov_b32_e32 v7, v9
5494 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5495 ; SI-NEXT: s_cbranch_execnz .LBB77_1
5496 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5497 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5498 ; SI-NEXT: s_waitcnt expcnt(0)
5499 ; SI-NEXT: s_setpc_b64 s[30:31]
5501 ; VI-LABEL: global_atomic_umax_i64_noret_offset:
5503 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5504 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
5505 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5506 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
5507 ; VI-NEXT: s_mov_b64 s[4:5], 0
5508 ; VI-NEXT: .LBB77_1: ; %atomicrmw.start
5509 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5510 ; VI-NEXT: s_waitcnt vmcnt(0)
5511 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5512 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5513 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5514 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5515 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5516 ; VI-NEXT: s_waitcnt vmcnt(0)
5517 ; VI-NEXT: buffer_wbinvl1_vol
5518 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5519 ; VI-NEXT: v_mov_b32_e32 v7, v5
5520 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5521 ; VI-NEXT: v_mov_b32_e32 v6, v4
5522 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5523 ; VI-NEXT: s_cbranch_execnz .LBB77_1
5524 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5525 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5526 ; VI-NEXT: s_setpc_b64 s[30:31]
5528 ; GFX9-LABEL: global_atomic_umax_i64_noret_offset:
5530 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5531 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
5532 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5533 ; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
5534 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5535 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5536 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5537 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5538 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5539 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5540 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5541 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5542 ; GFX9-NEXT: buffer_wbinvl1_vol
5543 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5544 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
5545 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5546 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
5547 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5548 ; GFX9-NEXT: s_cbranch_execnz .LBB77_1
5549 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5550 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5551 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; %gep advances %out by 4 x i64 = 32 bytes.
5552 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5553 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Returning form of a seq_cst `atomicrmw umax` on an i64 in addrspace(1),
; addressed directly through %ptr; neither argument is marked inreg and the
; checks below use VGPRs for both the pointer and the value.
; On all three targets the expected code is an initial load followed by a
; 64-bit cmpswap retry loop (%atomicrmw.start / %atomicrmw.end); no dedicated
; atomic-max instruction appears in the checks.
; The check lines are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
5557 define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
5558 ; SI-LABEL: global_atomic_umax_i64_ret:
5560 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5561 ; SI-NEXT: v_mov_b32_e32 v5, v3
5562 ; SI-NEXT: v_mov_b32_e32 v4, v2
5563 ; SI-NEXT: v_mov_b32_e32 v7, v1
5564 ; SI-NEXT: v_mov_b32_e32 v6, v0
5565 ; SI-NEXT: s_mov_b32 s6, 0
5566 ; SI-NEXT: s_mov_b32 s7, 0xf000
5567 ; SI-NEXT: s_mov_b32 s4, s6
5568 ; SI-NEXT: s_mov_b32 s5, s6
5569 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
5570 ; SI-NEXT: s_mov_b64 s[8:9], 0
5571 ; SI-NEXT: .LBB78_1: ; %atomicrmw.start
5572 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5573 ; SI-NEXT: s_waitcnt vmcnt(0)
5574 ; SI-NEXT: v_mov_b32_e32 v11, v1
5575 ; SI-NEXT: v_mov_b32_e32 v10, v0
5576 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
5577 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
5578 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
5579 ; SI-NEXT: s_waitcnt expcnt(0)
5580 ; SI-NEXT: v_mov_b32_e32 v0, v8
5581 ; SI-NEXT: v_mov_b32_e32 v1, v9
5582 ; SI-NEXT: v_mov_b32_e32 v2, v10
5583 ; SI-NEXT: v_mov_b32_e32 v3, v11
5584 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5585 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
5586 ; SI-NEXT: s_waitcnt vmcnt(0)
5587 ; SI-NEXT: buffer_wbinvl1
5588 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
5589 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5590 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5591 ; SI-NEXT: s_cbranch_execnz .LBB78_1
5592 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5593 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5594 ; SI-NEXT: s_waitcnt expcnt(0)
5595 ; SI-NEXT: s_setpc_b64 s[30:31]
5597 ; VI-LABEL: global_atomic_umax_i64_ret:
5599 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5600 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
5601 ; VI-NEXT: s_mov_b64 s[4:5], 0
5602 ; VI-NEXT: .LBB78_1: ; %atomicrmw.start
5603 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5604 ; VI-NEXT: s_waitcnt vmcnt(0)
5605 ; VI-NEXT: v_mov_b32_e32 v7, v5
5606 ; VI-NEXT: v_mov_b32_e32 v6, v4
5607 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5608 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5609 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5610 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5611 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5612 ; VI-NEXT: s_waitcnt vmcnt(0)
5613 ; VI-NEXT: buffer_wbinvl1_vol
5614 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5615 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5616 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5617 ; VI-NEXT: s_cbranch_execnz .LBB78_1
5618 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5619 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5620 ; VI-NEXT: v_mov_b32_e32 v0, v4
5621 ; VI-NEXT: v_mov_b32_e32 v1, v5
5622 ; VI-NEXT: s_setpc_b64 s[30:31]
5624 ; GFX9-LABEL: global_atomic_umax_i64_ret:
5626 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5627 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
5628 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5629 ; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
5630 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5631 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5632 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
5633 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
5634 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5635 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5636 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5637 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5638 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
5639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5640 ; GFX9-NEXT: buffer_wbinvl1_vol
5641 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5642 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5643 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5644 ; GFX9-NEXT: s_cbranch_execnz .LBB78_1
5645 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5646 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5647 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
5648 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
5649 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5650 %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning seq_cst `atomicrmw umax` on an i64 in addrspace(1), applied to
; %out advanced by four i64 elements (the getelementptr below), i.e. 32 bytes.
; The SI and GFX9 checks carry that displacement as `offset:32` on both the
; initial load and the cmpswap. The VI checks instead add 32 to the pointer
; with v_add_u32 / v_addc_u32 before the flat load.
; All targets are expected to use a cmpswap retry loop
; (%atomicrmw.start / %atomicrmw.end).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5654 define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
5655 ; SI-LABEL: global_atomic_umax_i64_ret_offset:
5657 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5658 ; SI-NEXT: v_mov_b32_e32 v5, v3
5659 ; SI-NEXT: v_mov_b32_e32 v4, v2
5660 ; SI-NEXT: v_mov_b32_e32 v7, v1
5661 ; SI-NEXT: v_mov_b32_e32 v6, v0
5662 ; SI-NEXT: s_mov_b32 s6, 0
5663 ; SI-NEXT: s_mov_b32 s7, 0xf000
5664 ; SI-NEXT: s_mov_b32 s4, s6
5665 ; SI-NEXT: s_mov_b32 s5, s6
5666 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
5667 ; SI-NEXT: s_mov_b64 s[8:9], 0
5668 ; SI-NEXT: .LBB79_1: ; %atomicrmw.start
5669 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5670 ; SI-NEXT: s_waitcnt vmcnt(0)
5671 ; SI-NEXT: v_mov_b32_e32 v11, v1
5672 ; SI-NEXT: v_mov_b32_e32 v10, v0
5673 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
5674 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
5675 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
5676 ; SI-NEXT: s_waitcnt expcnt(0)
5677 ; SI-NEXT: v_mov_b32_e32 v0, v8
5678 ; SI-NEXT: v_mov_b32_e32 v1, v9
5679 ; SI-NEXT: v_mov_b32_e32 v2, v10
5680 ; SI-NEXT: v_mov_b32_e32 v3, v11
5681 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5682 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
5683 ; SI-NEXT: s_waitcnt vmcnt(0)
5684 ; SI-NEXT: buffer_wbinvl1
5685 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
5686 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5687 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
5688 ; SI-NEXT: s_cbranch_execnz .LBB79_1
5689 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5690 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
5691 ; SI-NEXT: s_waitcnt expcnt(0)
5692 ; SI-NEXT: s_setpc_b64 s[30:31]
5694 ; VI-LABEL: global_atomic_umax_i64_ret_offset:
5696 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5697 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
5698 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5699 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
5700 ; VI-NEXT: s_mov_b64 s[4:5], 0
5701 ; VI-NEXT: .LBB79_1: ; %atomicrmw.start
5702 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5703 ; VI-NEXT: s_waitcnt vmcnt(0)
5704 ; VI-NEXT: v_mov_b32_e32 v9, v1
5705 ; VI-NEXT: v_mov_b32_e32 v8, v0
5706 ; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
5707 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
5708 ; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
5709 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5710 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
5711 ; VI-NEXT: s_waitcnt vmcnt(0)
5712 ; VI-NEXT: buffer_wbinvl1_vol
5713 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
5714 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5715 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
5716 ; VI-NEXT: s_cbranch_execnz .LBB79_1
5717 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5718 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
5719 ; VI-NEXT: s_setpc_b64 s[30:31]
5721 ; GFX9-LABEL: global_atomic_umax_i64_ret_offset:
5723 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5724 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
5725 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
5726 ; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
5727 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5728 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5729 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
5730 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
5731 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
5732 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5733 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5734 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5735 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
5736 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5737 ; GFX9-NEXT: buffer_wbinvl1_vol
5738 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5739 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5740 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
5741 ; GFX9-NEXT: s_cbranch_execnz .LBB79_1
5742 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5743 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
5744 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
5745 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
5746 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5747 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5748 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Non-returning (void) seq_cst `atomicrmw umax` on an i64 in addrspace(1),
; using the amdgpu_gfx calling convention with both arguments marked inreg.
; In the checks below the pointer is read from s[4:5] and the value from
; s[6:7].
; The SI checks overwrite s6/s7 with -1 / 0xf000 to complete the s[4:7]
; operand of the buffer instructions. They therefore first stash the incoming
; s6/s7 in lanes 0/1 of v0 (v_writelane), keep a working copy in s[34:35],
; and restore s6/s7 with v_readlane before returning.
; All targets are expected to use a cmpswap retry loop
; (%atomicrmw.start / %atomicrmw.end).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5752 define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
5753 ; SI-LABEL: global_atomic_umax_i64_noret_scalar:
5755 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5756 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5757 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
5758 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5759 ; SI-NEXT: s_waitcnt expcnt(0)
5760 ; SI-NEXT: v_writelane_b32 v0, s6, 0
5761 ; SI-NEXT: v_writelane_b32 v0, s7, 1
5762 ; SI-NEXT: s_mov_b32 s35, s7
5763 ; SI-NEXT: s_mov_b32 s34, s6
5764 ; SI-NEXT: s_mov_b32 s7, 0xf000
5765 ; SI-NEXT: s_mov_b32 s6, -1
5766 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
5767 ; SI-NEXT: s_mov_b64 s[36:37], 0
5768 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start
5769 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5770 ; SI-NEXT: v_mov_b32_e32 v1, s35
5771 ; SI-NEXT: s_waitcnt vmcnt(0)
5772 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4]
5773 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
5774 ; SI-NEXT: v_mov_b32_e32 v1, s34
5775 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
5776 ; SI-NEXT: s_waitcnt expcnt(0)
5777 ; SI-NEXT: v_mov_b32_e32 v8, v4
5778 ; SI-NEXT: v_mov_b32_e32 v7, v3
5779 ; SI-NEXT: v_mov_b32_e32 v6, v2
5780 ; SI-NEXT: v_mov_b32_e32 v5, v1
5781 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5782 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
5783 ; SI-NEXT: s_waitcnt vmcnt(0)
5784 ; SI-NEXT: buffer_wbinvl1
5785 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
5786 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5787 ; SI-NEXT: v_mov_b32_e32 v3, v5
5788 ; SI-NEXT: v_mov_b32_e32 v4, v6
5789 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5790 ; SI-NEXT: s_cbranch_execnz .LBB80_1
5791 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5792 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5793 ; SI-NEXT: v_readlane_b32 s7, v0, 1
5794 ; SI-NEXT: v_readlane_b32 s6, v0, 0
5795 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5796 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
5797 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5798 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5799 ; SI-NEXT: s_setpc_b64 s[30:31]
5801 ; VI-LABEL: global_atomic_umax_i64_noret_scalar:
5803 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5804 ; VI-NEXT: v_mov_b32_e32 v0, s4
5805 ; VI-NEXT: v_mov_b32_e32 v1, s5
5806 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5807 ; VI-NEXT: s_mov_b64 s[34:35], 0
5808 ; VI-NEXT: .LBB80_1: ; %atomicrmw.start
5809 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5810 ; VI-NEXT: s_waitcnt vmcnt(0)
5811 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5812 ; VI-NEXT: v_mov_b32_e32 v0, s7
5813 ; VI-NEXT: v_mov_b32_e32 v6, s6
5814 ; VI-NEXT: v_mov_b32_e32 v4, s4
5815 ; VI-NEXT: v_mov_b32_e32 v5, s5
5816 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5817 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5818 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5819 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5820 ; VI-NEXT: s_waitcnt vmcnt(0)
5821 ; VI-NEXT: buffer_wbinvl1_vol
5822 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5823 ; VI-NEXT: v_mov_b32_e32 v3, v1
5824 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5825 ; VI-NEXT: v_mov_b32_e32 v2, v0
5826 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
5827 ; VI-NEXT: s_cbranch_execnz .LBB80_1
5828 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5829 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
5830 ; VI-NEXT: s_setpc_b64 s[30:31]
5832 ; GFX9-LABEL: global_atomic_umax_i64_noret_scalar:
5834 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5835 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5836 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
5837 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5838 ; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start
5839 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5840 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5841 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5842 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
5843 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
5844 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5845 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
5846 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5847 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
5848 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5849 ; GFX9-NEXT: buffer_wbinvl1_vol
5850 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5851 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
5852 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5853 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
5854 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5855 ; GFX9-NEXT: s_cbranch_execnz .LBB80_1
5856 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5857 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5858 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5859 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
; Non-returning (void) seq_cst `atomicrmw umax` on an i64 in addrspace(1),
; using the amdgpu_gfx calling convention with inreg arguments, applied to
; %out advanced by four i64 elements (32 bytes).
; The SI and GFX9 checks carry the displacement as `offset:32` on the load and
; the cmpswap. The VI checks compute the address into s[34:35] with
; s_add_u32 / s_addc_u32 and copy it into VGPRs for the flat instructions.
; As in the other SI scalar checks, s6/s7 are saved in lanes of v0 and restored
; before returning because s[6:7] is reused for the s[4:7] buffer operand.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5863 define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
5864 ; SI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
5866 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5867 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5868 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
5869 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5870 ; SI-NEXT: s_waitcnt expcnt(0)
5871 ; SI-NEXT: v_writelane_b32 v0, s6, 0
5872 ; SI-NEXT: v_writelane_b32 v0, s7, 1
5873 ; SI-NEXT: s_mov_b32 s35, s7
5874 ; SI-NEXT: s_mov_b32 s34, s6
5875 ; SI-NEXT: s_mov_b32 s7, 0xf000
5876 ; SI-NEXT: s_mov_b32 s6, -1
5877 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
5878 ; SI-NEXT: s_mov_b64 s[36:37], 0
5879 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start
5880 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5881 ; SI-NEXT: v_mov_b32_e32 v1, s35
5882 ; SI-NEXT: s_waitcnt vmcnt(0)
5883 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4]
5884 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
5885 ; SI-NEXT: v_mov_b32_e32 v1, s34
5886 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
5887 ; SI-NEXT: s_waitcnt expcnt(0)
5888 ; SI-NEXT: v_mov_b32_e32 v8, v4
5889 ; SI-NEXT: v_mov_b32_e32 v7, v3
5890 ; SI-NEXT: v_mov_b32_e32 v6, v2
5891 ; SI-NEXT: v_mov_b32_e32 v5, v1
5892 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5893 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
5894 ; SI-NEXT: s_waitcnt vmcnt(0)
5895 ; SI-NEXT: buffer_wbinvl1
5896 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
5897 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5898 ; SI-NEXT: v_mov_b32_e32 v3, v5
5899 ; SI-NEXT: v_mov_b32_e32 v4, v6
5900 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5901 ; SI-NEXT: s_cbranch_execnz .LBB81_1
5902 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
5903 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
5904 ; SI-NEXT: v_readlane_b32 s7, v0, 1
5905 ; SI-NEXT: v_readlane_b32 s6, v0, 0
5906 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5907 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
5908 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5909 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5910 ; SI-NEXT: s_setpc_b64 s[30:31]
5912 ; VI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
5914 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5915 ; VI-NEXT: s_add_u32 s34, s4, 32
5916 ; VI-NEXT: s_addc_u32 s35, s5, 0
5917 ; VI-NEXT: v_mov_b32_e32 v0, s34
5918 ; VI-NEXT: v_mov_b32_e32 v1, s35
5919 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5920 ; VI-NEXT: s_mov_b64 s[36:37], 0
5921 ; VI-NEXT: .LBB81_1: ; %atomicrmw.start
5922 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
5923 ; VI-NEXT: s_waitcnt vmcnt(0)
5924 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5925 ; VI-NEXT: v_mov_b32_e32 v0, s7
5926 ; VI-NEXT: v_mov_b32_e32 v6, s6
5927 ; VI-NEXT: v_mov_b32_e32 v4, s34
5928 ; VI-NEXT: v_mov_b32_e32 v5, s35
5929 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5930 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5931 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5932 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5933 ; VI-NEXT: s_waitcnt vmcnt(0)
5934 ; VI-NEXT: buffer_wbinvl1_vol
5935 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5936 ; VI-NEXT: v_mov_b32_e32 v3, v1
5937 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5938 ; VI-NEXT: v_mov_b32_e32 v2, v0
5939 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
5940 ; VI-NEXT: s_cbranch_execnz .LBB81_1
5941 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
5942 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
5943 ; VI-NEXT: s_setpc_b64 s[30:31]
5945 ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar:
5947 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5948 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5949 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
5950 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
5951 ; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start
5952 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5953 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5954 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5955 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
5956 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
5957 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5958 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
5959 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5960 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
5961 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5962 ; GFX9-NEXT: buffer_wbinvl1_vol
5963 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5964 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
5965 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5966 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
5967 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
5968 ; GFX9-NEXT: s_cbranch_execnz .LBB81_1
5969 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
5970 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
5971 ; GFX9-NEXT: s_setpc_b64 s[30:31]
5972 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5973 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Returning (i64) seq_cst `atomicrmw umax` on an i64 in addrspace(1), using
; the amdgpu_gfx calling convention with both arguments marked inreg.
; In the checks below the pointer is read from s[4:5] and the value from
; s[6:7].
; The SI checks save s6/s7 in lanes 0/1 of v2 and restore them before
; returning, because s[6:7] is overwritten to complete the s[4:7] buffer
; operand. After the loop they copy the cmpswap result from v[3:4] to v[0:1].
; In the VI and GFX9 checks the cmpswap writes v[0:1] directly.
; All targets are expected to use a cmpswap retry loop
; (%atomicrmw.start / %atomicrmw.end).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5977 define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
5978 ; SI-LABEL: global_atomic_umax_i64_ret_scalar:
5980 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5981 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
5982 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
5983 ; SI-NEXT: s_mov_b64 exec, s[34:35]
5984 ; SI-NEXT: s_waitcnt expcnt(0)
5985 ; SI-NEXT: v_writelane_b32 v2, s6, 0
5986 ; SI-NEXT: v_writelane_b32 v2, s7, 1
5987 ; SI-NEXT: s_mov_b32 s35, s7
5988 ; SI-NEXT: s_mov_b32 s34, s6
5989 ; SI-NEXT: s_mov_b32 s7, 0xf000
5990 ; SI-NEXT: s_mov_b32 s6, -1
5991 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
5992 ; SI-NEXT: s_mov_b64 s[36:37], 0
5993 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start
5994 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
5995 ; SI-NEXT: s_waitcnt vmcnt(0)
5996 ; SI-NEXT: v_mov_b32_e32 v8, v4
5997 ; SI-NEXT: v_mov_b32_e32 v7, v3
5998 ; SI-NEXT: v_mov_b32_e32 v0, s35
5999 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8]
6000 ; SI-NEXT: s_waitcnt expcnt(0)
6001 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
6002 ; SI-NEXT: v_mov_b32_e32 v0, s34
6003 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
6004 ; SI-NEXT: v_mov_b32_e32 v3, v5
6005 ; SI-NEXT: v_mov_b32_e32 v4, v6
6006 ; SI-NEXT: v_mov_b32_e32 v5, v7
6007 ; SI-NEXT: v_mov_b32_e32 v6, v8
6008 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6009 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
6010 ; SI-NEXT: s_waitcnt vmcnt(0)
6011 ; SI-NEXT: buffer_wbinvl1
6012 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
6013 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6014 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6015 ; SI-NEXT: s_cbranch_execnz .LBB82_1
6016 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6017 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6018 ; SI-NEXT: v_mov_b32_e32 v0, v3
6019 ; SI-NEXT: v_mov_b32_e32 v1, v4
6020 ; SI-NEXT: v_readlane_b32 s7, v2, 1
6021 ; SI-NEXT: v_readlane_b32 s6, v2, 0
6022 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6023 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
6024 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6025 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6026 ; SI-NEXT: s_setpc_b64 s[30:31]
6028 ; VI-LABEL: global_atomic_umax_i64_ret_scalar:
6030 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6031 ; VI-NEXT: v_mov_b32_e32 v0, s4
6032 ; VI-NEXT: v_mov_b32_e32 v1, s5
6033 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6034 ; VI-NEXT: s_mov_b64 s[34:35], 0
6035 ; VI-NEXT: .LBB82_1: ; %atomicrmw.start
6036 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6037 ; VI-NEXT: s_waitcnt vmcnt(0)
6038 ; VI-NEXT: v_mov_b32_e32 v3, v1
6039 ; VI-NEXT: v_mov_b32_e32 v2, v0
6040 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
6041 ; VI-NEXT: v_mov_b32_e32 v0, s7
6042 ; VI-NEXT: v_mov_b32_e32 v6, s6
6043 ; VI-NEXT: v_mov_b32_e32 v4, s4
6044 ; VI-NEXT: v_mov_b32_e32 v5, s5
6045 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6046 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6047 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6048 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6049 ; VI-NEXT: s_waitcnt vmcnt(0)
6050 ; VI-NEXT: buffer_wbinvl1_vol
6051 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6052 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6053 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
6054 ; VI-NEXT: s_cbranch_execnz .LBB82_1
6055 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6056 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
6057 ; VI-NEXT: s_setpc_b64 s[30:31]
6059 ; GFX9-LABEL: global_atomic_umax_i64_ret_scalar:
6061 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6062 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6063 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
6064 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6065 ; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start
6066 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6067 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6068 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
6069 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
6070 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
6071 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
6072 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
6073 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
6074 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
6075 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6076 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
6077 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6078 ; GFX9-NEXT: buffer_wbinvl1_vol
6079 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
6080 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6081 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6082 ; GFX9-NEXT: s_cbranch_execnz .LBB82_1
6083 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6084 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6085 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6086 %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning (i64) seq_cst `atomicrmw umax` on an i64 in addrspace(1), using
; the amdgpu_gfx calling convention with inreg arguments, applied to %out
; advanced by four i64 elements (32 bytes).
; The SI and GFX9 checks carry the displacement as `offset:32` on the load and
; the cmpswap. The VI checks compute the address into s[34:35] with
; s_add_u32 / s_addc_u32 before the flat load.
; The SI checks save s6/s7 in lanes of v2 and restore them before returning,
; because s[6:7] is reused for the s[4:7] buffer operand. They also copy the
; final cmpswap result from v[3:4] to v[0:1] after the loop.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6090 define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
6091 ; SI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6093 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6094 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6095 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
6096 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6097 ; SI-NEXT: s_waitcnt expcnt(0)
6098 ; SI-NEXT: v_writelane_b32 v2, s6, 0
6099 ; SI-NEXT: v_writelane_b32 v2, s7, 1
6100 ; SI-NEXT: s_mov_b32 s35, s7
6101 ; SI-NEXT: s_mov_b32 s34, s6
6102 ; SI-NEXT: s_mov_b32 s7, 0xf000
6103 ; SI-NEXT: s_mov_b32 s6, -1
6104 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
6105 ; SI-NEXT: s_mov_b64 s[36:37], 0
6106 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start
6107 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6108 ; SI-NEXT: s_waitcnt vmcnt(0)
6109 ; SI-NEXT: v_mov_b32_e32 v8, v4
6110 ; SI-NEXT: v_mov_b32_e32 v7, v3
6111 ; SI-NEXT: v_mov_b32_e32 v0, s35
6112 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8]
6113 ; SI-NEXT: s_waitcnt expcnt(0)
6114 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
6115 ; SI-NEXT: v_mov_b32_e32 v0, s34
6116 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
6117 ; SI-NEXT: v_mov_b32_e32 v3, v5
6118 ; SI-NEXT: v_mov_b32_e32 v4, v6
6119 ; SI-NEXT: v_mov_b32_e32 v5, v7
6120 ; SI-NEXT: v_mov_b32_e32 v6, v8
6121 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6122 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
6123 ; SI-NEXT: s_waitcnt vmcnt(0)
6124 ; SI-NEXT: buffer_wbinvl1
6125 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
6126 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6127 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6128 ; SI-NEXT: s_cbranch_execnz .LBB83_1
6129 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6130 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
6131 ; SI-NEXT: v_mov_b32_e32 v0, v3
6132 ; SI-NEXT: v_mov_b32_e32 v1, v4
6133 ; SI-NEXT: v_readlane_b32 s7, v2, 1
6134 ; SI-NEXT: v_readlane_b32 s6, v2, 0
6135 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6136 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
6137 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6138 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
6139 ; SI-NEXT: s_setpc_b64 s[30:31]
6141 ; VI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6143 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6144 ; VI-NEXT: s_add_u32 s34, s4, 32
6145 ; VI-NEXT: s_addc_u32 s35, s5, 0
6146 ; VI-NEXT: v_mov_b32_e32 v0, s34
6147 ; VI-NEXT: v_mov_b32_e32 v1, s35
6148 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6149 ; VI-NEXT: s_mov_b64 s[36:37], 0
6150 ; VI-NEXT: .LBB83_1: ; %atomicrmw.start
6151 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6152 ; VI-NEXT: s_waitcnt vmcnt(0)
6153 ; VI-NEXT: v_mov_b32_e32 v3, v1
6154 ; VI-NEXT: v_mov_b32_e32 v2, v0
6155 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
6156 ; VI-NEXT: v_mov_b32_e32 v0, s7
6157 ; VI-NEXT: v_mov_b32_e32 v6, s6
6158 ; VI-NEXT: v_mov_b32_e32 v4, s34
6159 ; VI-NEXT: v_mov_b32_e32 v5, s35
6160 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6161 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6162 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6163 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6164 ; VI-NEXT: s_waitcnt vmcnt(0)
6165 ; VI-NEXT: buffer_wbinvl1_vol
6166 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6167 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6168 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6169 ; VI-NEXT: s_cbranch_execnz .LBB83_1
6170 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6171 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
6172 ; VI-NEXT: s_setpc_b64 s[30:31]
6174 ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar:
6176 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6177 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6178 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
6179 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
6180 ; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start
6181 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6182 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6183 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
6184 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
6185 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
6186 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
6187 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
6188 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
6189 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
6190 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6191 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
6192 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6193 ; GFX9-NEXT: buffer_wbinvl1_vol
6194 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
6195 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6196 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
6197 ; GFX9-NEXT: s_cbranch_execnz .LBB83_1
6198 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6199 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
6200 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6201 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
6202 %result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Kernel (amdgpu_kernel) form of a non-returning seq_cst `atomicrmw umax` on
; an i64 in addrspace(1). The address is %out indexed by %index (in i64
; elements) and then advanced by four more i64 elements (32 bytes), as built
; by the two getelementptr instructions below.
; In the checks, the kernel arguments are fetched with s_load_dword*
; instructions and the index is scaled with `s_lshl_b64 ..., 3` before being
; added to the base pointer. The initial value for the loop is read with a
; scalar s_load_dwordx2 and copied to VGPRs.
; The SI and GFX9 checks keep the 32-byte displacement as `offset:32` on the
; cmpswap. The VI checks add 32 to the scalar address with
; s_add_u32 / s_addc_u32.
; All targets are expected to use a cmpswap retry loop
; (%atomicrmw.start / %atomicrmw.end).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6206 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
6207 ; SI-LABEL: atomic_umax_i64_addr64_offset:
6208 ; SI: ; %bb.0: ; %entry
6209 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
6210 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6211 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6212 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
6213 ; SI-NEXT: s_add_u32 s4, s0, s4
6214 ; SI-NEXT: s_addc_u32 s5, s1, s5
6215 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
6216 ; SI-NEXT: s_mov_b64 s[0:1], 0
6217 ; SI-NEXT: s_mov_b32 s7, 0xf000
6218 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6219 ; SI-NEXT: v_mov_b32_e32 v2, s8
6220 ; SI-NEXT: v_mov_b32_e32 v3, s9
6221 ; SI-NEXT: s_mov_b32 s6, -1
6222 ; SI-NEXT: .LBB84_1: ; %atomicrmw.start
6223 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6224 ; SI-NEXT: v_mov_b32_e32 v0, s3
6225 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
6226 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6227 ; SI-NEXT: v_mov_b32_e32 v0, s2
6228 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6229 ; SI-NEXT: s_waitcnt expcnt(0)
6230 ; SI-NEXT: v_mov_b32_e32 v7, v3
6231 ; SI-NEXT: v_mov_b32_e32 v6, v2
6232 ; SI-NEXT: v_mov_b32_e32 v5, v1
6233 ; SI-NEXT: v_mov_b32_e32 v4, v0
6234 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6235 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
6236 ; SI-NEXT: s_waitcnt vmcnt(0)
6237 ; SI-NEXT: buffer_wbinvl1
6238 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
6239 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6240 ; SI-NEXT: v_mov_b32_e32 v2, v4
6241 ; SI-NEXT: v_mov_b32_e32 v3, v5
6242 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6243 ; SI-NEXT: s_cbranch_execnz .LBB84_1
6244 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6247 ; VI-LABEL: atomic_umax_i64_addr64_offset:
6248 ; VI: ; %bb.0: ; %entry
6249 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
6250 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6251 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6252 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
6253 ; VI-NEXT: s_add_u32 s0, s0, s4
6254 ; VI-NEXT: s_addc_u32 s1, s1, s5
6255 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
6256 ; VI-NEXT: s_add_u32 s0, s0, 32
6257 ; VI-NEXT: s_addc_u32 s1, s1, 0
6258 ; VI-NEXT: s_mov_b64 s[4:5], 0
6259 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6260 ; VI-NEXT: v_mov_b32_e32 v2, s6
6261 ; VI-NEXT: v_mov_b32_e32 v3, s7
6262 ; VI-NEXT: .LBB84_1: ; %atomicrmw.start
6263 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6264 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
6265 ; VI-NEXT: v_mov_b32_e32 v0, s3
6266 ; VI-NEXT: v_mov_b32_e32 v6, s2
6267 ; VI-NEXT: v_mov_b32_e32 v5, s1
6268 ; VI-NEXT: v_mov_b32_e32 v4, s0
6269 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6270 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6271 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6272 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6273 ; VI-NEXT: s_waitcnt vmcnt(0)
6274 ; VI-NEXT: buffer_wbinvl1_vol
6275 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6276 ; VI-NEXT: v_mov_b32_e32 v3, v1
6277 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6278 ; VI-NEXT: v_mov_b32_e32 v2, v0
6279 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6280 ; VI-NEXT: s_cbranch_execnz .LBB84_1
6281 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6284 ; GFX9-LABEL: atomic_umax_i64_addr64_offset:
6285 ; GFX9: ; %bb.0: ; %entry
6286 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6287 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6288 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
6289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6290 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
6291 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6292 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6293 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
6294 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
6295 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6296 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
6297 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
6298 ; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
6299 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6300 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
6301 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
6302 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
6303 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6304 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
6305 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6306 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
6307 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6308 ; GFX9-NEXT: buffer_wbinvl1_vol
6309 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6310 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
6311 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
6312 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
6313 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
6314 ; GFX9-NEXT: s_cbranch_execnz .LBB84_1
6315 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6316 ; GFX9-NEXT: s_endpgm
6318 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
6319 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
6320 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
; Kernel test for a value-returning 64-bit `atomicrmw umax ... seq_cst` on
; global memory (addrspace 1). The address is %out + %index * 8 + 32 bytes
; (two getelementptrs, the second with constant index 4) and the value
; produced by the atomicrmw is stored to %out2.
; The assertions for all three run lines expect a loop at %atomicrmw.start
; built around a 64-bit compare-and-swap issued with glc:
;   - SI run   -> buffer_atomic_cmpswap_x2 with the constant folded as offset:32
;   - VI run   -> 32 is added to the scalar address, then flat_atomic_cmpswap_x2
;   - GFX9 run -> global_atomic_cmpswap_x2 with offset:32
; Lanes whose cmpswap returned the expected value are accumulated into a mask
; and removed from exec; the loop repeats while any lane is still active.
6324 define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
6325 ; SI-LABEL: atomic_umax_i64_ret_addr64_offset:
6326 ; SI: ; %bb.0: ; %entry
6327 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
6328 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6329 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6330 ; SI-NEXT: s_add_u32 s8, s0, s6
6331 ; SI-NEXT: s_addc_u32 s9, s1, s7
6332 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
6333 ; SI-NEXT: s_mov_b64 s[0:1], 0
6334 ; SI-NEXT: s_mov_b32 s11, 0xf000
6335 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6336 ; SI-NEXT: v_mov_b32_e32 v2, s6
6337 ; SI-NEXT: v_mov_b32_e32 v3, s7
6338 ; SI-NEXT: s_mov_b32 s10, -1
6339 ; SI-NEXT: .LBB85_1: ; %atomicrmw.start
6340 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6341 ; SI-NEXT: v_mov_b32_e32 v0, s5
6342 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6343 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6344 ; SI-NEXT: v_mov_b32_e32 v0, s4
6345 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6346 ; SI-NEXT: s_waitcnt expcnt(0)
6347 ; SI-NEXT: v_mov_b32_e32 v7, v3
6348 ; SI-NEXT: v_mov_b32_e32 v6, v2
6349 ; SI-NEXT: v_mov_b32_e32 v5, v1
6350 ; SI-NEXT: v_mov_b32_e32 v4, v0
6351 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6352 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
6353 ; SI-NEXT: s_waitcnt vmcnt(0)
6354 ; SI-NEXT: buffer_wbinvl1
6355 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
6356 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6357 ; SI-NEXT: v_mov_b32_e32 v2, v4
6358 ; SI-NEXT: v_mov_b32_e32 v3, v5
6359 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6360 ; SI-NEXT: s_cbranch_execnz .LBB85_1
6361 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6362 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
6363 ; SI-NEXT: s_mov_b32 s7, 0xf000
6364 ; SI-NEXT: s_mov_b32 s6, -1
6365 ; SI-NEXT: s_mov_b32 s4, s2
6366 ; SI-NEXT: s_mov_b32 s5, s3
6367 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
6370 ; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
6371 ; VI: ; %bb.0: ; %entry
6372 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
6373 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6374 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6375 ; VI-NEXT: s_add_u32 s0, s0, s6
6376 ; VI-NEXT: s_addc_u32 s1, s1, s7
6377 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
6378 ; VI-NEXT: s_add_u32 s0, s0, 32
6379 ; VI-NEXT: s_addc_u32 s1, s1, 0
6380 ; VI-NEXT: s_mov_b64 s[6:7], 0
6381 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6382 ; VI-NEXT: v_mov_b32_e32 v0, s8
6383 ; VI-NEXT: v_mov_b32_e32 v1, s9
6384 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start
6385 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6386 ; VI-NEXT: v_mov_b32_e32 v3, v1
6387 ; VI-NEXT: v_mov_b32_e32 v2, v0
6388 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6389 ; VI-NEXT: v_mov_b32_e32 v0, s5
6390 ; VI-NEXT: v_mov_b32_e32 v6, s4
6391 ; VI-NEXT: v_mov_b32_e32 v5, s1
6392 ; VI-NEXT: v_mov_b32_e32 v4, s0
6393 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6394 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6395 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6396 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6397 ; VI-NEXT: s_waitcnt vmcnt(0)
6398 ; VI-NEXT: buffer_wbinvl1_vol
6399 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6400 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6401 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
6402 ; VI-NEXT: s_cbranch_execnz .LBB85_1
6403 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6404 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
6405 ; VI-NEXT: v_mov_b32_e32 v2, s2
6406 ; VI-NEXT: v_mov_b32_e32 v3, s3
6407 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
6410 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
6411 ; GFX9: ; %bb.0: ; %entry
6412 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
6413 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6414 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6415 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6416 ; GFX9-NEXT: s_add_u32 s0, s0, s6
6417 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
6418 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
6419 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
6420 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6421 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
6422 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
6423 ; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
6424 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6425 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
6426 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
6427 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
6428 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
6429 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6430 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
6431 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
6432 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6433 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
6434 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6435 ; GFX9-NEXT: buffer_wbinvl1_vol
6436 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
6437 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6438 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
6439 ; GFX9-NEXT: s_cbranch_execnz .LBB85_1
6440 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6441 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
6442 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6443 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
6444 ; GFX9-NEXT: s_endpgm
6446 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
6447 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
6448 %tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst
6449 store i64 %tmp0, ptr addrspace(1) %out2
; Kernel test for a value-returning 64-bit `atomicrmw umax ... seq_cst` on
; global memory (addrspace 1) with a dynamic index only: the address is
; %out + %index * 8 (single getelementptr, no constant offset) and the value
; produced by the atomicrmw is stored to %out2.
; The assertions for all three run lines expect a loop at %atomicrmw.start
; around a 64-bit compare-and-swap issued with glc and without an immediate
; offset (buffer_atomic_cmpswap_x2 on the SI run, flat_atomic_cmpswap_x2 on the
; VI run, global_atomic_cmpswap_x2 on the GFX9 run). The initial value is read
; with s_load_dwordx2 at offset 0x0 before the loop.
; Lanes whose cmpswap returned the expected value are accumulated into a mask
; and removed from exec; the loop repeats while any lane is still active.
6453 define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
6454 ; SI-LABEL: atomic_umax_i64_ret_addr64:
6455 ; SI: ; %bb.0: ; %entry
6456 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
6457 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6458 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6459 ; SI-NEXT: s_add_u32 s8, s0, s6
6460 ; SI-NEXT: s_addc_u32 s9, s1, s7
6461 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
6462 ; SI-NEXT: s_mov_b64 s[0:1], 0
6463 ; SI-NEXT: s_mov_b32 s11, 0xf000
6464 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6465 ; SI-NEXT: v_mov_b32_e32 v2, s6
6466 ; SI-NEXT: v_mov_b32_e32 v3, s7
6467 ; SI-NEXT: s_mov_b32 s10, -1
6468 ; SI-NEXT: .LBB86_1: ; %atomicrmw.start
6469 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6470 ; SI-NEXT: v_mov_b32_e32 v0, s5
6471 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6472 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6473 ; SI-NEXT: v_mov_b32_e32 v0, s4
6474 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6475 ; SI-NEXT: s_waitcnt expcnt(0)
6476 ; SI-NEXT: v_mov_b32_e32 v7, v3
6477 ; SI-NEXT: v_mov_b32_e32 v6, v2
6478 ; SI-NEXT: v_mov_b32_e32 v5, v1
6479 ; SI-NEXT: v_mov_b32_e32 v4, v0
6480 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6481 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
6482 ; SI-NEXT: s_waitcnt vmcnt(0)
6483 ; SI-NEXT: buffer_wbinvl1
6484 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
6485 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6486 ; SI-NEXT: v_mov_b32_e32 v2, v4
6487 ; SI-NEXT: v_mov_b32_e32 v3, v5
6488 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
6489 ; SI-NEXT: s_cbranch_execnz .LBB86_1
6490 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6491 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
6492 ; SI-NEXT: s_mov_b32 s7, 0xf000
6493 ; SI-NEXT: s_mov_b32 s6, -1
6494 ; SI-NEXT: s_mov_b32 s4, s2
6495 ; SI-NEXT: s_mov_b32 s5, s3
6496 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
6499 ; VI-LABEL: atomic_umax_i64_ret_addr64:
6500 ; VI: ; %bb.0: ; %entry
6501 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
6502 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6503 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6504 ; VI-NEXT: s_add_u32 s0, s0, s6
6505 ; VI-NEXT: s_addc_u32 s1, s1, s7
6506 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
6507 ; VI-NEXT: s_mov_b64 s[6:7], 0
6508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6509 ; VI-NEXT: v_mov_b32_e32 v0, s8
6510 ; VI-NEXT: v_mov_b32_e32 v1, s9
6511 ; VI-NEXT: .LBB86_1: ; %atomicrmw.start
6512 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6513 ; VI-NEXT: v_mov_b32_e32 v3, v1
6514 ; VI-NEXT: v_mov_b32_e32 v2, v0
6515 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
6516 ; VI-NEXT: v_mov_b32_e32 v0, s5
6517 ; VI-NEXT: v_mov_b32_e32 v6, s4
6518 ; VI-NEXT: v_mov_b32_e32 v5, s1
6519 ; VI-NEXT: v_mov_b32_e32 v4, s0
6520 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6521 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6522 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6523 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6524 ; VI-NEXT: s_waitcnt vmcnt(0)
6525 ; VI-NEXT: buffer_wbinvl1_vol
6526 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6527 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6528 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
6529 ; VI-NEXT: s_cbranch_execnz .LBB86_1
6530 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6531 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
6532 ; VI-NEXT: v_mov_b32_e32 v2, s2
6533 ; VI-NEXT: v_mov_b32_e32 v3, s3
6534 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
6537 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
6538 ; GFX9: ; %bb.0: ; %entry
6539 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
6540 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6541 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6542 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
6543 ; GFX9-NEXT: s_add_u32 s0, s0, s6
6544 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
6545 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
6546 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
6547 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6548 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
6549 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
6550 ; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
6551 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6552 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
6553 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
6554 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6]
6555 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
6556 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6557 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
6558 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
6559 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6560 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
6561 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6562 ; GFX9-NEXT: buffer_wbinvl1_vol
6563 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
6564 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6565 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
6566 ; GFX9-NEXT: s_cbranch_execnz .LBB86_1
6567 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6568 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
6569 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
6570 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
6571 ; GFX9-NEXT: s_endpgm
6573 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
6574 %tmp0 = atomicrmw umax ptr addrspace(1) %ptr, i64 %in seq_cst
6575 store i64 %tmp0, ptr addrspace(1) %out2
6579 ; ---------------------------------------------------------------------
6581 ; ---------------------------------------------------------------------
; Function test (void return) for a 64-bit `atomicrmw umin ... seq_cst`
; applied directly to the %ptr argument in global memory (addrspace 1).
; The assertions for all three run lines expect:
;   1. an initial 64-bit load of the current memory value,
;   2. a loop at %atomicrmw.start that selects between %in (v[2:3]) and the
;      loaded value using v_cmp_le_u64 + v_cndmask, then issues a 64-bit
;      compare-and-swap with glc (buffer_/flat_/global_atomic_cmpswap_x2
;      depending on the run line),
;   3. loop exit once no lane remains in exec, i.e. every lane's cmpswap
;      returned the value it expected, followed by s_setpc_b64.
; The pointer is used as passed in v[0:1]; no address arithmetic is expected.
6583 define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
6584 ; SI-LABEL: global_atomic_umin_i64_noret:
6586 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6587 ; SI-NEXT: s_mov_b32 s6, 0
6588 ; SI-NEXT: s_mov_b32 s7, 0xf000
6589 ; SI-NEXT: s_mov_b32 s4, s6
6590 ; SI-NEXT: s_mov_b32 s5, s6
6591 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
6592 ; SI-NEXT: s_mov_b64 s[8:9], 0
6593 ; SI-NEXT: .LBB87_1: ; %atomicrmw.start
6594 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6595 ; SI-NEXT: s_waitcnt vmcnt(0)
6596 ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6597 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6598 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6599 ; SI-NEXT: s_waitcnt expcnt(0)
6600 ; SI-NEXT: v_mov_b32_e32 v11, v7
6601 ; SI-NEXT: v_mov_b32_e32 v10, v6
6602 ; SI-NEXT: v_mov_b32_e32 v9, v5
6603 ; SI-NEXT: v_mov_b32_e32 v8, v4
6604 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6605 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
6606 ; SI-NEXT: s_waitcnt vmcnt(0)
6607 ; SI-NEXT: buffer_wbinvl1
6608 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
6609 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6610 ; SI-NEXT: v_mov_b32_e32 v6, v8
6611 ; SI-NEXT: v_mov_b32_e32 v7, v9
6612 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6613 ; SI-NEXT: s_cbranch_execnz .LBB87_1
6614 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6615 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6616 ; SI-NEXT: s_waitcnt expcnt(0)
6617 ; SI-NEXT: s_setpc_b64 s[30:31]
6619 ; VI-LABEL: global_atomic_umin_i64_noret:
6621 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6622 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
6623 ; VI-NEXT: s_mov_b64 s[4:5], 0
6624 ; VI-NEXT: .LBB87_1: ; %atomicrmw.start
6625 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6626 ; VI-NEXT: s_waitcnt vmcnt(0)
6627 ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6628 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6629 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6630 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6631 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6632 ; VI-NEXT: s_waitcnt vmcnt(0)
6633 ; VI-NEXT: buffer_wbinvl1_vol
6634 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6635 ; VI-NEXT: v_mov_b32_e32 v7, v5
6636 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6637 ; VI-NEXT: v_mov_b32_e32 v6, v4
6638 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6639 ; VI-NEXT: s_cbranch_execnz .LBB87_1
6640 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6641 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6642 ; VI-NEXT: s_setpc_b64 s[30:31]
6644 ; GFX9-LABEL: global_atomic_umin_i64_noret:
6646 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6647 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
6648 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6649 ; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
6650 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6651 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6652 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6653 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6654 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6655 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6656 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
6657 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6658 ; GFX9-NEXT: buffer_wbinvl1_vol
6659 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6660 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
6661 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6662 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
6663 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6664 ; GFX9-NEXT: s_cbranch_execnz .LBB87_1
6665 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6666 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6667 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6668 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
; Function test (void return) for a 64-bit `atomicrmw umin ... seq_cst` on
; global memory (addrspace 1) at a constant offset: the address is
; %out + 32 bytes (getelementptr i64 with constant index 4).
; The assertions expect the same load + compare-and-swap loop shape as the
; variant without an offset, and additionally pin how the constant is applied:
;   - SI run   -> folded into the load and the cmpswap as offset:32
;   - VI run   -> added to the pointer up front with v_add_u32 / v_addc_u32,
;                 after which the flat instructions carry no offset
;   - GFX9 run -> folded into the global load and cmpswap as offset:32
6672 define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
6673 ; SI-LABEL: global_atomic_umin_i64_noret_offset:
6675 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6676 ; SI-NEXT: s_mov_b32 s6, 0
6677 ; SI-NEXT: s_mov_b32 s7, 0xf000
6678 ; SI-NEXT: s_mov_b32 s4, s6
6679 ; SI-NEXT: s_mov_b32 s5, s6
6680 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
6681 ; SI-NEXT: s_mov_b64 s[8:9], 0
6682 ; SI-NEXT: .LBB88_1: ; %atomicrmw.start
6683 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6684 ; SI-NEXT: s_waitcnt vmcnt(0)
6685 ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6686 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6687 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6688 ; SI-NEXT: s_waitcnt expcnt(0)
6689 ; SI-NEXT: v_mov_b32_e32 v11, v7
6690 ; SI-NEXT: v_mov_b32_e32 v10, v6
6691 ; SI-NEXT: v_mov_b32_e32 v9, v5
6692 ; SI-NEXT: v_mov_b32_e32 v8, v4
6693 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6694 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
6695 ; SI-NEXT: s_waitcnt vmcnt(0)
6696 ; SI-NEXT: buffer_wbinvl1
6697 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
6698 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6699 ; SI-NEXT: v_mov_b32_e32 v6, v8
6700 ; SI-NEXT: v_mov_b32_e32 v7, v9
6701 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6702 ; SI-NEXT: s_cbranch_execnz .LBB88_1
6703 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6704 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6705 ; SI-NEXT: s_waitcnt expcnt(0)
6706 ; SI-NEXT: s_setpc_b64 s[30:31]
6708 ; VI-LABEL: global_atomic_umin_i64_noret_offset:
6710 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6711 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
6712 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6713 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
6714 ; VI-NEXT: s_mov_b64 s[4:5], 0
6715 ; VI-NEXT: .LBB88_1: ; %atomicrmw.start
6716 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6717 ; VI-NEXT: s_waitcnt vmcnt(0)
6718 ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6719 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6720 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6721 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6722 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6723 ; VI-NEXT: s_waitcnt vmcnt(0)
6724 ; VI-NEXT: buffer_wbinvl1_vol
6725 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6726 ; VI-NEXT: v_mov_b32_e32 v7, v5
6727 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6728 ; VI-NEXT: v_mov_b32_e32 v6, v4
6729 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6730 ; VI-NEXT: s_cbranch_execnz .LBB88_1
6731 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6732 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6733 ; VI-NEXT: s_setpc_b64 s[30:31]
6735 ; GFX9-LABEL: global_atomic_umin_i64_noret_offset:
6737 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6738 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
6739 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6740 ; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
6741 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6742 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6743 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6744 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6745 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6746 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6747 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
6748 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6749 ; GFX9-NEXT: buffer_wbinvl1_vol
6750 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6751 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
6752 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6753 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
6754 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6755 ; GFX9-NEXT: s_cbranch_execnz .LBB88_1
6756 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6757 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6758 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6759 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
6760 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
; Function test (i64 return type) for a 64-bit `atomicrmw umin ... seq_cst`
; applied directly to the %ptr argument in global memory (addrspace 1); the
; atomicrmw result is named %result.
; The assertions expect a load followed by a compare-and-swap loop at
; %atomicrmw.start, and pin where the final value ends up before s_setpc_b64:
;   - SI run   -> the incoming pointer and %in are first copied out of v[0:3]
;                 so the cmpswap can use v[0:3] as its data registers; the
;                 returned value is then already in v[0:1]
;   - VI run and GFX9 run -> the cmpswap returns into v[4:5], which is copied
;                 to v[0:1] after the loop
6764 define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
6765 ; SI-LABEL: global_atomic_umin_i64_ret:
6767 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6768 ; SI-NEXT: v_mov_b32_e32 v5, v3
6769 ; SI-NEXT: v_mov_b32_e32 v4, v2
6770 ; SI-NEXT: v_mov_b32_e32 v7, v1
6771 ; SI-NEXT: v_mov_b32_e32 v6, v0
6772 ; SI-NEXT: s_mov_b32 s6, 0
6773 ; SI-NEXT: s_mov_b32 s7, 0xf000
6774 ; SI-NEXT: s_mov_b32 s4, s6
6775 ; SI-NEXT: s_mov_b32 s5, s6
6776 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
6777 ; SI-NEXT: s_mov_b64 s[8:9], 0
6778 ; SI-NEXT: .LBB89_1: ; %atomicrmw.start
6779 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6780 ; SI-NEXT: s_waitcnt vmcnt(0)
6781 ; SI-NEXT: v_mov_b32_e32 v11, v1
6782 ; SI-NEXT: v_mov_b32_e32 v10, v0
6783 ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
6784 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
6785 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
6786 ; SI-NEXT: s_waitcnt expcnt(0)
6787 ; SI-NEXT: v_mov_b32_e32 v0, v8
6788 ; SI-NEXT: v_mov_b32_e32 v1, v9
6789 ; SI-NEXT: v_mov_b32_e32 v2, v10
6790 ; SI-NEXT: v_mov_b32_e32 v3, v11
6791 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6792 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
6793 ; SI-NEXT: s_waitcnt vmcnt(0)
6794 ; SI-NEXT: buffer_wbinvl1
6795 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
6796 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6797 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6798 ; SI-NEXT: s_cbranch_execnz .LBB89_1
6799 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6800 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6801 ; SI-NEXT: s_waitcnt expcnt(0)
6802 ; SI-NEXT: s_setpc_b64 s[30:31]
6804 ; VI-LABEL: global_atomic_umin_i64_ret:
6806 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6807 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
6808 ; VI-NEXT: s_mov_b64 s[4:5], 0
6809 ; VI-NEXT: .LBB89_1: ; %atomicrmw.start
6810 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6811 ; VI-NEXT: s_waitcnt vmcnt(0)
6812 ; VI-NEXT: v_mov_b32_e32 v7, v5
6813 ; VI-NEXT: v_mov_b32_e32 v6, v4
6814 ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6815 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6816 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6817 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6818 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6819 ; VI-NEXT: s_waitcnt vmcnt(0)
6820 ; VI-NEXT: buffer_wbinvl1_vol
6821 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6822 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6823 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6824 ; VI-NEXT: s_cbranch_execnz .LBB89_1
6825 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6826 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6827 ; VI-NEXT: v_mov_b32_e32 v0, v4
6828 ; VI-NEXT: v_mov_b32_e32 v1, v5
6829 ; VI-NEXT: s_setpc_b64 s[30:31]
6831 ; GFX9-LABEL: global_atomic_umin_i64_ret:
6833 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6834 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
6835 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6836 ; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
6837 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6838 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6839 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
6840 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
6841 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6842 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6843 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6844 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6845 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
6846 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6847 ; GFX9-NEXT: buffer_wbinvl1_vol
6848 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6849 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6850 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6851 ; GFX9-NEXT: s_cbranch_execnz .LBB89_1
6852 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6853 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6854 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
6855 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
6856 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6857 %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
; Function test (i64 return type) for a 64-bit `atomicrmw umin ... seq_cst` on
; global memory (addrspace 1) at a constant offset: the address is
; %out + 32 bytes (getelementptr i64 with constant index 4); the atomicrmw
; result is named %result.
; The assertions expect a load followed by a compare-and-swap loop at
; %atomicrmw.start and pin both the offset handling and the result register:
;   - SI run   -> offset:32 on the buffer load and cmpswap; the cmpswap data
;                 registers are v[0:3], so the returned value is in v[0:1]
;   - VI run   -> the offset address is computed into v[4:5] with
;                 v_add_u32 / v_addc_u32; the cmpswap returns into v[0:1]
;   - GFX9 run -> offset:32 on the global load and cmpswap; the cmpswap returns
;                 into v[4:5], which is copied to v[0:1] after the loop
6861 define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
6862 ; SI-LABEL: global_atomic_umin_i64_ret_offset:
6864 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6865 ; SI-NEXT: v_mov_b32_e32 v5, v3
6866 ; SI-NEXT: v_mov_b32_e32 v4, v2
6867 ; SI-NEXT: v_mov_b32_e32 v7, v1
6868 ; SI-NEXT: v_mov_b32_e32 v6, v0
6869 ; SI-NEXT: s_mov_b32 s6, 0
6870 ; SI-NEXT: s_mov_b32 s7, 0xf000
6871 ; SI-NEXT: s_mov_b32 s4, s6
6872 ; SI-NEXT: s_mov_b32 s5, s6
6873 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
6874 ; SI-NEXT: s_mov_b64 s[8:9], 0
6875 ; SI-NEXT: .LBB90_1: ; %atomicrmw.start
6876 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6877 ; SI-NEXT: s_waitcnt vmcnt(0)
6878 ; SI-NEXT: v_mov_b32_e32 v11, v1
6879 ; SI-NEXT: v_mov_b32_e32 v10, v0
6880 ; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
6881 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
6882 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
6883 ; SI-NEXT: s_waitcnt expcnt(0)
6884 ; SI-NEXT: v_mov_b32_e32 v0, v8
6885 ; SI-NEXT: v_mov_b32_e32 v1, v9
6886 ; SI-NEXT: v_mov_b32_e32 v2, v10
6887 ; SI-NEXT: v_mov_b32_e32 v3, v11
6888 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6889 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
6890 ; SI-NEXT: s_waitcnt vmcnt(0)
6891 ; SI-NEXT: buffer_wbinvl1
6892 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
6893 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6894 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
6895 ; SI-NEXT: s_cbranch_execnz .LBB90_1
6896 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6897 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
6898 ; SI-NEXT: s_waitcnt expcnt(0)
6899 ; SI-NEXT: s_setpc_b64 s[30:31]
6901 ; VI-LABEL: global_atomic_umin_i64_ret_offset:
6903 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6904 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
6905 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6906 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
6907 ; VI-NEXT: s_mov_b64 s[4:5], 0
6908 ; VI-NEXT: .LBB90_1: ; %atomicrmw.start
6909 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
6910 ; VI-NEXT: s_waitcnt vmcnt(0)
6911 ; VI-NEXT: v_mov_b32_e32 v9, v1
6912 ; VI-NEXT: v_mov_b32_e32 v8, v0
6913 ; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
6914 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
6915 ; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
6916 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6917 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6918 ; VI-NEXT: s_waitcnt vmcnt(0)
6919 ; VI-NEXT: buffer_wbinvl1_vol
6920 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6921 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6922 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
6923 ; VI-NEXT: s_cbranch_execnz .LBB90_1
6924 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
6925 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
6926 ; VI-NEXT: s_setpc_b64 s[30:31]
6928 ; GFX9-LABEL: global_atomic_umin_i64_ret_offset:
6930 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6931 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
6932 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
6933 ; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
6934 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6935 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6936 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
6937 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
6938 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
6939 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6940 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6941 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6942 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
6943 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6944 ; GFX9-NEXT: buffer_wbinvl1_vol
6945 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6946 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6947 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
6948 ; GFX9-NEXT: s_cbranch_execnz .LBB90_1
6949 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
6950 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
6951 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
6952 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
6953 ; GFX9-NEXT: s_setpc_b64 s[30:31]
6954 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
6955 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
; amdgpu_gfx function test (void return) for a 64-bit
; `atomicrmw umin ... seq_cst` on global memory (addrspace 1) where both the
; pointer and the value are marked `inreg`. In the expected code the pointer
; is read from s[4:5] and the value from s[6:7].
; The assertions expect a load followed by a compare-and-swap loop at
; %atomicrmw.start that compares the scalar value against the loaded value
; with v_cmp_ge_u64. Per run line:
;   - SI run   -> s6/s7 are overwritten to form the upper half of the buffer
;                 descriptor s[4:7], so their incoming contents are saved into
;                 lanes of v0 with v_writelane (v0 itself is spilled to the
;                 stack first), the value is kept in s[34:35] for the loop,
;                 and s6/s7 are restored with v_readlane before returning
;   - VI run   -> the scalar pointer is copied into VGPRs for the flat
;                 load and flat_atomic_cmpswap_x2
;   - GFX9 run -> the global load and cmpswap take s[4:5] as the base with a
;                 zero VGPR (v4)
6959 define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
6960 ; SI-LABEL: global_atomic_umin_i64_noret_scalar:
6962 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6963 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
6964 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
6965 ; SI-NEXT: s_mov_b64 exec, s[34:35]
6966 ; SI-NEXT: s_waitcnt expcnt(0)
6967 ; SI-NEXT: v_writelane_b32 v0, s6, 0
6968 ; SI-NEXT: v_writelane_b32 v0, s7, 1
6969 ; SI-NEXT: s_mov_b32 s35, s7
6970 ; SI-NEXT: s_mov_b32 s34, s6
6971 ; SI-NEXT: s_mov_b32 s7, 0xf000
6972 ; SI-NEXT: s_mov_b32 s6, -1
6973 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
6974 ; SI-NEXT: s_mov_b64 s[36:37], 0
6975 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start
6976 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
6977 ; SI-NEXT: v_mov_b32_e32 v1, s35
6978 ; SI-NEXT: s_waitcnt vmcnt(0)
6979 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4]
6980 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
6981 ; SI-NEXT: v_mov_b32_e32 v1, s34
6982 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
6983 ; SI-NEXT: s_waitcnt expcnt(0)
6984 ; SI-NEXT: v_mov_b32_e32 v8, v4
6985 ; SI-NEXT: v_mov_b32_e32 v7, v3
6986 ; SI-NEXT: v_mov_b32_e32 v6, v2
6987 ; SI-NEXT: v_mov_b32_e32 v5, v1
6988 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6989 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
6990 ; SI-NEXT: s_waitcnt vmcnt(0)
6991 ; SI-NEXT: buffer_wbinvl1
6992 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
6993 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6994 ; SI-NEXT: v_mov_b32_e32 v3, v5
6995 ; SI-NEXT: v_mov_b32_e32 v4, v6
6996 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
6997 ; SI-NEXT: s_cbranch_execnz .LBB91_1
6998 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
6999 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7000 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7001 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7002 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7003 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7004 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7005 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7006 ; SI-NEXT: s_setpc_b64 s[30:31]
7008 ; VI-LABEL: global_atomic_umin_i64_noret_scalar:
7010 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7011 ; VI-NEXT: v_mov_b32_e32 v0, s4
7012 ; VI-NEXT: v_mov_b32_e32 v1, s5
7013 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7014 ; VI-NEXT: s_mov_b64 s[34:35], 0
7015 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start
7016 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7017 ; VI-NEXT: s_waitcnt vmcnt(0)
7018 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7019 ; VI-NEXT: v_mov_b32_e32 v0, s7
7020 ; VI-NEXT: v_mov_b32_e32 v6, s6
7021 ; VI-NEXT: v_mov_b32_e32 v4, s4
7022 ; VI-NEXT: v_mov_b32_e32 v5, s5
7023 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7024 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7025 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7026 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7027 ; VI-NEXT: s_waitcnt vmcnt(0)
7028 ; VI-NEXT: buffer_wbinvl1_vol
7029 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7030 ; VI-NEXT: v_mov_b32_e32 v3, v1
7031 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7032 ; VI-NEXT: v_mov_b32_e32 v2, v0
7033 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7034 ; VI-NEXT: s_cbranch_execnz .LBB91_1
7035 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7036 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7037 ; VI-NEXT: s_setpc_b64 s[30:31]
7039 ; GFX9-LABEL: global_atomic_umin_i64_noret_scalar:
7041 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7042 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7043 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
7044 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7045 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
7046 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7047 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7048 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7049 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7050 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
7051 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7052 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
7053 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7054 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
7055 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7056 ; GFX9-NEXT: buffer_wbinvl1_vol
7057 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7058 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
7059 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7060 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
7061 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7062 ; GFX9-NEXT: s_cbranch_execnz .LBB91_1
7063 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7064 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7065 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7066 %tmp0 = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
; Unsigned-min atomicrmw (seq_cst) on the i64 located four elements past %out,
; with the pointer and operand passed as inreg arguments under the amdgpu_gfx
; calling convention; the function returns void, so the old value is unused.
; Every run line is expected to expand the operation into a load followed by a
; 64-bit compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end), and the
; 4 x i64 element offset shows up as a 32-byte offset. The SI expectation also
; parks s6/s7 in lanes of a spilled VGPR (v_writelane/v_readlane) because it
; overwrites those two registers before issuing its buffer instructions.
7070 define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
7071 ; SI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7073 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7074 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7075 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
7076 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7077 ; SI-NEXT: s_waitcnt expcnt(0)
7078 ; SI-NEXT: v_writelane_b32 v0, s6, 0
7079 ; SI-NEXT: v_writelane_b32 v0, s7, 1
7080 ; SI-NEXT: s_mov_b32 s35, s7
7081 ; SI-NEXT: s_mov_b32 s34, s6
7082 ; SI-NEXT: s_mov_b32 s7, 0xf000
7083 ; SI-NEXT: s_mov_b32 s6, -1
7084 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
7085 ; SI-NEXT: s_mov_b64 s[36:37], 0
7086 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start
7087 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7088 ; SI-NEXT: v_mov_b32_e32 v1, s35
7089 ; SI-NEXT: s_waitcnt vmcnt(0)
7090 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4]
7091 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
7092 ; SI-NEXT: v_mov_b32_e32 v1, s34
7093 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
7094 ; SI-NEXT: s_waitcnt expcnt(0)
7095 ; SI-NEXT: v_mov_b32_e32 v8, v4
7096 ; SI-NEXT: v_mov_b32_e32 v7, v3
7097 ; SI-NEXT: v_mov_b32_e32 v6, v2
7098 ; SI-NEXT: v_mov_b32_e32 v5, v1
7099 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7100 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
7101 ; SI-NEXT: s_waitcnt vmcnt(0)
7102 ; SI-NEXT: buffer_wbinvl1
7103 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
7104 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7105 ; SI-NEXT: v_mov_b32_e32 v3, v5
7106 ; SI-NEXT: v_mov_b32_e32 v4, v6
7107 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7108 ; SI-NEXT: s_cbranch_execnz .LBB92_1
7109 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7110 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7111 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7112 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7113 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7114 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7115 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7116 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7117 ; SI-NEXT: s_setpc_b64 s[30:31]
7119 ; VI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7121 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7122 ; VI-NEXT: s_add_u32 s34, s4, 32
7123 ; VI-NEXT: s_addc_u32 s35, s5, 0
7124 ; VI-NEXT: v_mov_b32_e32 v0, s34
7125 ; VI-NEXT: v_mov_b32_e32 v1, s35
7126 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7127 ; VI-NEXT: s_mov_b64 s[36:37], 0
7128 ; VI-NEXT: .LBB92_1: ; %atomicrmw.start
7129 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7130 ; VI-NEXT: s_waitcnt vmcnt(0)
7131 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7132 ; VI-NEXT: v_mov_b32_e32 v0, s7
7133 ; VI-NEXT: v_mov_b32_e32 v6, s6
7134 ; VI-NEXT: v_mov_b32_e32 v4, s34
7135 ; VI-NEXT: v_mov_b32_e32 v5, s35
7136 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7137 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7138 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7139 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7140 ; VI-NEXT: s_waitcnt vmcnt(0)
7141 ; VI-NEXT: buffer_wbinvl1_vol
7142 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7143 ; VI-NEXT: v_mov_b32_e32 v3, v1
7144 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7145 ; VI-NEXT: v_mov_b32_e32 v2, v0
7146 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7147 ; VI-NEXT: s_cbranch_execnz .LBB92_1
7148 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7149 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
7150 ; VI-NEXT: s_setpc_b64 s[30:31]
7152 ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar:
7154 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7155 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7156 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
7157 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7158 ; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
7159 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7161 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7162 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7163 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
7164 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7165 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
7166 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7167 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
7168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7169 ; GFX9-NEXT: buffer_wbinvl1_vol
7170 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7171 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
7172 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7173 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
7174 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7175 ; GFX9-NEXT: s_cbranch_execnz .LBB92_1
7176 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7177 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7178 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7179 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
7180 %tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
; Returning variant of the scalar unsigned-min test: atomicrmw umin (seq_cst)
; directly on %ptr, with inreg pointer/operand arguments under amdgpu_gfx and
; an i64 return type. Every run line is expected to expand the operation into
; a load plus a 64-bit compare-and-swap loop, and the value produced by the
; last cmpswap sits in v[0:1] when s_setpc_b64 is reached (the SI output needs
; two extra v_mov_b32 copies after the loop to get it there).
; NOTE(review): the ret instruction is not visible in this excerpt; presumably
; it returns %result -- confirm.
7184 define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
7185 ; SI-LABEL: global_atomic_umin_i64_ret_scalar:
7187 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7188 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7189 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
7190 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7191 ; SI-NEXT: s_waitcnt expcnt(0)
7192 ; SI-NEXT: v_writelane_b32 v2, s6, 0
7193 ; SI-NEXT: v_writelane_b32 v2, s7, 1
7194 ; SI-NEXT: s_mov_b32 s35, s7
7195 ; SI-NEXT: s_mov_b32 s34, s6
7196 ; SI-NEXT: s_mov_b32 s7, 0xf000
7197 ; SI-NEXT: s_mov_b32 s6, -1
7198 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
7199 ; SI-NEXT: s_mov_b64 s[36:37], 0
7200 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start
7201 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7202 ; SI-NEXT: s_waitcnt vmcnt(0)
7203 ; SI-NEXT: v_mov_b32_e32 v8, v4
7204 ; SI-NEXT: v_mov_b32_e32 v7, v3
7205 ; SI-NEXT: v_mov_b32_e32 v0, s35
7206 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8]
7207 ; SI-NEXT: s_waitcnt expcnt(0)
7208 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
7209 ; SI-NEXT: v_mov_b32_e32 v0, s34
7210 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
7211 ; SI-NEXT: v_mov_b32_e32 v3, v5
7212 ; SI-NEXT: v_mov_b32_e32 v4, v6
7213 ; SI-NEXT: v_mov_b32_e32 v5, v7
7214 ; SI-NEXT: v_mov_b32_e32 v6, v8
7215 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7216 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
7217 ; SI-NEXT: s_waitcnt vmcnt(0)
7218 ; SI-NEXT: buffer_wbinvl1
7219 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
7220 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7221 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7222 ; SI-NEXT: s_cbranch_execnz .LBB93_1
7223 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7224 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7225 ; SI-NEXT: v_mov_b32_e32 v0, v3
7226 ; SI-NEXT: v_mov_b32_e32 v1, v4
7227 ; SI-NEXT: v_readlane_b32 s7, v2, 1
7228 ; SI-NEXT: v_readlane_b32 s6, v2, 0
7229 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7230 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
7231 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7232 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7233 ; SI-NEXT: s_setpc_b64 s[30:31]
7235 ; VI-LABEL: global_atomic_umin_i64_ret_scalar:
7237 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7238 ; VI-NEXT: v_mov_b32_e32 v0, s4
7239 ; VI-NEXT: v_mov_b32_e32 v1, s5
7240 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7241 ; VI-NEXT: s_mov_b64 s[34:35], 0
7242 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start
7243 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7244 ; VI-NEXT: s_waitcnt vmcnt(0)
7245 ; VI-NEXT: v_mov_b32_e32 v3, v1
7246 ; VI-NEXT: v_mov_b32_e32 v2, v0
7247 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7248 ; VI-NEXT: v_mov_b32_e32 v0, s7
7249 ; VI-NEXT: v_mov_b32_e32 v6, s6
7250 ; VI-NEXT: v_mov_b32_e32 v4, s4
7251 ; VI-NEXT: v_mov_b32_e32 v5, s5
7252 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7253 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7255 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7256 ; VI-NEXT: s_waitcnt vmcnt(0)
7257 ; VI-NEXT: buffer_wbinvl1_vol
7258 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7259 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7260 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7261 ; VI-NEXT: s_cbranch_execnz .LBB93_1
7262 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7263 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7264 ; VI-NEXT: s_setpc_b64 s[30:31]
7266 ; GFX9-LABEL: global_atomic_umin_i64_ret_scalar:
7268 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7269 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7270 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
7271 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7272 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
7273 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7274 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7275 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
7276 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
7277 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6]
7278 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7279 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
7280 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
7281 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
7282 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7283 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
7284 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7285 ; GFX9-NEXT: buffer_wbinvl1_vol
7286 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
7287 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7288 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7289 ; GFX9-NEXT: s_cbranch_execnz .LBB93_1
7290 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7291 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7292 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7293 %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning, offset variant of the scalar unsigned-min test: atomicrmw umin
; (seq_cst) on the i64 four elements past %out, with inreg pointer/operand
; arguments under amdgpu_gfx and an i64 return type. Every run line is
; expected to produce a load plus a 64-bit compare-and-swap loop. The element
; offset appears as offset:32 on the SI and GFX9 memory instructions, while
; the VI output adds 32 to the pointer with s_add_u32/s_addc_u32 up front.
; NOTE(review): the ret instruction is not visible in this excerpt; presumably
; it returns %result -- confirm.
7297 define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
7298 ; SI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7300 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7301 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7302 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
7303 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7304 ; SI-NEXT: s_waitcnt expcnt(0)
7305 ; SI-NEXT: v_writelane_b32 v2, s6, 0
7306 ; SI-NEXT: v_writelane_b32 v2, s7, 1
7307 ; SI-NEXT: s_mov_b32 s35, s7
7308 ; SI-NEXT: s_mov_b32 s34, s6
7309 ; SI-NEXT: s_mov_b32 s7, 0xf000
7310 ; SI-NEXT: s_mov_b32 s6, -1
7311 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
7312 ; SI-NEXT: s_mov_b64 s[36:37], 0
7313 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start
7314 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7315 ; SI-NEXT: s_waitcnt vmcnt(0)
7316 ; SI-NEXT: v_mov_b32_e32 v8, v4
7317 ; SI-NEXT: v_mov_b32_e32 v7, v3
7318 ; SI-NEXT: v_mov_b32_e32 v0, s35
7319 ; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8]
7320 ; SI-NEXT: s_waitcnt expcnt(0)
7321 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
7322 ; SI-NEXT: v_mov_b32_e32 v0, s34
7323 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
7324 ; SI-NEXT: v_mov_b32_e32 v3, v5
7325 ; SI-NEXT: v_mov_b32_e32 v4, v6
7326 ; SI-NEXT: v_mov_b32_e32 v5, v7
7327 ; SI-NEXT: v_mov_b32_e32 v6, v8
7328 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7329 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
7330 ; SI-NEXT: s_waitcnt vmcnt(0)
7331 ; SI-NEXT: buffer_wbinvl1
7332 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
7333 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7334 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7335 ; SI-NEXT: s_cbranch_execnz .LBB94_1
7336 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7337 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7338 ; SI-NEXT: v_mov_b32_e32 v0, v3
7339 ; SI-NEXT: v_mov_b32_e32 v1, v4
7340 ; SI-NEXT: v_readlane_b32 s7, v2, 1
7341 ; SI-NEXT: v_readlane_b32 s6, v2, 0
7342 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7343 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
7344 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7345 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7346 ; SI-NEXT: s_setpc_b64 s[30:31]
7348 ; VI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7350 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7351 ; VI-NEXT: s_add_u32 s34, s4, 32
7352 ; VI-NEXT: s_addc_u32 s35, s5, 0
7353 ; VI-NEXT: v_mov_b32_e32 v0, s34
7354 ; VI-NEXT: v_mov_b32_e32 v1, s35
7355 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7356 ; VI-NEXT: s_mov_b64 s[36:37], 0
7357 ; VI-NEXT: .LBB94_1: ; %atomicrmw.start
7358 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7359 ; VI-NEXT: s_waitcnt vmcnt(0)
7360 ; VI-NEXT: v_mov_b32_e32 v3, v1
7361 ; VI-NEXT: v_mov_b32_e32 v2, v0
7362 ; VI-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
7363 ; VI-NEXT: v_mov_b32_e32 v0, s7
7364 ; VI-NEXT: v_mov_b32_e32 v6, s6
7365 ; VI-NEXT: v_mov_b32_e32 v4, s34
7366 ; VI-NEXT: v_mov_b32_e32 v5, s35
7367 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7368 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7369 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7370 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7371 ; VI-NEXT: s_waitcnt vmcnt(0)
7372 ; VI-NEXT: buffer_wbinvl1_vol
7373 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7374 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7375 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7376 ; VI-NEXT: s_cbranch_execnz .LBB94_1
7377 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7378 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
7379 ; VI-NEXT: s_setpc_b64 s[30:31]
7381 ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar:
7383 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7384 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
7385 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
7386 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7387 ; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
7388 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7389 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7390 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
7391 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
7392 ; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[5:6]
7393 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7394 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
7395 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
7396 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
7397 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7398 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
7399 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7400 ; GFX9-NEXT: buffer_wbinvl1_vol
7401 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
7402 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7403 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7404 ; GFX9-NEXT: s_cbranch_execnz .LBB94_1
7405 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7406 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7407 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7408 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
7409 %result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst
7413 ; ---------------------------------------------------------------------
7415 ; ---------------------------------------------------------------------
; Signed-min atomicrmw (seq_cst) directly on %ptr using the default calling
; convention; the function returns void, so the old value is unused. In the
; expected output the pointer arrives in v[0:1] and the operand in v[2:3], and
; every run line expands the operation into a load followed by a 64-bit
; compare-and-swap loop whose select is driven by a signed v_cmp_le_i64.
7417 define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
7418 ; SI-LABEL: global_atomic_min_i64_noret:
7420 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7421 ; SI-NEXT: s_mov_b32 s6, 0
7422 ; SI-NEXT: s_mov_b32 s7, 0xf000
7423 ; SI-NEXT: s_mov_b32 s4, s6
7424 ; SI-NEXT: s_mov_b32 s5, s6
7425 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
7426 ; SI-NEXT: s_mov_b64 s[8:9], 0
7427 ; SI-NEXT: .LBB95_1: ; %atomicrmw.start
7428 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7429 ; SI-NEXT: s_waitcnt vmcnt(0)
7430 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7431 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7432 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7433 ; SI-NEXT: s_waitcnt expcnt(0)
7434 ; SI-NEXT: v_mov_b32_e32 v11, v7
7435 ; SI-NEXT: v_mov_b32_e32 v10, v6
7436 ; SI-NEXT: v_mov_b32_e32 v9, v5
7437 ; SI-NEXT: v_mov_b32_e32 v8, v4
7438 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7439 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
7440 ; SI-NEXT: s_waitcnt vmcnt(0)
7441 ; SI-NEXT: buffer_wbinvl1
7442 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
7443 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7444 ; SI-NEXT: v_mov_b32_e32 v6, v8
7445 ; SI-NEXT: v_mov_b32_e32 v7, v9
7446 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7447 ; SI-NEXT: s_cbranch_execnz .LBB95_1
7448 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7449 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7450 ; SI-NEXT: s_waitcnt expcnt(0)
7451 ; SI-NEXT: s_setpc_b64 s[30:31]
7453 ; VI-LABEL: global_atomic_min_i64_noret:
7455 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7456 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
7457 ; VI-NEXT: s_mov_b64 s[4:5], 0
7458 ; VI-NEXT: .LBB95_1: ; %atomicrmw.start
7459 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7460 ; VI-NEXT: s_waitcnt vmcnt(0)
7461 ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7462 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7463 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7464 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7465 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7466 ; VI-NEXT: s_waitcnt vmcnt(0)
7467 ; VI-NEXT: buffer_wbinvl1_vol
7468 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7469 ; VI-NEXT: v_mov_b32_e32 v7, v5
7470 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7471 ; VI-NEXT: v_mov_b32_e32 v6, v4
7472 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7473 ; VI-NEXT: s_cbranch_execnz .LBB95_1
7474 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7475 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7476 ; VI-NEXT: s_setpc_b64 s[30:31]
7478 ; GFX9-LABEL: global_atomic_min_i64_noret:
7480 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7481 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
7482 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7483 ; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
7484 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7485 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7486 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7487 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7488 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7489 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7490 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
7491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7492 ; GFX9-NEXT: buffer_wbinvl1_vol
7493 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7494 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
7495 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7496 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
7497 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7498 ; GFX9-NEXT: s_cbranch_execnz .LBB95_1
7499 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7500 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7501 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7502 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
; Signed-min atomicrmw (seq_cst) on the i64 four elements past %out, default
; calling convention, void return (old value unused). Every run line is
; expected to expand into a load plus a 64-bit compare-and-swap loop. The
; element offset is folded into the memory instructions as offset:32 for SI
; and GFX9, whereas the VI output adds 32 to the pointer in v[0:1] with
; v_add_u32/v_addc_u32 before the loop.
7506 define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
7507 ; SI-LABEL: global_atomic_min_i64_noret_offset:
7509 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7510 ; SI-NEXT: s_mov_b32 s6, 0
7511 ; SI-NEXT: s_mov_b32 s7, 0xf000
7512 ; SI-NEXT: s_mov_b32 s4, s6
7513 ; SI-NEXT: s_mov_b32 s5, s6
7514 ; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
7515 ; SI-NEXT: s_mov_b64 s[8:9], 0
7516 ; SI-NEXT: .LBB96_1: ; %atomicrmw.start
7517 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7518 ; SI-NEXT: s_waitcnt vmcnt(0)
7519 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7520 ; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7521 ; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7522 ; SI-NEXT: s_waitcnt expcnt(0)
7523 ; SI-NEXT: v_mov_b32_e32 v11, v7
7524 ; SI-NEXT: v_mov_b32_e32 v10, v6
7525 ; SI-NEXT: v_mov_b32_e32 v9, v5
7526 ; SI-NEXT: v_mov_b32_e32 v8, v4
7527 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7528 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
7529 ; SI-NEXT: s_waitcnt vmcnt(0)
7530 ; SI-NEXT: buffer_wbinvl1
7531 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
7532 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7533 ; SI-NEXT: v_mov_b32_e32 v6, v8
7534 ; SI-NEXT: v_mov_b32_e32 v7, v9
7535 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7536 ; SI-NEXT: s_cbranch_execnz .LBB96_1
7537 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7538 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7539 ; SI-NEXT: s_waitcnt expcnt(0)
7540 ; SI-NEXT: s_setpc_b64 s[30:31]
7542 ; VI-LABEL: global_atomic_min_i64_noret_offset:
7544 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7545 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7546 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7547 ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
7548 ; VI-NEXT: s_mov_b64 s[4:5], 0
7549 ; VI-NEXT: .LBB96_1: ; %atomicrmw.start
7550 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7551 ; VI-NEXT: s_waitcnt vmcnt(0)
7552 ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7553 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7554 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7555 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7556 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7557 ; VI-NEXT: s_waitcnt vmcnt(0)
7558 ; VI-NEXT: buffer_wbinvl1_vol
7559 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7560 ; VI-NEXT: v_mov_b32_e32 v7, v5
7561 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7562 ; VI-NEXT: v_mov_b32_e32 v6, v4
7563 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7564 ; VI-NEXT: s_cbranch_execnz .LBB96_1
7565 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7566 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7567 ; VI-NEXT: s_setpc_b64 s[30:31]
7569 ; GFX9-LABEL: global_atomic_min_i64_noret_offset:
7571 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7572 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
7573 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7574 ; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
7575 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7576 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7577 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7578 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7579 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7580 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7581 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7582 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7583 ; GFX9-NEXT: buffer_wbinvl1_vol
7584 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7585 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
7586 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7587 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
7588 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7589 ; GFX9-NEXT: s_cbranch_execnz .LBB96_1
7590 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7591 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7592 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7593 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
7594 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Returning variant of the signed-min test: atomicrmw min (seq_cst) directly
; on %ptr, default calling convention, i64 return type. Every run line is
; expected to expand into a load plus a 64-bit compare-and-swap loop, with the
; cmpswap result in v[0:1] when s_setpc_b64 is reached. The SI output gets
; there by first copying the incoming pointer and operand out of v[0:3], while
; the VI and GFX9 outputs copy v[4:5] into v[0:1] after the loop.
; NOTE(review): the ret instruction is not visible in this excerpt; presumably
; it returns %result -- confirm.
7598 define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
7599 ; SI-LABEL: global_atomic_min_i64_ret:
7601 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7602 ; SI-NEXT: v_mov_b32_e32 v5, v3
7603 ; SI-NEXT: v_mov_b32_e32 v4, v2
7604 ; SI-NEXT: v_mov_b32_e32 v7, v1
7605 ; SI-NEXT: v_mov_b32_e32 v6, v0
7606 ; SI-NEXT: s_mov_b32 s6, 0
7607 ; SI-NEXT: s_mov_b32 s7, 0xf000
7608 ; SI-NEXT: s_mov_b32 s4, s6
7609 ; SI-NEXT: s_mov_b32 s5, s6
7610 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
7611 ; SI-NEXT: s_mov_b64 s[8:9], 0
7612 ; SI-NEXT: .LBB97_1: ; %atomicrmw.start
7613 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7614 ; SI-NEXT: s_waitcnt vmcnt(0)
7615 ; SI-NEXT: v_mov_b32_e32 v11, v1
7616 ; SI-NEXT: v_mov_b32_e32 v10, v0
7617 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
7618 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
7619 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
7620 ; SI-NEXT: s_waitcnt expcnt(0)
7621 ; SI-NEXT: v_mov_b32_e32 v0, v8
7622 ; SI-NEXT: v_mov_b32_e32 v1, v9
7623 ; SI-NEXT: v_mov_b32_e32 v2, v10
7624 ; SI-NEXT: v_mov_b32_e32 v3, v11
7625 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7626 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
7627 ; SI-NEXT: s_waitcnt vmcnt(0)
7628 ; SI-NEXT: buffer_wbinvl1
7629 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
7630 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7631 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7632 ; SI-NEXT: s_cbranch_execnz .LBB97_1
7633 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7634 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7635 ; SI-NEXT: s_waitcnt expcnt(0)
7636 ; SI-NEXT: s_setpc_b64 s[30:31]
7638 ; VI-LABEL: global_atomic_min_i64_ret:
7640 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7641 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
7642 ; VI-NEXT: s_mov_b64 s[4:5], 0
7643 ; VI-NEXT: .LBB97_1: ; %atomicrmw.start
7644 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7645 ; VI-NEXT: s_waitcnt vmcnt(0)
7646 ; VI-NEXT: v_mov_b32_e32 v7, v5
7647 ; VI-NEXT: v_mov_b32_e32 v6, v4
7648 ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7649 ; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7650 ; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7651 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7652 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7653 ; VI-NEXT: s_waitcnt vmcnt(0)
7654 ; VI-NEXT: buffer_wbinvl1_vol
7655 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7656 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7657 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7658 ; VI-NEXT: s_cbranch_execnz .LBB97_1
7659 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7660 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7661 ; VI-NEXT: v_mov_b32_e32 v0, v4
7662 ; VI-NEXT: v_mov_b32_e32 v1, v5
7663 ; VI-NEXT: s_setpc_b64 s[30:31]
7665 ; GFX9-LABEL: global_atomic_min_i64_ret:
7667 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7668 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
7669 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7670 ; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start
7671 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7672 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7673 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
7674 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
7675 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7676 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7677 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7678 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7679 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
7680 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7681 ; GFX9-NEXT: buffer_wbinvl1_vol
7682 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7683 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7684 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7685 ; GFX9-NEXT: s_cbranch_execnz .LBB97_1
7686 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7687 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7688 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
7689 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
7690 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7691 %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
; Returning, offset variant of the signed-min test: atomicrmw min (seq_cst) on
; the i64 four elements past %out, default calling convention, i64 return
; type. Every run line is expected to expand into a load plus a 64-bit
; compare-and-swap loop. SI and GFX9 fold the element offset into the memory
; instructions as offset:32; the VI output instead computes the address into
; v[4:5] with v_add_u32/v_addc_u32, which lets its cmpswap write v[0:1]
; directly with no copy after the loop.
; NOTE(review): the ret instruction is not visible in this excerpt; presumably
; it returns %result -- confirm.
7695 define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
7696 ; SI-LABEL: global_atomic_min_i64_ret_offset:
7698 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7699 ; SI-NEXT: v_mov_b32_e32 v5, v3
7700 ; SI-NEXT: v_mov_b32_e32 v4, v2
7701 ; SI-NEXT: v_mov_b32_e32 v7, v1
7702 ; SI-NEXT: v_mov_b32_e32 v6, v0
7703 ; SI-NEXT: s_mov_b32 s6, 0
7704 ; SI-NEXT: s_mov_b32 s7, 0xf000
7705 ; SI-NEXT: s_mov_b32 s4, s6
7706 ; SI-NEXT: s_mov_b32 s5, s6
7707 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
7708 ; SI-NEXT: s_mov_b64 s[8:9], 0
7709 ; SI-NEXT: .LBB98_1: ; %atomicrmw.start
7710 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7711 ; SI-NEXT: s_waitcnt vmcnt(0)
7712 ; SI-NEXT: v_mov_b32_e32 v11, v1
7713 ; SI-NEXT: v_mov_b32_e32 v10, v0
7714 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
7715 ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
7716 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
7717 ; SI-NEXT: s_waitcnt expcnt(0)
7718 ; SI-NEXT: v_mov_b32_e32 v0, v8
7719 ; SI-NEXT: v_mov_b32_e32 v1, v9
7720 ; SI-NEXT: v_mov_b32_e32 v2, v10
7721 ; SI-NEXT: v_mov_b32_e32 v3, v11
7722 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7723 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
7724 ; SI-NEXT: s_waitcnt vmcnt(0)
7725 ; SI-NEXT: buffer_wbinvl1
7726 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
7727 ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7728 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
7729 ; SI-NEXT: s_cbranch_execnz .LBB98_1
7730 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7731 ; SI-NEXT: s_or_b64 exec, exec, s[8:9]
7732 ; SI-NEXT: s_waitcnt expcnt(0)
7733 ; SI-NEXT: s_setpc_b64 s[30:31]
7735 ; VI-LABEL: global_atomic_min_i64_ret_offset:
7737 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7738 ; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
7739 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
7740 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
7741 ; VI-NEXT: s_mov_b64 s[4:5], 0
7742 ; VI-NEXT: .LBB98_1: ; %atomicrmw.start
7743 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7744 ; VI-NEXT: s_waitcnt vmcnt(0)
7745 ; VI-NEXT: v_mov_b32_e32 v9, v1
7746 ; VI-NEXT: v_mov_b32_e32 v8, v0
7747 ; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
7748 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
7749 ; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
7750 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7751 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
7752 ; VI-NEXT: s_waitcnt vmcnt(0)
7753 ; VI-NEXT: buffer_wbinvl1_vol
7754 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7755 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7756 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
7757 ; VI-NEXT: s_cbranch_execnz .LBB98_1
7758 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7759 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
7760 ; VI-NEXT: s_setpc_b64 s[30:31]
7762 ; GFX9-LABEL: global_atomic_min_i64_ret_offset:
7764 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7765 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
7766 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
7767 ; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
7768 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7769 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7770 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
7771 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
7772 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
7773 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
7774 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
7775 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7776 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
7777 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7778 ; GFX9-NEXT: buffer_wbinvl1_vol
7779 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7780 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7781 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
7782 ; GFX9-NEXT: s_cbranch_execnz .LBB98_1
7783 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7784 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
7785 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
7786 ; GFX9-NEXT: v_mov_b32_e32 v1, v5
7787 ; GFX9-NEXT: s_setpc_b64 s[30:31]
7788 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
7789 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Test: 64-bit seq_cst `atomicrmw min` on a global (addrspace(1)) pointer, with
; both the pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention. The function is declared void, so the old value is not returned.
; The expected code for all three check prefixes expands the operation into a
; compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end) that uses a signed
; 64-bit compare (v_cmp_ge_i64) to pick the value to store.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
7793 define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
7794 ; SI-LABEL: global_atomic_min_i64_noret_scalar:
7796 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7797 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7798 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
7799 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7800 ; SI-NEXT: s_waitcnt expcnt(0)
7801 ; SI-NEXT: v_writelane_b32 v0, s6, 0
7802 ; SI-NEXT: v_writelane_b32 v0, s7, 1
7803 ; SI-NEXT: s_mov_b32 s35, s7
7804 ; SI-NEXT: s_mov_b32 s34, s6
7805 ; SI-NEXT: s_mov_b32 s7, 0xf000
7806 ; SI-NEXT: s_mov_b32 s6, -1
7807 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
7808 ; SI-NEXT: s_mov_b64 s[36:37], 0
7809 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start
7810 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7811 ; SI-NEXT: v_mov_b32_e32 v1, s35
7812 ; SI-NEXT: s_waitcnt vmcnt(0)
7813 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4]
7814 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
7815 ; SI-NEXT: v_mov_b32_e32 v1, s34
7816 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
7817 ; SI-NEXT: s_waitcnt expcnt(0)
7818 ; SI-NEXT: v_mov_b32_e32 v8, v4
7819 ; SI-NEXT: v_mov_b32_e32 v7, v3
7820 ; SI-NEXT: v_mov_b32_e32 v6, v2
7821 ; SI-NEXT: v_mov_b32_e32 v5, v1
7822 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7823 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
7824 ; SI-NEXT: s_waitcnt vmcnt(0)
7825 ; SI-NEXT: buffer_wbinvl1
7826 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
7827 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7828 ; SI-NEXT: v_mov_b32_e32 v3, v5
7829 ; SI-NEXT: v_mov_b32_e32 v4, v6
7830 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7831 ; SI-NEXT: s_cbranch_execnz .LBB99_1
7832 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7833 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7834 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7835 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7836 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7837 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7838 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7839 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7840 ; SI-NEXT: s_setpc_b64 s[30:31]
7842 ; VI-LABEL: global_atomic_min_i64_noret_scalar:
7844 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7845 ; VI-NEXT: v_mov_b32_e32 v0, s4
7846 ; VI-NEXT: v_mov_b32_e32 v1, s5
7847 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7848 ; VI-NEXT: s_mov_b64 s[34:35], 0
7849 ; VI-NEXT: .LBB99_1: ; %atomicrmw.start
7850 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7851 ; VI-NEXT: s_waitcnt vmcnt(0)
7852 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7853 ; VI-NEXT: v_mov_b32_e32 v0, s7
7854 ; VI-NEXT: v_mov_b32_e32 v6, s6
7855 ; VI-NEXT: v_mov_b32_e32 v4, s4
7856 ; VI-NEXT: v_mov_b32_e32 v5, s5
7857 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7858 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7859 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7860 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7861 ; VI-NEXT: s_waitcnt vmcnt(0)
7862 ; VI-NEXT: buffer_wbinvl1_vol
7863 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7864 ; VI-NEXT: v_mov_b32_e32 v3, v1
7865 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7866 ; VI-NEXT: v_mov_b32_e32 v2, v0
7867 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
7868 ; VI-NEXT: s_cbranch_execnz .LBB99_1
7869 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7870 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
7871 ; VI-NEXT: s_setpc_b64 s[30:31]
7873 ; GFX9-LABEL: global_atomic_min_i64_noret_scalar:
7875 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7876 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7877 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
7878 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7879 ; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
7880 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7881 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7882 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7883 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7884 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
7885 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7886 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
7887 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7888 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
7889 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7890 ; GFX9-NEXT: buffer_wbinvl1_vol
7891 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7892 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
7893 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7894 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
7895 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
7896 ; GFX9-NEXT: s_cbranch_execnz .LBB99_1
7897 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
7898 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
7899 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; The operation under test, applied directly to %ptr with no address offset.
7900 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
; Test: same as the no-return scalar `atomicrmw min` case, but the atomic is
; applied to element 4 of %out (a 32-byte displacement for i64 elements).
; Pointer and operand are `inreg` amdgpu_gfx arguments; the function is void.
; In the expected code the displacement appears as `offset:32` on the buffer
; and global instructions, while the flat-instruction variant first adds 32 to
; the address with s_add_u32 / s_addc_u32.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
7904 define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
7905 ; SI-LABEL: global_atomic_min_i64_noret_offset_scalar:
7907 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7908 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7909 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
7910 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7911 ; SI-NEXT: s_waitcnt expcnt(0)
7912 ; SI-NEXT: v_writelane_b32 v0, s6, 0
7913 ; SI-NEXT: v_writelane_b32 v0, s7, 1
7914 ; SI-NEXT: s_mov_b32 s35, s7
7915 ; SI-NEXT: s_mov_b32 s34, s6
7916 ; SI-NEXT: s_mov_b32 s7, 0xf000
7917 ; SI-NEXT: s_mov_b32 s6, -1
7918 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
7919 ; SI-NEXT: s_mov_b64 s[36:37], 0
7920 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start
7921 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
7922 ; SI-NEXT: v_mov_b32_e32 v1, s35
7923 ; SI-NEXT: s_waitcnt vmcnt(0)
7924 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4]
7925 ; SI-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc
7926 ; SI-NEXT: v_mov_b32_e32 v1, s34
7927 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
7928 ; SI-NEXT: s_waitcnt expcnt(0)
7929 ; SI-NEXT: v_mov_b32_e32 v8, v4
7930 ; SI-NEXT: v_mov_b32_e32 v7, v3
7931 ; SI-NEXT: v_mov_b32_e32 v6, v2
7932 ; SI-NEXT: v_mov_b32_e32 v5, v1
7933 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7934 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
7935 ; SI-NEXT: s_waitcnt vmcnt(0)
7936 ; SI-NEXT: buffer_wbinvl1
7937 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
7938 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7939 ; SI-NEXT: v_mov_b32_e32 v3, v5
7940 ; SI-NEXT: v_mov_b32_e32 v4, v6
7941 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7942 ; SI-NEXT: s_cbranch_execnz .LBB100_1
7943 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
7944 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
7945 ; SI-NEXT: v_readlane_b32 s7, v0, 1
7946 ; SI-NEXT: v_readlane_b32 s6, v0, 0
7947 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
7948 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
7949 ; SI-NEXT: s_mov_b64 exec, s[34:35]
7950 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7951 ; SI-NEXT: s_setpc_b64 s[30:31]
7953 ; VI-LABEL: global_atomic_min_i64_noret_offset_scalar:
7955 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7956 ; VI-NEXT: s_add_u32 s34, s4, 32
7957 ; VI-NEXT: s_addc_u32 s35, s5, 0
7958 ; VI-NEXT: v_mov_b32_e32 v0, s34
7959 ; VI-NEXT: v_mov_b32_e32 v1, s35
7960 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7961 ; VI-NEXT: s_mov_b64 s[36:37], 0
7962 ; VI-NEXT: .LBB100_1: ; %atomicrmw.start
7963 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
7964 ; VI-NEXT: s_waitcnt vmcnt(0)
7965 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7966 ; VI-NEXT: v_mov_b32_e32 v0, s7
7967 ; VI-NEXT: v_mov_b32_e32 v6, s6
7968 ; VI-NEXT: v_mov_b32_e32 v4, s34
7969 ; VI-NEXT: v_mov_b32_e32 v5, s35
7970 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7971 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7972 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7973 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7974 ; VI-NEXT: s_waitcnt vmcnt(0)
7975 ; VI-NEXT: buffer_wbinvl1_vol
7976 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7977 ; VI-NEXT: v_mov_b32_e32 v3, v1
7978 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7979 ; VI-NEXT: v_mov_b32_e32 v2, v0
7980 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
7981 ; VI-NEXT: s_cbranch_execnz .LBB100_1
7982 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
7983 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
7984 ; VI-NEXT: s_setpc_b64 s[30:31]
7986 ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar:
7988 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7989 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7990 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
7991 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
7992 ; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
7993 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7994 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7995 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7996 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
7997 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
7998 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7999 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
8000 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8001 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
8002 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8003 ; GFX9-NEXT: buffer_wbinvl1_vol
8004 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8005 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
8006 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
8007 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
8008 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
8009 ; GFX9-NEXT: s_cbranch_execnz .LBB100_1
8010 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8011 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
8012 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Address element 4 of %out (4 * 8 bytes = 32), then do the atomic on it.
8013 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8014 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Test: 64-bit seq_cst `atomicrmw min` on a global (addrspace(1)) pointer with
; `inreg` amdgpu_gfx arguments, in a function declared to return i64.
; The expected code is a compare-and-swap loop (%atomicrmw.start ..
; %atomicrmw.end); after the loop the value produced by the last cmpswap is
; left in v[0:1] (copied there from v[3:4] in the buffer-instruction variant).
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
8018 define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
8019 ; SI-LABEL: global_atomic_min_i64_ret_scalar:
8021 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8022 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8023 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
8024 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8025 ; SI-NEXT: s_waitcnt expcnt(0)
8026 ; SI-NEXT: v_writelane_b32 v2, s6, 0
8027 ; SI-NEXT: v_writelane_b32 v2, s7, 1
8028 ; SI-NEXT: s_mov_b32 s35, s7
8029 ; SI-NEXT: s_mov_b32 s34, s6
8030 ; SI-NEXT: s_mov_b32 s7, 0xf000
8031 ; SI-NEXT: s_mov_b32 s6, -1
8032 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
8033 ; SI-NEXT: s_mov_b64 s[36:37], 0
8034 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start
8035 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8036 ; SI-NEXT: s_waitcnt vmcnt(0)
8037 ; SI-NEXT: v_mov_b32_e32 v8, v4
8038 ; SI-NEXT: v_mov_b32_e32 v7, v3
8039 ; SI-NEXT: v_mov_b32_e32 v0, s35
8040 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8]
8041 ; SI-NEXT: s_waitcnt expcnt(0)
8042 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
8043 ; SI-NEXT: v_mov_b32_e32 v0, s34
8044 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
8045 ; SI-NEXT: v_mov_b32_e32 v3, v5
8046 ; SI-NEXT: v_mov_b32_e32 v4, v6
8047 ; SI-NEXT: v_mov_b32_e32 v5, v7
8048 ; SI-NEXT: v_mov_b32_e32 v6, v8
8049 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8050 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc
8051 ; SI-NEXT: s_waitcnt vmcnt(0)
8052 ; SI-NEXT: buffer_wbinvl1
8053 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
8054 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
8055 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
8056 ; SI-NEXT: s_cbranch_execnz .LBB101_1
8057 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8058 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
8059 ; SI-NEXT: v_mov_b32_e32 v0, v3
8060 ; SI-NEXT: v_mov_b32_e32 v1, v4
8061 ; SI-NEXT: v_readlane_b32 s7, v2, 1
8062 ; SI-NEXT: v_readlane_b32 s6, v2, 0
8063 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8064 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
8065 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8066 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8067 ; SI-NEXT: s_setpc_b64 s[30:31]
8069 ; VI-LABEL: global_atomic_min_i64_ret_scalar:
8071 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8072 ; VI-NEXT: v_mov_b32_e32 v0, s4
8073 ; VI-NEXT: v_mov_b32_e32 v1, s5
8074 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
8075 ; VI-NEXT: s_mov_b64 s[34:35], 0
8076 ; VI-NEXT: .LBB101_1: ; %atomicrmw.start
8077 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8078 ; VI-NEXT: s_waitcnt vmcnt(0)
8079 ; VI-NEXT: v_mov_b32_e32 v3, v1
8080 ; VI-NEXT: v_mov_b32_e32 v2, v0
8081 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8082 ; VI-NEXT: v_mov_b32_e32 v0, s7
8083 ; VI-NEXT: v_mov_b32_e32 v6, s6
8084 ; VI-NEXT: v_mov_b32_e32 v4, s4
8085 ; VI-NEXT: v_mov_b32_e32 v5, s5
8086 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8087 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8088 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8089 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8090 ; VI-NEXT: s_waitcnt vmcnt(0)
8091 ; VI-NEXT: buffer_wbinvl1_vol
8092 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8093 ; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
8094 ; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
8095 ; VI-NEXT: s_cbranch_execnz .LBB101_1
8096 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8097 ; VI-NEXT: s_or_b64 exec, exec, s[34:35]
8098 ; VI-NEXT: s_setpc_b64 s[30:31]
8100 ; GFX9-LABEL: global_atomic_min_i64_ret_scalar:
8102 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8103 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8104 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
8105 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
8106 ; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
8107 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8108 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8109 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
8110 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
8111 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6]
8112 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
8113 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8114 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
8115 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
8116 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8117 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
8118 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8119 ; GFX9-NEXT: buffer_wbinvl1_vol
8120 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
8121 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
8122 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
8123 ; GFX9-NEXT: s_cbranch_execnz .LBB101_1
8124 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8125 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
8126 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %result is presumably the value the function
; returns (the ret is not part of this excerpt - confirm).
8127 %result = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
; Test: value-returning (i64) variant of the scalar `atomicrmw min` case where
; the atomic is applied to element 4 of %out (a 32-byte displacement for i64
; elements). Pointer and operand are `inreg` amdgpu_gfx arguments.
; In the expected code the displacement appears as `offset:32` on the buffer
; and global instructions, while the flat-instruction variant first adds 32 to
; the address with s_add_u32 / s_addc_u32. The operation itself is expanded
; into a compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end).
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
8131 define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
8132 ; SI-LABEL: global_atomic_min_i64_ret_offset_scalar:
8134 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8135 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8136 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
8137 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8138 ; SI-NEXT: s_waitcnt expcnt(0)
8139 ; SI-NEXT: v_writelane_b32 v2, s6, 0
8140 ; SI-NEXT: v_writelane_b32 v2, s7, 1
8141 ; SI-NEXT: s_mov_b32 s35, s7
8142 ; SI-NEXT: s_mov_b32 s34, s6
8143 ; SI-NEXT: s_mov_b32 s7, 0xf000
8144 ; SI-NEXT: s_mov_b32 s6, -1
8145 ; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
8146 ; SI-NEXT: s_mov_b64 s[36:37], 0
8147 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start
8148 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8149 ; SI-NEXT: s_waitcnt vmcnt(0)
8150 ; SI-NEXT: v_mov_b32_e32 v8, v4
8151 ; SI-NEXT: v_mov_b32_e32 v7, v3
8152 ; SI-NEXT: v_mov_b32_e32 v0, s35
8153 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8]
8154 ; SI-NEXT: s_waitcnt expcnt(0)
8155 ; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc
8156 ; SI-NEXT: v_mov_b32_e32 v0, s34
8157 ; SI-NEXT: v_cndmask_b32_e32 v5, v0, v7, vcc
8158 ; SI-NEXT: v_mov_b32_e32 v3, v5
8159 ; SI-NEXT: v_mov_b32_e32 v4, v6
8160 ; SI-NEXT: v_mov_b32_e32 v5, v7
8161 ; SI-NEXT: v_mov_b32_e32 v6, v8
8162 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8163 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc
8164 ; SI-NEXT: s_waitcnt vmcnt(0)
8165 ; SI-NEXT: buffer_wbinvl1
8166 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8]
8167 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
8168 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
8169 ; SI-NEXT: s_cbranch_execnz .LBB102_1
8170 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8171 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
8172 ; SI-NEXT: v_mov_b32_e32 v0, v3
8173 ; SI-NEXT: v_mov_b32_e32 v1, v4
8174 ; SI-NEXT: v_readlane_b32 s7, v2, 1
8175 ; SI-NEXT: v_readlane_b32 s6, v2, 0
8176 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8177 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
8178 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8179 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8180 ; SI-NEXT: s_setpc_b64 s[30:31]
8182 ; VI-LABEL: global_atomic_min_i64_ret_offset_scalar:
8184 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8185 ; VI-NEXT: s_add_u32 s34, s4, 32
8186 ; VI-NEXT: s_addc_u32 s35, s5, 0
8187 ; VI-NEXT: v_mov_b32_e32 v0, s34
8188 ; VI-NEXT: v_mov_b32_e32 v1, s35
8189 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
8190 ; VI-NEXT: s_mov_b64 s[36:37], 0
8191 ; VI-NEXT: .LBB102_1: ; %atomicrmw.start
8192 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8193 ; VI-NEXT: s_waitcnt vmcnt(0)
8194 ; VI-NEXT: v_mov_b32_e32 v3, v1
8195 ; VI-NEXT: v_mov_b32_e32 v2, v0
8196 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8197 ; VI-NEXT: v_mov_b32_e32 v0, s7
8198 ; VI-NEXT: v_mov_b32_e32 v6, s6
8199 ; VI-NEXT: v_mov_b32_e32 v4, s34
8200 ; VI-NEXT: v_mov_b32_e32 v5, s35
8201 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8202 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8203 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8204 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8205 ; VI-NEXT: s_waitcnt vmcnt(0)
8206 ; VI-NEXT: buffer_wbinvl1_vol
8207 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8208 ; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
8209 ; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
8210 ; VI-NEXT: s_cbranch_execnz .LBB102_1
8211 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8212 ; VI-NEXT: s_or_b64 exec, exec, s[36:37]
8213 ; VI-NEXT: s_setpc_b64 s[30:31]
8215 ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar:
8217 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8218 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8219 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
8220 ; GFX9-NEXT: s_mov_b64 s[34:35], 0
8221 ; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
8222 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8223 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8224 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
8225 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
8226 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[5:6]
8227 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
8228 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
8229 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
8230 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
8231 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8232 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
8233 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8234 ; GFX9-NEXT: buffer_wbinvl1_vol
8235 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
8236 ; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
8237 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
8238 ; GFX9-NEXT: s_cbranch_execnz .LBB102_1
8239 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8240 ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
8241 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Address element 4 of %out (4 * 8 bytes = 32), then do the atomic on it.
; %result is presumably the value the function returns (the ret is not part of
; this excerpt - confirm).
8242 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8243 %result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Test: amdgpu_kernel doing a 64-bit seq_cst `atomicrmw min` at
; %out[%index + 4], i.e. a runtime index plus a constant 4-element offset.
; The kernel is void and the old value in %tmp0 is not used by the visible IR.
; In the expected code the index is scaled with s_lshl_b64 ..., 3 and added to
; the base; the constant part shows up as offset:32 (or as a separate add of 32
; in the flat-instruction variant). The operation itself is expanded into a
; compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end).
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
8247 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
8248 ; SI-LABEL: atomic_min_i64_addr64_offset:
8249 ; SI: ; %bb.0: ; %entry
8250 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
8251 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8252 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8253 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
8254 ; SI-NEXT: s_add_u32 s4, s0, s4
8255 ; SI-NEXT: s_addc_u32 s5, s1, s5
8256 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
8257 ; SI-NEXT: s_mov_b64 s[0:1], 0
8258 ; SI-NEXT: s_mov_b32 s7, 0xf000
8259 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8260 ; SI-NEXT: v_mov_b32_e32 v2, s8
8261 ; SI-NEXT: v_mov_b32_e32 v3, s9
8262 ; SI-NEXT: s_mov_b32 s6, -1
8263 ; SI-NEXT: .LBB103_1: ; %atomicrmw.start
8264 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8265 ; SI-NEXT: v_mov_b32_e32 v0, s3
8266 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8267 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8268 ; SI-NEXT: v_mov_b32_e32 v0, s2
8269 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
8270 ; SI-NEXT: s_waitcnt expcnt(0)
8271 ; SI-NEXT: v_mov_b32_e32 v7, v3
8272 ; SI-NEXT: v_mov_b32_e32 v6, v2
8273 ; SI-NEXT: v_mov_b32_e32 v5, v1
8274 ; SI-NEXT: v_mov_b32_e32 v4, v0
8275 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8276 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
8277 ; SI-NEXT: s_waitcnt vmcnt(0)
8278 ; SI-NEXT: buffer_wbinvl1
8279 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
8280 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8281 ; SI-NEXT: v_mov_b32_e32 v2, v4
8282 ; SI-NEXT: v_mov_b32_e32 v3, v5
8283 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8284 ; SI-NEXT: s_cbranch_execnz .LBB103_1
8285 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8288 ; VI-LABEL: atomic_min_i64_addr64_offset:
8289 ; VI: ; %bb.0: ; %entry
8290 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
8291 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8292 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8293 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
8294 ; VI-NEXT: s_add_u32 s0, s0, s4
8295 ; VI-NEXT: s_addc_u32 s1, s1, s5
8296 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
8297 ; VI-NEXT: s_add_u32 s0, s0, 32
8298 ; VI-NEXT: s_addc_u32 s1, s1, 0
8299 ; VI-NEXT: s_mov_b64 s[4:5], 0
8300 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8301 ; VI-NEXT: v_mov_b32_e32 v2, s6
8302 ; VI-NEXT: v_mov_b32_e32 v3, s7
8303 ; VI-NEXT: .LBB103_1: ; %atomicrmw.start
8304 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8305 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8306 ; VI-NEXT: v_mov_b32_e32 v0, s3
8307 ; VI-NEXT: v_mov_b32_e32 v6, s2
8308 ; VI-NEXT: v_mov_b32_e32 v5, s1
8309 ; VI-NEXT: v_mov_b32_e32 v4, s0
8310 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8311 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8312 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8313 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8314 ; VI-NEXT: s_waitcnt vmcnt(0)
8315 ; VI-NEXT: buffer_wbinvl1_vol
8316 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8317 ; VI-NEXT: v_mov_b32_e32 v3, v1
8318 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8319 ; VI-NEXT: v_mov_b32_e32 v2, v0
8320 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8321 ; VI-NEXT: s_cbranch_execnz .LBB103_1
8322 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8325 ; GFX9-LABEL: atomic_min_i64_addr64_offset:
8326 ; GFX9: ; %bb.0: ; %entry
8327 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
8328 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
8329 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8330 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8331 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
8332 ; GFX9-NEXT: s_add_u32 s0, s4, s0
8333 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
8334 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
8335 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
8336 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8337 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
8338 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
8339 ; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
8340 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8341 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8342 ; GFX9-NEXT: v_mov_b32_e32 v0, s7
8343 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
8344 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8345 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
8346 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8347 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
8348 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8349 ; GFX9-NEXT: buffer_wbinvl1_vol
8350 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8351 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
8352 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8353 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
8354 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
8355 ; GFX9-NEXT: s_cbranch_execnz .LBB103_1
8356 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8357 ; GFX9-NEXT: s_endpgm
; %ptr = &out[index]; %gep = &ptr[4] (a further 32 bytes); the atomic is done
; on %gep.
8359 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
8360 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
8361 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
; Test: amdgpu_kernel doing a 64-bit seq_cst `atomicrmw min` at
; %out[%index + 4] and storing the value produced by the atomic to %out2.
; In the expected code the index is scaled with s_lshl_b64 ..., 3 and added to
; the base; the constant part shows up as offset:32 (or as a separate add of 32
; in the flat-instruction variant). The operation is expanded into a
; compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end), and the final
; dwordx2 store after the loop writes the result through the %out2 pointer.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
8365 define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
8366 ; SI-LABEL: atomic_min_i64_ret_addr64_offset:
8367 ; SI: ; %bb.0: ; %entry
8368 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
8369 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8370 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8371 ; SI-NEXT: s_add_u32 s8, s0, s6
8372 ; SI-NEXT: s_addc_u32 s9, s1, s7
8373 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
8374 ; SI-NEXT: s_mov_b64 s[0:1], 0
8375 ; SI-NEXT: s_mov_b32 s11, 0xf000
8376 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8377 ; SI-NEXT: v_mov_b32_e32 v2, s6
8378 ; SI-NEXT: v_mov_b32_e32 v3, s7
8379 ; SI-NEXT: s_mov_b32 s10, -1
8380 ; SI-NEXT: .LBB104_1: ; %atomicrmw.start
8381 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8382 ; SI-NEXT: v_mov_b32_e32 v0, s5
8383 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
8384 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8385 ; SI-NEXT: v_mov_b32_e32 v0, s4
8386 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
8387 ; SI-NEXT: s_waitcnt expcnt(0)
8388 ; SI-NEXT: v_mov_b32_e32 v7, v3
8389 ; SI-NEXT: v_mov_b32_e32 v6, v2
8390 ; SI-NEXT: v_mov_b32_e32 v5, v1
8391 ; SI-NEXT: v_mov_b32_e32 v4, v0
8392 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8393 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 offset:32 glc
8394 ; SI-NEXT: s_waitcnt vmcnt(0)
8395 ; SI-NEXT: buffer_wbinvl1
8396 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
8397 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8398 ; SI-NEXT: v_mov_b32_e32 v2, v4
8399 ; SI-NEXT: v_mov_b32_e32 v3, v5
8400 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8401 ; SI-NEXT: s_cbranch_execnz .LBB104_1
8402 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8403 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
8404 ; SI-NEXT: s_mov_b32 s7, 0xf000
8405 ; SI-NEXT: s_mov_b32 s6, -1
8406 ; SI-NEXT: s_mov_b32 s4, s2
8407 ; SI-NEXT: s_mov_b32 s5, s3
8408 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
8411 ; VI-LABEL: atomic_min_i64_ret_addr64_offset:
8412 ; VI: ; %bb.0: ; %entry
8413 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
8414 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8415 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8416 ; VI-NEXT: s_add_u32 s0, s0, s6
8417 ; VI-NEXT: s_addc_u32 s1, s1, s7
8418 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
8419 ; VI-NEXT: s_add_u32 s0, s0, 32
8420 ; VI-NEXT: s_addc_u32 s1, s1, 0
8421 ; VI-NEXT: s_mov_b64 s[6:7], 0
8422 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8423 ; VI-NEXT: v_mov_b32_e32 v0, s8
8424 ; VI-NEXT: v_mov_b32_e32 v1, s9
8425 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start
8426 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8427 ; VI-NEXT: v_mov_b32_e32 v3, v1
8428 ; VI-NEXT: v_mov_b32_e32 v2, v0
8429 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
8430 ; VI-NEXT: v_mov_b32_e32 v0, s5
8431 ; VI-NEXT: v_mov_b32_e32 v6, s4
8432 ; VI-NEXT: v_mov_b32_e32 v5, s1
8433 ; VI-NEXT: v_mov_b32_e32 v4, s0
8434 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8435 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8436 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8437 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8438 ; VI-NEXT: s_waitcnt vmcnt(0)
8439 ; VI-NEXT: buffer_wbinvl1_vol
8440 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8441 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8442 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
8443 ; VI-NEXT: s_cbranch_execnz .LBB104_1
8444 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8445 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
8446 ; VI-NEXT: v_mov_b32_e32 v2, s2
8447 ; VI-NEXT: v_mov_b32_e32 v3, s3
8448 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
8451 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
8452 ; GFX9: ; %bb.0: ; %entry
8453 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
8454 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8456 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8457 ; GFX9-NEXT: s_add_u32 s0, s0, s6
8458 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
8459 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
8460 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
8461 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8462 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
8463 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
8464 ; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
8465 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8466 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
8467 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
8468 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[5:6]
8469 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
8470 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
8471 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
8472 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
8473 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8474 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] offset:32 glc
8475 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8476 ; GFX9-NEXT: buffer_wbinvl1_vol
8477 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
8478 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8479 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
8480 ; GFX9-NEXT: s_cbranch_execnz .LBB104_1
8481 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8482 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
8483 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8484 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
8485 ; GFX9-NEXT: s_endpgm
; %ptr = &out[index]; %gep = &ptr[4] (a further 32 bytes); the atomic is done
; on %gep and its result is written to %out2 so that it is live.
8487 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
8488 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
8489 %tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst
8490 store i64 %tmp0, ptr addrspace(1) %out2
; Test: amdgpu_kernel doing a 64-bit seq_cst `atomicrmw min` directly on %out,
; with no index and no constant offset. The kernel is void and the old value in
; %tmp0 is not used by the visible IR.
; The expected code loads the initial memory value with s_load_dwordx2 and then
; runs a compare-and-swap loop (%atomicrmw.start .. %atomicrmw.end) that uses a
; signed 64-bit compare (v_cmp_ge_i64) to pick the value to store.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py, do not hand-edit.
8494 define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
8495 ; SI-LABEL: atomic_min_i64:
8496 ; SI: ; %bb.0: ; %entry
8497 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8498 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8499 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
8500 ; SI-NEXT: s_mov_b64 s[4:5], 0
8501 ; SI-NEXT: s_mov_b32 s6, s2
8502 ; SI-NEXT: s_mov_b32 s7, s3
8503 ; SI-NEXT: s_mov_b32 s3, 0xf000
8504 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8505 ; SI-NEXT: v_mov_b32_e32 v2, s8
8506 ; SI-NEXT: v_mov_b32_e32 v3, s9
8507 ; SI-NEXT: s_mov_b32 s2, -1
8508 ; SI-NEXT: .LBB105_1: ; %atomicrmw.start
8509 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8510 ; SI-NEXT: v_mov_b32_e32 v0, s7
8511 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
8512 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8513 ; SI-NEXT: v_mov_b32_e32 v0, s6
8514 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
8515 ; SI-NEXT: s_waitcnt expcnt(0)
8516 ; SI-NEXT: v_mov_b32_e32 v7, v3
8517 ; SI-NEXT: v_mov_b32_e32 v6, v2
8518 ; SI-NEXT: v_mov_b32_e32 v5, v1
8519 ; SI-NEXT: v_mov_b32_e32 v4, v0
8520 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8521 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
8522 ; SI-NEXT: s_waitcnt vmcnt(0)
8523 ; SI-NEXT: buffer_wbinvl1
8524 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
8525 ; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8526 ; SI-NEXT: v_mov_b32_e32 v2, v4
8527 ; SI-NEXT: v_mov_b32_e32 v3, v5
8528 ; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8529 ; SI-NEXT: s_cbranch_execnz .LBB105_1
8530 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8533 ; VI-LABEL: atomic_min_i64:
8534 ; VI: ; %bb.0: ; %entry
8535 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8536 ; VI-NEXT: s_mov_b64 s[4:5], 0
8537 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8538 ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
8539 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8540 ; VI-NEXT: v_mov_b32_e32 v2, s6
8541 ; VI-NEXT: v_mov_b32_e32 v3, s7
8542 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start
8543 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8544 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8545 ; VI-NEXT: v_mov_b32_e32 v0, s3
8546 ; VI-NEXT: v_mov_b32_e32 v6, s2
8547 ; VI-NEXT: v_mov_b32_e32 v5, s1
8548 ; VI-NEXT: v_mov_b32_e32 v4, s0
8549 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8550 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8551 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8552 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8553 ; VI-NEXT: s_waitcnt vmcnt(0)
8554 ; VI-NEXT: buffer_wbinvl1_vol
8555 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8556 ; VI-NEXT: v_mov_b32_e32 v3, v1
8557 ; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8558 ; VI-NEXT: v_mov_b32_e32 v2, v0
8559 ; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
8560 ; VI-NEXT: s_cbranch_execnz .LBB105_1
8561 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8564 ; GFX9-LABEL: atomic_min_i64:
8565 ; GFX9: ; %bb.0: ; %entry
8566 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8567 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
8568 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8569 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8570 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
8571 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8572 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
8573 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
8574 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
8575 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8576 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
8577 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
8578 ; GFX9-NEXT: v_mov_b32_e32 v5, s2
8579 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8580 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
8581 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8582 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
8583 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8584 ; GFX9-NEXT: buffer_wbinvl1_vol
8585 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8586 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
8587 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8588 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
8589 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
8590 ; GFX9-NEXT: s_cbranch_execnz .LBB105_1
8591 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8592 ; GFX9-NEXT: s_endpgm
; The operation under test, applied directly to %out.
8594 %tmp0 = atomicrmw min ptr addrspace(1) %out, i64 %in seq_cst
; Kernel test: seq_cst signed 64-bit atomicrmw min on element %index of %out,
; with the value produced by the atomicrmw stored to %out2.
; On all three targets the expected code is a compare-and-swap retry loop
; (%atomicrmw.start / %atomicrmw.end) built around a *_atomic_cmpswap_x2
; instruction; the value to write is selected with v_cmp_ge_i64 + v_cndmask
; and the loop exits once the returned value equals the expected one.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
8598 define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
8599 ; SI-LABEL: atomic_min_i64_ret_addr64:
8600 ; SI: ; %bb.0: ; %entry
8601 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
8602 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8603 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8604 ; SI-NEXT: s_add_u32 s8, s0, s6
8605 ; SI-NEXT: s_addc_u32 s9, s1, s7
8606 ; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
8607 ; SI-NEXT: s_mov_b64 s[0:1], 0
8608 ; SI-NEXT: s_mov_b32 s11, 0xf000
8609 ; SI-NEXT: s_waitcnt lgkmcnt(0)
8610 ; SI-NEXT: v_mov_b32_e32 v2, s6
8611 ; SI-NEXT: v_mov_b32_e32 v3, s7
8612 ; SI-NEXT: s_mov_b32 s10, -1
8613 ; SI-NEXT: .LBB106_1: ; %atomicrmw.start
8614 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
8615 ; SI-NEXT: v_mov_b32_e32 v0, s5
8616 ; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
8617 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8618 ; SI-NEXT: v_mov_b32_e32 v0, s4
8619 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
8620 ; SI-NEXT: s_waitcnt expcnt(0)
8621 ; SI-NEXT: v_mov_b32_e32 v7, v3
8622 ; SI-NEXT: v_mov_b32_e32 v6, v2
8623 ; SI-NEXT: v_mov_b32_e32 v5, v1
8624 ; SI-NEXT: v_mov_b32_e32 v4, v0
8625 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8626 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[8:11], 0 glc
8627 ; SI-NEXT: s_waitcnt vmcnt(0)
8628 ; SI-NEXT: buffer_wbinvl1
8629 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
8630 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
8631 ; SI-NEXT: v_mov_b32_e32 v2, v4
8632 ; SI-NEXT: v_mov_b32_e32 v3, v5
8633 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
8634 ; SI-NEXT: s_cbranch_execnz .LBB106_1
8635 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
8636 ; SI-NEXT: s_or_b64 exec, exec, s[0:1]
8637 ; SI-NEXT: s_mov_b32 s7, 0xf000
8638 ; SI-NEXT: s_mov_b32 s6, -1
8639 ; SI-NEXT: s_mov_b32 s4, s2
8640 ; SI-NEXT: s_mov_b32 s5, s3
8641 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0
8644 ; VI-LABEL: atomic_min_i64_ret_addr64:
8645 ; VI: ; %bb.0: ; %entry
8646 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
8647 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8648 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8649 ; VI-NEXT: s_add_u32 s0, s0, s6
8650 ; VI-NEXT: s_addc_u32 s1, s1, s7
8651 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
8652 ; VI-NEXT: s_mov_b64 s[6:7], 0
8653 ; VI-NEXT: s_waitcnt lgkmcnt(0)
8654 ; VI-NEXT: v_mov_b32_e32 v0, s8
8655 ; VI-NEXT: v_mov_b32_e32 v1, s9
8656 ; VI-NEXT: .LBB106_1: ; %atomicrmw.start
8657 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1
8658 ; VI-NEXT: v_mov_b32_e32 v3, v1
8659 ; VI-NEXT: v_mov_b32_e32 v2, v0
8660 ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
8661 ; VI-NEXT: v_mov_b32_e32 v0, s5
8662 ; VI-NEXT: v_mov_b32_e32 v6, s4
8663 ; VI-NEXT: v_mov_b32_e32 v5, s1
8664 ; VI-NEXT: v_mov_b32_e32 v4, s0
8665 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
8666 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
8667 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8668 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
8669 ; VI-NEXT: s_waitcnt vmcnt(0)
8670 ; VI-NEXT: buffer_wbinvl1_vol
8671 ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8672 ; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8673 ; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
8674 ; VI-NEXT: s_cbranch_execnz .LBB106_1
8675 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end
8676 ; VI-NEXT: s_or_b64 exec, exec, s[6:7]
8677 ; VI-NEXT: v_mov_b32_e32 v2, s2
8678 ; VI-NEXT: v_mov_b32_e32 v3, s3
8679 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
8682 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
8683 ; GFX9: ; %bb.0: ; %entry
8684 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
8685 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8686 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8687 ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
8688 ; GFX9-NEXT: s_add_u32 s0, s0, s6
8689 ; GFX9-NEXT: s_addc_u32 s1, s1, s7
8690 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
8691 ; GFX9-NEXT: s_mov_b64 s[6:7], 0
8692 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8693 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
8694 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
8695 ; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
8696 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8697 ; GFX9-NEXT: v_mov_b32_e32 v6, v1
8698 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
8699 ; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[5:6]
8700 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
8701 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
8702 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v6, vcc
8703 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
8704 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8705 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[0:1] glc
8706 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8707 ; GFX9-NEXT: buffer_wbinvl1_vol
8708 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
8709 ; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8710 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
8711 ; GFX9-NEXT: s_cbranch_execnz .LBB106_1
8712 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
8713 ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
8714 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8715 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
8716 ; GFX9-NEXT: s_endpgm
; IR under test: index into %out, apply the atomic min, store its result.
8718 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
8719 %tmp0 = atomicrmw min ptr addrspace(1) %ptr, i64 %in seq_cst
8720 store i64 %tmp0, ptr addrspace(1) %out2
8724 ; ---------------------------------------------------------------------
8725 ; atomicrmw uinc_wrap
8726 ; ---------------------------------------------------------------------
; seq_cst atomicrmw uinc_wrap on a global (addrspace(1)) i64 in a function
; that returns void; %tmp0 is not used by any visible instruction.
; Each target is expected to select a single native 64-bit increment atomic
; without the glc modifier: buffer_atomic_inc_x2 (addr64) on SI,
; flat_atomic_inc_x2 on VI and global_atomic_inc_x2 on GFX9.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8728 define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
8729 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret:
8731 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8732 ; SI-NEXT: s_mov_b32 s6, 0
8733 ; SI-NEXT: s_mov_b32 s7, 0xf000
8734 ; SI-NEXT: s_mov_b32 s4, s6
8735 ; SI-NEXT: s_mov_b32 s5, s6
8736 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8737 ; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64
8738 ; SI-NEXT: s_waitcnt vmcnt(0)
8739 ; SI-NEXT: buffer_wbinvl1
8740 ; SI-NEXT: s_waitcnt expcnt(0)
8741 ; SI-NEXT: s_setpc_b64 s[30:31]
8743 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret:
8745 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8746 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
8747 ; VI-NEXT: s_waitcnt vmcnt(0)
8748 ; VI-NEXT: buffer_wbinvl1_vol
8749 ; VI-NEXT: s_setpc_b64 s[30:31]
8751 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret:
8753 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8754 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off
8755 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8756 ; GFX9-NEXT: buffer_wbinvl1_vol
8757 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8758 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst atomicrmw uinc_wrap on %out advanced by four i64 elements (the
; expected code uses a displacement of 32), in a function that returns void.
; SI and GFX9 are expected to fold the displacement into the instruction's
; offset:32 field; VI is expected to add 32 to the 64-bit address with
; v_add_u32 / v_addc_u32 before issuing flat_atomic_inc_x2.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8762 define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
8763 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
8765 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8766 ; SI-NEXT: s_mov_b32 s6, 0
8767 ; SI-NEXT: s_mov_b32 s7, 0xf000
8768 ; SI-NEXT: s_mov_b32 s4, s6
8769 ; SI-NEXT: s_mov_b32 s5, s6
8770 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8771 ; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
8772 ; SI-NEXT: s_waitcnt vmcnt(0)
8773 ; SI-NEXT: buffer_wbinvl1
8774 ; SI-NEXT: s_waitcnt expcnt(0)
8775 ; SI-NEXT: s_setpc_b64 s[30:31]
8777 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
8779 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8780 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
8781 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8782 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8783 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
8784 ; VI-NEXT: s_waitcnt vmcnt(0)
8785 ; VI-NEXT: buffer_wbinvl1_vol
8786 ; VI-NEXT: s_setpc_b64 s[30:31]
8788 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
8790 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8791 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
8792 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8793 ; GFX9-NEXT: buffer_wbinvl1_vol
8794 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8795 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8796 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; seq_cst atomicrmw uinc_wrap on a global i64 in a function declared to
; return i64 (presumably %result; the ret instruction is not among the
; visible lines -- confirm against the full file).
; Each target is expected to select the glc-flagged native increment atomic.
; On SI the atomic operates on v[2:3], which is then copied into v[0:1];
; on VI and GFX9 the atomic writes v[0:1] directly.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8800 define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
8801 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret:
8803 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8804 ; SI-NEXT: s_mov_b32 s6, 0
8805 ; SI-NEXT: s_mov_b32 s7, 0xf000
8806 ; SI-NEXT: s_mov_b32 s4, s6
8807 ; SI-NEXT: s_mov_b32 s5, s6
8808 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8809 ; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
8810 ; SI-NEXT: s_waitcnt vmcnt(0)
8811 ; SI-NEXT: buffer_wbinvl1
8812 ; SI-NEXT: v_mov_b32_e32 v0, v2
8813 ; SI-NEXT: v_mov_b32_e32 v1, v3
8814 ; SI-NEXT: s_waitcnt expcnt(0)
8815 ; SI-NEXT: s_setpc_b64 s[30:31]
8817 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret:
8819 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8820 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
8821 ; VI-NEXT: s_waitcnt vmcnt(0)
8822 ; VI-NEXT: buffer_wbinvl1_vol
8823 ; VI-NEXT: s_setpc_b64 s[30:31]
8825 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret:
8827 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8828 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
8829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8830 ; GFX9-NEXT: buffer_wbinvl1_vol
8831 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8832 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst atomicrmw uinc_wrap on %out advanced by four i64 elements (the
; expected code uses a displacement of 32), in a function declared to
; return i64 (presumably %result -- the ret is not among the visible lines).
; SI and GFX9 are expected to use the glc-flagged increment atomic with
; offset:32; VI is expected to add 32 to the address with
; v_add_u32 / v_addc_u32 and then issue the glc-flagged flat atomic.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8836 define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
8837 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
8839 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8840 ; SI-NEXT: s_mov_b32 s6, 0
8841 ; SI-NEXT: s_mov_b32 s7, 0xf000
8842 ; SI-NEXT: s_mov_b32 s4, s6
8843 ; SI-NEXT: s_mov_b32 s5, s6
8844 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8845 ; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
8846 ; SI-NEXT: s_waitcnt vmcnt(0)
8847 ; SI-NEXT: buffer_wbinvl1
8848 ; SI-NEXT: v_mov_b32_e32 v0, v2
8849 ; SI-NEXT: v_mov_b32_e32 v1, v3
8850 ; SI-NEXT: s_waitcnt expcnt(0)
8851 ; SI-NEXT: s_setpc_b64 s[30:31]
8853 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
8855 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8856 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
8857 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8858 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8859 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
8860 ; VI-NEXT: s_waitcnt vmcnt(0)
8861 ; VI-NEXT: buffer_wbinvl1_vol
8862 ; VI-NEXT: s_setpc_b64 s[30:31]
8864 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
8866 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8867 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
8868 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8869 ; GFX9-NEXT: buffer_wbinvl1_vol
8870 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8871 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8872 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg; the function returns
; void. In the expected code the pointer is taken from s[4:5] and the
; operand from s6/s7, which are copied into VGPRs for the atomic.
; On SI the s[4:7] operand of buffer_atomic_inc_x2 is built in place
; (s6 = -1, s7 = 0xf000), so the incoming s6/s7 are first saved into lanes
; 0/1 of v0 with v_writelane_b32 (v0 itself is spilled to and reloaded from
; the stack) and restored with v_readlane_b32 after the atomic.
; VI moves the pointer into v[2:3] for flat_atomic_inc_x2; GFX9 uses
; global_atomic_inc_x2 with the s[4:5] base and a zero VGPR offset.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8876 define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
8877 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
8879 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8880 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8881 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8882 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8883 ; SI-NEXT: s_waitcnt expcnt(0)
8884 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8885 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8886 ; SI-NEXT: s_mov_b32 s34, s7
8887 ; SI-NEXT: s_mov_b32 s35, s6
8888 ; SI-NEXT: s_mov_b32 s7, 0xf000
8889 ; SI-NEXT: s_mov_b32 s6, -1
8890 ; SI-NEXT: v_mov_b32_e32 v1, s35
8891 ; SI-NEXT: v_mov_b32_e32 v2, s34
8892 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8893 ; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0
8894 ; SI-NEXT: s_waitcnt vmcnt(0)
8895 ; SI-NEXT: buffer_wbinvl1
8896 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8897 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8898 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8899 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8900 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8901 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8902 ; SI-NEXT: s_setpc_b64 s[30:31]
8904 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
8906 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8907 ; VI-NEXT: v_mov_b32_e32 v0, s6
8908 ; VI-NEXT: v_mov_b32_e32 v1, s7
8909 ; VI-NEXT: v_mov_b32_e32 v2, s4
8910 ; VI-NEXT: v_mov_b32_e32 v3, s5
8911 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8912 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
8913 ; VI-NEXT: s_waitcnt vmcnt(0)
8914 ; VI-NEXT: buffer_wbinvl1_vol
8915 ; VI-NEXT: s_setpc_b64 s[30:31]
8917 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
8919 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8920 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
8921 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
8922 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8923 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8924 ; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5]
8925 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8926 ; GFX9-NEXT: buffer_wbinvl1_vol
8927 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8928 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; amdgpu_gfx variant with inreg arguments, operating on %out advanced by four
; i64 elements (the expected code uses a displacement of 32); returns void.
; SI and GFX9 are expected to fold the displacement into offset:32. On SI
; the incoming s6/s7 are saved into lanes of v0 (v_writelane_b32) because
; s6/s7 are overwritten to form the s[4:7] operand of the buffer atomic,
; and are restored with v_readlane_b32 afterwards.
; VI is expected to compute the address with s_add_u32 / s_addc_u32 into
; s[34:35] and move it to v[2:3] for flat_atomic_inc_x2.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8932 define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
8933 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
8935 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8936 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8937 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
8938 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8939 ; SI-NEXT: s_waitcnt expcnt(0)
8940 ; SI-NEXT: v_writelane_b32 v0, s6, 0
8941 ; SI-NEXT: v_writelane_b32 v0, s7, 1
8942 ; SI-NEXT: v_mov_b32_e32 v1, s6
8943 ; SI-NEXT: v_mov_b32_e32 v2, s7
8944 ; SI-NEXT: s_mov_b32 s7, 0xf000
8945 ; SI-NEXT: s_mov_b32 s6, -1
8946 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8947 ; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 offset:32
8948 ; SI-NEXT: s_waitcnt vmcnt(0)
8949 ; SI-NEXT: buffer_wbinvl1
8950 ; SI-NEXT: v_readlane_b32 s7, v0, 1
8951 ; SI-NEXT: v_readlane_b32 s6, v0, 0
8952 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8953 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
8954 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8955 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
8956 ; SI-NEXT: s_setpc_b64 s[30:31]
8958 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
8960 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8961 ; VI-NEXT: s_add_u32 s34, s4, 32
8962 ; VI-NEXT: s_addc_u32 s35, s5, 0
8963 ; VI-NEXT: v_mov_b32_e32 v2, s34
8964 ; VI-NEXT: v_mov_b32_e32 v0, s6
8965 ; VI-NEXT: v_mov_b32_e32 v1, s7
8966 ; VI-NEXT: v_mov_b32_e32 v3, s35
8967 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8968 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
8969 ; VI-NEXT: s_waitcnt vmcnt(0)
8970 ; VI-NEXT: buffer_wbinvl1_vol
8971 ; VI-NEXT: s_setpc_b64 s[30:31]
8973 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
8975 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8976 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
8977 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
8978 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
8979 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8980 ; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
8981 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8982 ; GFX9-NEXT: buffer_wbinvl1_vol
8983 ; GFX9-NEXT: s_setpc_b64 s[30:31]
8984 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
8985 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; amdgpu_gfx variant with inreg arguments, declared to return i64
; (presumably %result -- the ret is not among the visible lines).
; All targets are expected to use the glc-flagged increment atomic with
; v[0:1] as the data/result registers. On SI the incoming s6/s7 are saved
; into lanes of v2 (v_writelane_b32; v2 is spilled to and reloaded from the
; stack) because s6/s7 are overwritten to form the s[4:7] operand of the
; buffer atomic, and are restored with v_readlane_b32 afterwards.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
8989 define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
8990 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
8992 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8993 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
8994 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
8995 ; SI-NEXT: s_mov_b64 exec, s[34:35]
8996 ; SI-NEXT: s_waitcnt expcnt(0)
8997 ; SI-NEXT: v_writelane_b32 v2, s6, 0
8998 ; SI-NEXT: v_writelane_b32 v2, s7, 1
8999 ; SI-NEXT: s_mov_b32 s34, s7
9000 ; SI-NEXT: s_mov_b32 s35, s6
9001 ; SI-NEXT: s_mov_b32 s7, 0xf000
9002 ; SI-NEXT: s_mov_b32 s6, -1
9003 ; SI-NEXT: v_mov_b32_e32 v0, s35
9004 ; SI-NEXT: v_mov_b32_e32 v1, s34
9005 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9006 ; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc
9007 ; SI-NEXT: s_waitcnt vmcnt(0)
9008 ; SI-NEXT: buffer_wbinvl1
9009 ; SI-NEXT: v_readlane_b32 s7, v2, 1
9010 ; SI-NEXT: v_readlane_b32 s6, v2, 0
9011 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9012 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9013 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9014 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9015 ; SI-NEXT: s_setpc_b64 s[30:31]
9017 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
9019 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9020 ; VI-NEXT: v_mov_b32_e32 v0, s6
9021 ; VI-NEXT: v_mov_b32_e32 v1, s7
9022 ; VI-NEXT: v_mov_b32_e32 v2, s4
9023 ; VI-NEXT: v_mov_b32_e32 v3, s5
9024 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9025 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
9026 ; VI-NEXT: s_waitcnt vmcnt(0)
9027 ; VI-NEXT: buffer_wbinvl1_vol
9028 ; VI-NEXT: s_setpc_b64 s[30:31]
9030 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
9032 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9033 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9034 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9035 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9036 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9037 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc
9038 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9039 ; GFX9-NEXT: buffer_wbinvl1_vol
9040 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9041 %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; amdgpu_gfx variant with inreg arguments, operating on %out advanced by four
; i64 elements (the expected code uses a displacement of 32) and declared to
; return i64 (presumably %result -- the ret is not among the visible lines).
; SI and GFX9 are expected to use the glc-flagged increment atomic with
; offset:32; VI is expected to form the address with s_add_u32 / s_addc_u32
; and pass it in v[2:3] to the glc-flagged flat atomic.
; On SI the incoming s6/s7 are saved into lanes of v2 and restored after the
; atomic, since s6/s7 are overwritten to form the s[4:7] operand.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9045 define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9046 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9048 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9049 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9050 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9051 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9052 ; SI-NEXT: s_waitcnt expcnt(0)
9053 ; SI-NEXT: v_writelane_b32 v2, s6, 0
9054 ; SI-NEXT: v_writelane_b32 v2, s7, 1
9055 ; SI-NEXT: v_mov_b32_e32 v0, s6
9056 ; SI-NEXT: v_mov_b32_e32 v1, s7
9057 ; SI-NEXT: s_mov_b32 s7, 0xf000
9058 ; SI-NEXT: s_mov_b32 s6, -1
9059 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9060 ; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc
9061 ; SI-NEXT: s_waitcnt vmcnt(0)
9062 ; SI-NEXT: buffer_wbinvl1
9063 ; SI-NEXT: v_readlane_b32 s7, v2, 1
9064 ; SI-NEXT: v_readlane_b32 s6, v2, 0
9065 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9066 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9067 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9068 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9069 ; SI-NEXT: s_setpc_b64 s[30:31]
9071 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9073 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9074 ; VI-NEXT: s_add_u32 s34, s4, 32
9075 ; VI-NEXT: s_addc_u32 s35, s5, 0
9076 ; VI-NEXT: v_mov_b32_e32 v2, s34
9077 ; VI-NEXT: v_mov_b32_e32 v0, s6
9078 ; VI-NEXT: v_mov_b32_e32 v1, s7
9079 ; VI-NEXT: v_mov_b32_e32 v3, s35
9080 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9081 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
9082 ; VI-NEXT: s_waitcnt vmcnt(0)
9083 ; VI-NEXT: buffer_wbinvl1_vol
9084 ; VI-NEXT: s_setpc_b64 s[30:31]
9086 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
9088 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9089 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9090 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9091 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9092 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9093 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
9094 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9095 ; GFX9-NEXT: buffer_wbinvl1_vol
9096 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9097 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9098 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
9102 ; ---------------------------------------------------------------------
9103 ; atomicrmw udec_wrap
9104 ; ---------------------------------------------------------------------
; seq_cst atomicrmw udec_wrap on a global (addrspace(1)) i64 in a function
; that returns void; %tmp0 is not used by any visible instruction.
; Each target is expected to select a single native 64-bit decrement atomic
; without the glc modifier: buffer_atomic_dec_x2 (addr64) on SI,
; flat_atomic_dec_x2 on VI and global_atomic_dec_x2 on GFX9.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9106 define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
9107 ; SI-LABEL: global_atomic_udec_wrap_i64_noret:
9109 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9110 ; SI-NEXT: s_mov_b32 s6, 0
9111 ; SI-NEXT: s_mov_b32 s7, 0xf000
9112 ; SI-NEXT: s_mov_b32 s4, s6
9113 ; SI-NEXT: s_mov_b32 s5, s6
9114 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9115 ; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64
9116 ; SI-NEXT: s_waitcnt vmcnt(0)
9117 ; SI-NEXT: buffer_wbinvl1
9118 ; SI-NEXT: s_waitcnt expcnt(0)
9119 ; SI-NEXT: s_setpc_b64 s[30:31]
9121 ; VI-LABEL: global_atomic_udec_wrap_i64_noret:
9123 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9124 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
9125 ; VI-NEXT: s_waitcnt vmcnt(0)
9126 ; VI-NEXT: buffer_wbinvl1_vol
9127 ; VI-NEXT: s_setpc_b64 s[30:31]
9129 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret:
9131 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9132 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off
9133 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9134 ; GFX9-NEXT: buffer_wbinvl1_vol
9135 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9136 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst atomicrmw udec_wrap on %out advanced by four i64 elements (the
; expected code uses a displacement of 32), in a function that returns void.
; SI and GFX9 are expected to fold the displacement into the instruction's
; offset:32 field; VI is expected to add 32 to the 64-bit address with
; v_add_u32 / v_addc_u32 before issuing flat_atomic_dec_x2.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9140 define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
9141 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9143 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9144 ; SI-NEXT: s_mov_b32 s6, 0
9145 ; SI-NEXT: s_mov_b32 s7, 0xf000
9146 ; SI-NEXT: s_mov_b32 s4, s6
9147 ; SI-NEXT: s_mov_b32 s5, s6
9148 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9149 ; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
9150 ; SI-NEXT: s_waitcnt vmcnt(0)
9151 ; SI-NEXT: buffer_wbinvl1
9152 ; SI-NEXT: s_waitcnt expcnt(0)
9153 ; SI-NEXT: s_setpc_b64 s[30:31]
9155 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9157 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9158 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
9159 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9160 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9161 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
9162 ; VI-NEXT: s_waitcnt vmcnt(0)
9163 ; VI-NEXT: buffer_wbinvl1_vol
9164 ; VI-NEXT: s_setpc_b64 s[30:31]
9166 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset:
9168 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9169 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
9170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9171 ; GFX9-NEXT: buffer_wbinvl1_vol
9172 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9173 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9174 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; seq_cst atomicrmw udec_wrap on a global i64 in a function declared to
; return i64 (presumably %result; the ret instruction is not among the
; visible lines -- confirm against the full file).
; Each target is expected to select the glc-flagged native decrement atomic.
; On SI the atomic operates on v[2:3], which is then copied into v[0:1];
; on VI and GFX9 the atomic writes v[0:1] directly.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9178 define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
9179 ; SI-LABEL: global_atomic_udec_wrap_i64_ret:
9181 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9182 ; SI-NEXT: s_mov_b32 s6, 0
9183 ; SI-NEXT: s_mov_b32 s7, 0xf000
9184 ; SI-NEXT: s_mov_b32 s4, s6
9185 ; SI-NEXT: s_mov_b32 s5, s6
9186 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9187 ; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
9188 ; SI-NEXT: s_waitcnt vmcnt(0)
9189 ; SI-NEXT: buffer_wbinvl1
9190 ; SI-NEXT: v_mov_b32_e32 v0, v2
9191 ; SI-NEXT: v_mov_b32_e32 v1, v3
9192 ; SI-NEXT: s_waitcnt expcnt(0)
9193 ; SI-NEXT: s_setpc_b64 s[30:31]
9195 ; VI-LABEL: global_atomic_udec_wrap_i64_ret:
9197 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9198 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
9199 ; VI-NEXT: s_waitcnt vmcnt(0)
9200 ; VI-NEXT: buffer_wbinvl1_vol
9201 ; VI-NEXT: s_setpc_b64 s[30:31]
9203 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret:
9205 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9206 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc
9207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9208 ; GFX9-NEXT: buffer_wbinvl1_vol
9209 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9210 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; seq_cst atomicrmw udec_wrap on %out advanced by four i64 elements (the
; expected code uses a displacement of 32), in a function declared to
; return i64 (presumably %result -- the ret is not among the visible lines).
; SI and GFX9 are expected to use the glc-flagged decrement atomic with
; offset:32; VI is expected to add 32 to the address with
; v_add_u32 / v_addc_u32 and then issue the glc-flagged flat atomic.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9214 define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
9215 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9217 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9218 ; SI-NEXT: s_mov_b32 s6, 0
9219 ; SI-NEXT: s_mov_b32 s7, 0xf000
9220 ; SI-NEXT: s_mov_b32 s4, s6
9221 ; SI-NEXT: s_mov_b32 s5, s6
9222 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9223 ; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
9224 ; SI-NEXT: s_waitcnt vmcnt(0)
9225 ; SI-NEXT: buffer_wbinvl1
9226 ; SI-NEXT: v_mov_b32_e32 v0, v2
9227 ; SI-NEXT: v_mov_b32_e32 v1, v3
9228 ; SI-NEXT: s_waitcnt expcnt(0)
9229 ; SI-NEXT: s_setpc_b64 s[30:31]
9231 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9233 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9234 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
9235 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9236 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9237 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
9238 ; VI-NEXT: s_waitcnt vmcnt(0)
9239 ; VI-NEXT: buffer_wbinvl1_vol
9240 ; VI-NEXT: s_setpc_b64 s[30:31]
9242 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset:
9244 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9245 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
9246 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9247 ; GFX9-NEXT: buffer_wbinvl1_vol
9248 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9249 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9250 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg; the function returns
; void. In the expected code the pointer is taken from s[4:5] and the
; operand from s6/s7, which are copied into VGPRs for the atomic.
; On SI the s[4:7] operand of buffer_atomic_dec_x2 is built in place
; (s6 = -1, s7 = 0xf000), so the incoming s6/s7 are first saved into lanes
; 0/1 of v0 with v_writelane_b32 (v0 itself is spilled to and reloaded from
; the stack) and restored with v_readlane_b32 after the atomic.
; VI moves the pointer into v[2:3] for flat_atomic_dec_x2; GFX9 uses
; global_atomic_dec_x2 with the s[4:5] base and a zero VGPR offset.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9254 define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
9255 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9257 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9258 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9259 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
9260 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9261 ; SI-NEXT: s_waitcnt expcnt(0)
9262 ; SI-NEXT: v_writelane_b32 v0, s6, 0
9263 ; SI-NEXT: v_writelane_b32 v0, s7, 1
9264 ; SI-NEXT: s_mov_b32 s34, s7
9265 ; SI-NEXT: s_mov_b32 s35, s6
9266 ; SI-NEXT: s_mov_b32 s7, 0xf000
9267 ; SI-NEXT: s_mov_b32 s6, -1
9268 ; SI-NEXT: v_mov_b32_e32 v1, s35
9269 ; SI-NEXT: v_mov_b32_e32 v2, s34
9270 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9271 ; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0
9272 ; SI-NEXT: s_waitcnt vmcnt(0)
9273 ; SI-NEXT: buffer_wbinvl1
9274 ; SI-NEXT: v_readlane_b32 s7, v0, 1
9275 ; SI-NEXT: v_readlane_b32 s6, v0, 0
9276 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9277 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
9278 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9279 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9280 ; SI-NEXT: s_setpc_b64 s[30:31]
9282 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9284 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9285 ; VI-NEXT: v_mov_b32_e32 v0, s6
9286 ; VI-NEXT: v_mov_b32_e32 v1, s7
9287 ; VI-NEXT: v_mov_b32_e32 v2, s4
9288 ; VI-NEXT: v_mov_b32_e32 v3, s5
9289 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9290 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
9291 ; VI-NEXT: s_waitcnt vmcnt(0)
9292 ; VI-NEXT: buffer_wbinvl1_vol
9293 ; VI-NEXT: s_setpc_b64 s[30:31]
9295 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
9297 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9298 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9299 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9300 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9301 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9302 ; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5]
9303 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9304 ; GFX9-NEXT: buffer_wbinvl1_vol
9305 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9306 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
; amdgpu_gfx variant with inreg arguments, operating on %out advanced by four
; i64 elements (the expected code uses a displacement of 32); returns void.
; SI and GFX9 are expected to fold the displacement into offset:32. On SI
; the incoming s6/s7 are saved into lanes of v0 (v_writelane_b32) because
; s6/s7 are overwritten to form the s[4:7] operand of the buffer atomic,
; and are restored with v_readlane_b32 afterwards.
; VI is expected to compute the address with s_add_u32 / s_addc_u32 into
; s[34:35] and move it to v[2:3] for flat_atomic_dec_x2.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9310 define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9311 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9313 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9314 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9315 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
9316 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9317 ; SI-NEXT: s_waitcnt expcnt(0)
9318 ; SI-NEXT: v_writelane_b32 v0, s6, 0
9319 ; SI-NEXT: v_writelane_b32 v0, s7, 1
9320 ; SI-NEXT: v_mov_b32_e32 v1, s6
9321 ; SI-NEXT: v_mov_b32_e32 v2, s7
9322 ; SI-NEXT: s_mov_b32 s7, 0xf000
9323 ; SI-NEXT: s_mov_b32 s6, -1
9324 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9325 ; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32
9326 ; SI-NEXT: s_waitcnt vmcnt(0)
9327 ; SI-NEXT: buffer_wbinvl1
9328 ; SI-NEXT: v_readlane_b32 s7, v0, 1
9329 ; SI-NEXT: v_readlane_b32 s6, v0, 0
9330 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9331 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
9332 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9333 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9334 ; SI-NEXT: s_setpc_b64 s[30:31]
9336 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9338 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9339 ; VI-NEXT: s_add_u32 s34, s4, 32
9340 ; VI-NEXT: s_addc_u32 s35, s5, 0
9341 ; VI-NEXT: v_mov_b32_e32 v2, s34
9342 ; VI-NEXT: v_mov_b32_e32 v0, s6
9343 ; VI-NEXT: v_mov_b32_e32 v1, s7
9344 ; VI-NEXT: v_mov_b32_e32 v3, s35
9345 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9346 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
9347 ; VI-NEXT: s_waitcnt vmcnt(0)
9348 ; VI-NEXT: buffer_wbinvl1_vol
9349 ; VI-NEXT: s_setpc_b64 s[30:31]
9351 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
9353 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9354 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9355 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9356 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9357 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9358 ; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
9359 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9360 ; GFX9-NEXT: buffer_wbinvl1_vol
9361 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9362 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9363 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
; amdgpu_gfx variant with inreg arguments, declared to return i64
; (presumably %result -- the ret is not among the visible lines).
; All targets are expected to use the glc-flagged decrement atomic with
; v[0:1] as the data/result registers. On SI the incoming s6/s7 are saved
; into lanes of v2 (v_writelane_b32; v2 is spilled to and reloaded from the
; stack) because s6/s7 are overwritten to form the s[4:7] operand of the
; buffer atomic, and are restored with v_readlane_b32 afterwards.
; Check lines are autogenerated by utils/update_llc_test_checks.py.
9367 define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
9368 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
9370 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9371 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9372 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9373 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9374 ; SI-NEXT: s_waitcnt expcnt(0)
9375 ; SI-NEXT: v_writelane_b32 v2, s6, 0
9376 ; SI-NEXT: v_writelane_b32 v2, s7, 1
9377 ; SI-NEXT: s_mov_b32 s34, s7
9378 ; SI-NEXT: s_mov_b32 s35, s6
9379 ; SI-NEXT: s_mov_b32 s7, 0xf000
9380 ; SI-NEXT: s_mov_b32 s6, -1
9381 ; SI-NEXT: v_mov_b32_e32 v0, s35
9382 ; SI-NEXT: v_mov_b32_e32 v1, s34
9383 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9384 ; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc
9385 ; SI-NEXT: s_waitcnt vmcnt(0)
9386 ; SI-NEXT: buffer_wbinvl1
9387 ; SI-NEXT: v_readlane_b32 s7, v2, 1
9388 ; SI-NEXT: v_readlane_b32 s6, v2, 0
9389 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9390 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9391 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9392 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9393 ; SI-NEXT: s_setpc_b64 s[30:31]
9395 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
9397 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9398 ; VI-NEXT: v_mov_b32_e32 v0, s6
9399 ; VI-NEXT: v_mov_b32_e32 v1, s7
9400 ; VI-NEXT: v_mov_b32_e32 v2, s4
9401 ; VI-NEXT: v_mov_b32_e32 v3, s5
9402 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9403 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
9404 ; VI-NEXT: s_waitcnt vmcnt(0)
9405 ; VI-NEXT: buffer_wbinvl1_vol
9406 ; VI-NEXT: s_setpc_b64 s[30:31]
9408 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
9410 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9411 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9412 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9413 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9414 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9415 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc
9416 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9417 ; GFX9-NEXT: buffer_wbinvl1_vol
9418 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9419 %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
9423 define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
9424 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
9426 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9427 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9428 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
9429 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9430 ; SI-NEXT: s_waitcnt expcnt(0)
9431 ; SI-NEXT: v_writelane_b32 v2, s6, 0
9432 ; SI-NEXT: v_writelane_b32 v2, s7, 1
9433 ; SI-NEXT: v_mov_b32_e32 v0, s6
9434 ; SI-NEXT: v_mov_b32_e32 v1, s7
9435 ; SI-NEXT: s_mov_b32 s7, 0xf000
9436 ; SI-NEXT: s_mov_b32 s6, -1
9437 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9438 ; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc
9439 ; SI-NEXT: s_waitcnt vmcnt(0)
9440 ; SI-NEXT: buffer_wbinvl1
9441 ; SI-NEXT: v_readlane_b32 s7, v2, 1
9442 ; SI-NEXT: v_readlane_b32 s6, v2, 0
9443 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
9444 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
9445 ; SI-NEXT: s_mov_b64 exec, s[34:35]
9446 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
9447 ; SI-NEXT: s_setpc_b64 s[30:31]
9449 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
9451 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9452 ; VI-NEXT: s_add_u32 s34, s4, 32
9453 ; VI-NEXT: s_addc_u32 s35, s5, 0
9454 ; VI-NEXT: v_mov_b32_e32 v2, s34
9455 ; VI-NEXT: v_mov_b32_e32 v0, s6
9456 ; VI-NEXT: v_mov_b32_e32 v1, s7
9457 ; VI-NEXT: v_mov_b32_e32 v3, s35
9458 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9459 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
9460 ; VI-NEXT: s_waitcnt vmcnt(0)
9461 ; VI-NEXT: buffer_wbinvl1_vol
9462 ; VI-NEXT: s_setpc_b64 s[30:31]
9464 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
9466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9467 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
9468 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
9469 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
9470 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9471 ; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
9472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9473 ; GFX9-NEXT: buffer_wbinvl1_vol
9474 ; GFX9-NEXT: s_setpc_b64 s[30:31]
9475 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
9476 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst