1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
; i64 exchange directly on %ptr with seq_cst ordering; the function is void,
; so the atomicrmw result is not returned.
; Every run line expects the two-operand flat_atomic_swap_x2 (no glc),
; followed by s_waitcnt and buffer_wbinvl1_vol.
10 define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
11 ; GCN1-LABEL: flat_atomic_xchg_i64_noret:
13 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
15 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16 ; GCN1-NEXT: buffer_wbinvl1_vol
17 ; GCN1-NEXT: s_setpc_b64 s[30:31]
19 ; GCN2-LABEL: flat_atomic_xchg_i64_noret:
21 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
23 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24 ; GCN2-NEXT: buffer_wbinvl1_vol
25 ; GCN2-NEXT: s_setpc_b64 s[30:31]
27 ; GCN3-LABEL: flat_atomic_xchg_i64_noret:
29 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
31 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
32 ; GCN3-NEXT: buffer_wbinvl1_vol
33 ; GCN3-NEXT: s_setpc_b64 s[30:31]
34 %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; i64 exchange (void function, result not returned) at a constant offset
; from %out.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers (v_add/v_addc) before the atomic; the gfx900 run expects
; it folded into the instruction as offset:32.
38 define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
39 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset:
41 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
43 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
44 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
46 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
47 ; GCN1-NEXT: buffer_wbinvl1_vol
48 ; GCN1-NEXT: s_setpc_b64 s[30:31]
50 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset:
52 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
54 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
55 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
56 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
57 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
58 ; GCN2-NEXT: buffer_wbinvl1_vol
59 ; GCN2-NEXT: s_setpc_b64 s[30:31]
61 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset:
63 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
65 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
66 ; GCN3-NEXT: buffer_wbinvl1_vol
67 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
68 %gep = getelementptr i64, ptr %out, i64 4
69 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
; i64 exchange directly on %ptr with seq_cst ordering, in a function that
; returns i64.
; Every run line expects the three-operand flat_atomic_swap_x2 with glc,
; whose first operand is v[0:1] (presumably the returned value; the ret is
; not part of this view).
73 define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
74 ; GCN1-LABEL: flat_atomic_xchg_i64_ret:
76 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
78 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
79 ; GCN1-NEXT: buffer_wbinvl1_vol
80 ; GCN1-NEXT: s_setpc_b64 s[30:31]
82 ; GCN2-LABEL: flat_atomic_xchg_i64_ret:
84 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
86 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
87 ; GCN2-NEXT: buffer_wbinvl1_vol
88 ; GCN2-NEXT: s_setpc_b64 s[30:31]
90 ; GCN3-LABEL: flat_atomic_xchg_i64_ret:
92 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
94 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
95 ; GCN3-NEXT: buffer_wbinvl1_vol
96 ; GCN3-NEXT: s_setpc_b64 s[30:31]
97 %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; i64 exchange at a constant offset from %out, in a function that returns i64.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers first; the gfx900 run expects offset:32 on the atomic.
; All runs expect the glc form with v[0:1] as the first operand.
101 define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
102 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset:
104 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
106 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
107 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
108 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
109 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
110 ; GCN1-NEXT: buffer_wbinvl1_vol
111 ; GCN1-NEXT: s_setpc_b64 s[30:31]
113 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset:
115 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
117 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
118 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
119 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
120 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
121 ; GCN2-NEXT: buffer_wbinvl1_vol
122 ; GCN2-NEXT: s_setpc_b64 s[30:31]
124 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset:
126 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
128 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
129 ; GCN3-NEXT: buffer_wbinvl1_vol
130 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
131 %gep = getelementptr i64, ptr %out, i64 4
132 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
; i64 exchange (void function) using the amdgpu_gfx calling convention with
; both arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3 with v_mov_b32 before
; the two-operand flat_atomic_swap_x2 (no glc).
136 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
137 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar:
139 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
141 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
142 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
143 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
144 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
145 ; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
146 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
147 ; GCN1-NEXT: buffer_wbinvl1_vol
148 ; GCN1-NEXT: s_setpc_b64 s[30:31]
150 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar:
152 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
154 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
155 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
156 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
157 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
158 ; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
159 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
160 ; GCN2-NEXT: buffer_wbinvl1_vol
161 ; GCN2-NEXT: s_setpc_b64 s[30:31]
163 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar:
165 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
167 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
168 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
169 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
170 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171 ; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
172 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
173 ; GCN3-NEXT: buffer_wbinvl1_vol
174 ; GCN3-NEXT: s_setpc_b64 s[30:31]
175 %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; i64 exchange (void function) at a constant offset, amdgpu_gfx calling
; convention with both arguments marked inreg.
; The bonaire and tonga runs expect the 32-byte offset to be added with
; s_add_u32/s_addc_u32 into s[34:35] before the copies to VGPRs; the gfx900
; run expects s[4:5] copied unchanged and offset:32 on the atomic.
179 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
180 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
182 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GCN1-NEXT: s_add_u32 s34, s4, 32
184 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
185 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
186 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
187 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
188 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
189 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
190 ; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
191 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
192 ; GCN1-NEXT: buffer_wbinvl1_vol
193 ; GCN1-NEXT: s_setpc_b64 s[30:31]
195 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
197 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198 ; GCN2-NEXT: s_add_u32 s34, s4, 32
199 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
200 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
201 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
202 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
203 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
204 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
205 ; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
206 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
207 ; GCN2-NEXT: buffer_wbinvl1_vol
208 ; GCN2-NEXT: s_setpc_b64 s[30:31]
210 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
212 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
214 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
215 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
216 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
217 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
218 ; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
219 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
220 ; GCN3-NEXT: buffer_wbinvl1_vol
221 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
222 %gep = getelementptr i64, ptr %out, i64 4
223 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
; i64 exchange in a function that returns i64, amdgpu_gfx calling convention
; with both arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3, then the glc form of
; flat_atomic_swap_x2 with v[0:1] as the first operand.
227 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
228 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar:
230 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
231 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
232 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
233 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
234 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
235 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
236 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
237 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
238 ; GCN1-NEXT: buffer_wbinvl1_vol
239 ; GCN1-NEXT: s_setpc_b64 s[30:31]
241 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar:
243 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
245 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
246 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
247 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
248 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
249 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
250 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
251 ; GCN2-NEXT: buffer_wbinvl1_vol
252 ; GCN2-NEXT: s_setpc_b64 s[30:31]
254 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar:
256 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
258 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
259 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
260 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
261 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
262 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
263 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
264 ; GCN3-NEXT: buffer_wbinvl1_vol
265 ; GCN3-NEXT: s_setpc_b64 s[30:31]
266 %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; i64 exchange at a constant offset in a function that returns i64,
; amdgpu_gfx calling convention with both arguments marked inreg.
; The bonaire and tonga runs expect the 32-byte offset to be added with
; s_add_u32/s_addc_u32 into s[34:35]; the gfx900 run expects offset:32 on the
; atomic. All runs expect the glc form with v[0:1] as the first operand.
270 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
271 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
273 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274 ; GCN1-NEXT: s_add_u32 s34, s4, 32
275 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
276 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
277 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
278 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
279 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
280 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
281 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
282 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
283 ; GCN1-NEXT: buffer_wbinvl1_vol
284 ; GCN1-NEXT: s_setpc_b64 s[30:31]
286 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
288 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289 ; GCN2-NEXT: s_add_u32 s34, s4, 32
290 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
291 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
292 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
293 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
294 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
295 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
296 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
297 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GCN2-NEXT: buffer_wbinvl1_vol
299 ; GCN2-NEXT: s_setpc_b64 s[30:31]
301 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
303 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
305 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
306 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
307 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
308 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
309 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
310 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
311 ; GCN3-NEXT: buffer_wbinvl1_vol
312 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
313 %gep = getelementptr i64, ptr %out, i64 4
314 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
318 ; ---------------------------------------------------------------------
320 ; ---------------------------------------------------------------------
; Exchange with a double operand directly on %ptr, seq_cst ordering; the
; function is void, so the atomicrmw result is not returned.
; Every run line expects the two-operand flat_atomic_swap_x2 (no glc),
; followed by s_waitcnt and buffer_wbinvl1_vol.
322 define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
323 ; GCN1-LABEL: flat_atomic_xchg_f64_noret:
325 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
327 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328 ; GCN1-NEXT: buffer_wbinvl1_vol
329 ; GCN1-NEXT: s_setpc_b64 s[30:31]
331 ; GCN2-LABEL: flat_atomic_xchg_f64_noret:
333 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
335 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
336 ; GCN2-NEXT: buffer_wbinvl1_vol
337 ; GCN2-NEXT: s_setpc_b64 s[30:31]
339 ; GCN3-LABEL: flat_atomic_xchg_f64_noret:
341 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
343 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
344 ; GCN3-NEXT: buffer_wbinvl1_vol
345 ; GCN3-NEXT: s_setpc_b64 s[30:31]
346 %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
; Exchange with a double operand (void function, result not returned) at a
; constant offset from %out.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers (v_add/v_addc) before the atomic; the gfx900 run expects
; it folded into the instruction as offset:32.
350 define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
351 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset:
353 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
355 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
356 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
357 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
358 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
359 ; GCN1-NEXT: buffer_wbinvl1_vol
360 ; GCN1-NEXT: s_setpc_b64 s[30:31]
362 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset:
364 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
366 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
367 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
368 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
369 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
370 ; GCN2-NEXT: buffer_wbinvl1_vol
371 ; GCN2-NEXT: s_setpc_b64 s[30:31]
373 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset:
375 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
377 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
378 ; GCN3-NEXT: buffer_wbinvl1_vol
379 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of double elements = 32 bytes, the constant seen in the checks above.
380 %gep = getelementptr double, ptr %out, i32 4
381 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
; Exchange with a double operand directly on %ptr, seq_cst ordering, in a
; function that returns double.
; Every run line expects the three-operand flat_atomic_swap_x2 with glc,
; whose first operand is v[0:1] (presumably the returned value; the ret is
; not part of this view).
385 define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
386 ; GCN1-LABEL: flat_atomic_xchg_f64_ret:
388 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
390 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
391 ; GCN1-NEXT: buffer_wbinvl1_vol
392 ; GCN1-NEXT: s_setpc_b64 s[30:31]
394 ; GCN2-LABEL: flat_atomic_xchg_f64_ret:
396 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
398 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
399 ; GCN2-NEXT: buffer_wbinvl1_vol
400 ; GCN2-NEXT: s_setpc_b64 s[30:31]
402 ; GCN3-LABEL: flat_atomic_xchg_f64_ret:
404 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
406 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
407 ; GCN3-NEXT: buffer_wbinvl1_vol
408 ; GCN3-NEXT: s_setpc_b64 s[30:31]
409 %result = atomicrmw xchg ptr %ptr, double %in seq_cst
; Exchange with a double operand at a constant offset from %out, in a
; function that returns double.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers first; the gfx900 run expects offset:32 on the atomic.
; All runs expect the glc form with v[0:1] as the first operand.
413 define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
414 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset:
416 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
418 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
419 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
420 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
421 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
422 ; GCN1-NEXT: buffer_wbinvl1_vol
423 ; GCN1-NEXT: s_setpc_b64 s[30:31]
425 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset:
427 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
429 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
430 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
431 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
432 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
433 ; GCN2-NEXT: buffer_wbinvl1_vol
434 ; GCN2-NEXT: s_setpc_b64 s[30:31]
436 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset:
438 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
440 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
441 ; GCN3-NEXT: buffer_wbinvl1_vol
442 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of double elements = 32 bytes, the constant seen in the checks above.
443 %gep = getelementptr double, ptr %out, i32 4
444 %result = atomicrmw xchg ptr %gep, double %in seq_cst
; Exchange with a double operand (void function) using the amdgpu_gfx calling
; convention with both arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3 with v_mov_b32 before
; the two-operand flat_atomic_swap_x2 (no glc).
448 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) {
449 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar:
451 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
452 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
453 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
454 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
455 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
456 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
457 ; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
458 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
459 ; GCN1-NEXT: buffer_wbinvl1_vol
460 ; GCN1-NEXT: s_setpc_b64 s[30:31]
462 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar:
464 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
466 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
467 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
468 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
469 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
470 ; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
471 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
472 ; GCN2-NEXT: buffer_wbinvl1_vol
473 ; GCN2-NEXT: s_setpc_b64 s[30:31]
475 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar:
477 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
479 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
480 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
481 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
482 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
483 ; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
484 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
485 ; GCN3-NEXT: buffer_wbinvl1_vol
486 ; GCN3-NEXT: s_setpc_b64 s[30:31]
487 %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
; Exchange with a double operand (void function) at a constant offset,
; amdgpu_gfx calling convention with both arguments marked inreg.
; The bonaire and tonga runs expect the 32-byte offset to be added with
; s_add_u32/s_addc_u32 into s[34:35] before the copies to VGPRs; the gfx900
; run expects s[4:5] copied unchanged and offset:32 on the atomic.
491 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) {
492 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
494 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495 ; GCN1-NEXT: s_add_u32 s34, s4, 32
496 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
497 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
498 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
499 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
500 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
501 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
502 ; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
503 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
504 ; GCN1-NEXT: buffer_wbinvl1_vol
505 ; GCN1-NEXT: s_setpc_b64 s[30:31]
507 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
509 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510 ; GCN2-NEXT: s_add_u32 s34, s4, 32
511 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
512 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
513 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
514 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
515 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
516 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
517 ; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
518 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
519 ; GCN2-NEXT: buffer_wbinvl1_vol
520 ; GCN2-NEXT: s_setpc_b64 s[30:31]
522 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
524 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
525 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
526 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
527 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
528 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
529 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
530 ; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
531 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
532 ; GCN3-NEXT: buffer_wbinvl1_vol
533 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of double elements = 32 bytes, the constant seen in the checks above.
534 %gep = getelementptr double, ptr %out, i32 4
535 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
; Exchange with a double operand in a function that returns double,
; amdgpu_gfx calling convention with both arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3, then the glc form of
; flat_atomic_swap_x2 with v[0:1] as the first operand.
539 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) {
540 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar:
542 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
544 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
545 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
546 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
547 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
548 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
549 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
550 ; GCN1-NEXT: buffer_wbinvl1_vol
551 ; GCN1-NEXT: s_setpc_b64 s[30:31]
553 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar:
555 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
557 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
558 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
559 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
560 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
561 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
562 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
563 ; GCN2-NEXT: buffer_wbinvl1_vol
564 ; GCN2-NEXT: s_setpc_b64 s[30:31]
566 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar:
568 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
570 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
571 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
572 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
573 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
574 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
575 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
576 ; GCN3-NEXT: buffer_wbinvl1_vol
577 ; GCN3-NEXT: s_setpc_b64 s[30:31]
578 %result = atomicrmw xchg ptr %ptr, double %in seq_cst
; Exchange with a double operand at a constant offset in a function that
; returns double, amdgpu_gfx calling convention with both arguments inreg.
; The bonaire and tonga runs expect the 32-byte offset to be added with
; s_add_u32/s_addc_u32 into s[34:35]; the gfx900 run expects offset:32 on the
; atomic. All runs expect the glc form with v[0:1] as the first operand.
582 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) {
583 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
585 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586 ; GCN1-NEXT: s_add_u32 s34, s4, 32
587 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
588 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
589 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
590 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
591 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
592 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
593 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
594 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
595 ; GCN1-NEXT: buffer_wbinvl1_vol
596 ; GCN1-NEXT: s_setpc_b64 s[30:31]
598 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
600 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601 ; GCN2-NEXT: s_add_u32 s34, s4, 32
602 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
603 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
604 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
605 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
606 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
607 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
608 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
609 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
610 ; GCN2-NEXT: buffer_wbinvl1_vol
611 ; GCN2-NEXT: s_setpc_b64 s[30:31]
613 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
615 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
617 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
618 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
619 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
620 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
621 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
622 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
623 ; GCN3-NEXT: buffer_wbinvl1_vol
624 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of double elements = 32 bytes, the constant seen in the checks above.
625 %gep = getelementptr double, ptr %out, i32 4
626 %result = atomicrmw xchg ptr %gep, double %in seq_cst
630 ; ---------------------------------------------------------------------
632 ; ---------------------------------------------------------------------
; i64 add directly on %ptr with seq_cst ordering; the function is void, so
; the atomicrmw result is not returned.
; Every run line expects the two-operand flat_atomic_add_x2 (no glc),
; followed by s_waitcnt and buffer_wbinvl1_vol.
634 define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
635 ; GCN1-LABEL: flat_atomic_add_i64_noret:
637 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
639 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
640 ; GCN1-NEXT: buffer_wbinvl1_vol
641 ; GCN1-NEXT: s_setpc_b64 s[30:31]
643 ; GCN2-LABEL: flat_atomic_add_i64_noret:
645 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
646 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
647 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
648 ; GCN2-NEXT: buffer_wbinvl1_vol
649 ; GCN2-NEXT: s_setpc_b64 s[30:31]
651 ; GCN3-LABEL: flat_atomic_add_i64_noret:
653 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
655 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
656 ; GCN3-NEXT: buffer_wbinvl1_vol
657 ; GCN3-NEXT: s_setpc_b64 s[30:31]
658 %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
; i64 add (void function, result not returned) at a constant offset from %out.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers (v_add/v_addc) before the atomic; the gfx900 run expects
; it folded into the instruction as offset:32.
662 define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
663 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset:
665 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
667 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
668 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
669 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
670 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
671 ; GCN1-NEXT: buffer_wbinvl1_vol
672 ; GCN1-NEXT: s_setpc_b64 s[30:31]
674 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset:
676 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
678 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
679 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
680 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
681 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
682 ; GCN2-NEXT: buffer_wbinvl1_vol
683 ; GCN2-NEXT: s_setpc_b64 s[30:31]
685 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset:
687 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
689 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
690 ; GCN3-NEXT: buffer_wbinvl1_vol
691 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
692 %gep = getelementptr i64, ptr %out, i64 4
693 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
; i64 add directly on %ptr with seq_cst ordering, in a function that
; returns i64.
; Every run line expects the three-operand flat_atomic_add_x2 with glc,
; whose first operand is v[0:1] (presumably the returned value; the ret is
; not part of this view).
697 define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
698 ; GCN1-LABEL: flat_atomic_add_i64_ret:
700 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
701 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
702 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
703 ; GCN1-NEXT: buffer_wbinvl1_vol
704 ; GCN1-NEXT: s_setpc_b64 s[30:31]
706 ; GCN2-LABEL: flat_atomic_add_i64_ret:
708 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
710 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
711 ; GCN2-NEXT: buffer_wbinvl1_vol
712 ; GCN2-NEXT: s_setpc_b64 s[30:31]
714 ; GCN3-LABEL: flat_atomic_add_i64_ret:
716 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
718 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
719 ; GCN3-NEXT: buffer_wbinvl1_vol
720 ; GCN3-NEXT: s_setpc_b64 s[30:31]
721 %result = atomicrmw add ptr %ptr, i64 %in seq_cst
; i64 add at a constant offset from %out, in a function that returns i64.
; The bonaire and tonga runs expect the 32-byte offset to be added to the
; address registers first; the gfx900 run expects offset:32 on the atomic.
; All runs expect the glc form with v[0:1] as the first operand.
725 define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
726 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset:
728 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
730 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
731 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
732 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
733 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
734 ; GCN1-NEXT: buffer_wbinvl1_vol
735 ; GCN1-NEXT: s_setpc_b64 s[30:31]
737 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset:
739 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
740 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
741 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
742 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
743 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
744 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
745 ; GCN2-NEXT: buffer_wbinvl1_vol
746 ; GCN2-NEXT: s_setpc_b64 s[30:31]
748 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset:
750 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
752 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
753 ; GCN3-NEXT: buffer_wbinvl1_vol
754 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
755 %gep = getelementptr i64, ptr %out, i64 4
756 %result = atomicrmw add ptr %gep, i64 %in seq_cst
; i64 add (void function) using the amdgpu_gfx calling convention with both
; arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3 with v_mov_b32 before
; the two-operand flat_atomic_add_x2 (no glc).
760 define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
761 ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar:
763 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
764 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
765 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
766 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
767 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
768 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
769 ; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
770 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
771 ; GCN1-NEXT: buffer_wbinvl1_vol
772 ; GCN1-NEXT: s_setpc_b64 s[30:31]
774 ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar:
776 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
778 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
779 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
780 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
781 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
782 ; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
783 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
784 ; GCN2-NEXT: buffer_wbinvl1_vol
785 ; GCN2-NEXT: s_setpc_b64 s[30:31]
787 ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar:
789 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
791 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
792 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
793 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
794 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
795 ; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
796 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
797 ; GCN3-NEXT: buffer_wbinvl1_vol
798 ; GCN3-NEXT: s_setpc_b64 s[30:31]
799 %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
; i64 add (void function) at a constant offset, amdgpu_gfx calling convention
; with both arguments marked inreg.
; The bonaire and tonga runs expect the 32-byte offset to be added with
; s_add_u32/s_addc_u32 into s[34:35] before the copies to VGPRs; the gfx900
; run expects s[4:5] copied unchanged and offset:32 on the atomic.
803 define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
804 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar:
806 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807 ; GCN1-NEXT: s_add_u32 s34, s4, 32
808 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
809 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
810 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
811 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
812 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
813 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
814 ; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
815 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
816 ; GCN1-NEXT: buffer_wbinvl1_vol
817 ; GCN1-NEXT: s_setpc_b64 s[30:31]
819 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar:
821 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
822 ; GCN2-NEXT: s_add_u32 s34, s4, 32
823 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
824 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
825 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
826 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
827 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
828 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
829 ; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
830 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
831 ; GCN2-NEXT: buffer_wbinvl1_vol
832 ; GCN2-NEXT: s_setpc_b64 s[30:31]
834 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar:
836 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
837 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
838 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
839 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
840 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
841 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
842 ; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32
843 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
844 ; GCN3-NEXT: buffer_wbinvl1_vol
845 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 elements = 32 bytes, the constant seen in the checks above.
846 %gep = getelementptr i64, ptr %out, i64 4
847 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
; i64 add in a function that returns i64, amdgpu_gfx calling convention with
; both arguments marked inreg.
; Every run line expects s4-s7 to be copied into v0-v3, then the glc form of
; flat_atomic_add_x2 with v[0:1] as the first operand.
851 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
852 ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar:
854 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
856 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
857 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
858 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
859 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
860 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
861 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
862 ; GCN1-NEXT: buffer_wbinvl1_vol
863 ; GCN1-NEXT: s_setpc_b64 s[30:31]
865 ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar:
867 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
868 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
869 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
870 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
871 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
872 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
873 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
874 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
875 ; GCN2-NEXT: buffer_wbinvl1_vol
876 ; GCN2-NEXT: s_setpc_b64 s[30:31]
878 ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar:
880 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
882 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
883 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
884 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
885 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
886 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
887 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
888 ; GCN3-NEXT: buffer_wbinvl1_vol
889 ; GCN3-NEXT: s_setpc_b64 s[30:31]
890 %result = atomicrmw add ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw add at 4 x i64 (32 bytes) past %out in a function
; declared to return i64; the pointer and value are inreg arguments.
; The checks expect the glc form on every target: GCN1 and GCN2 add 32 to the
; address with s_add_u32/s_addc_u32, while GCN3 uses offset:32.
894 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
895 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar:
897 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
898 ; GCN1-NEXT: s_add_u32 s34, s4, 32
899 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
900 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
901 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
902 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
903 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
904 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
905 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
906 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
907 ; GCN1-NEXT: buffer_wbinvl1_vol
908 ; GCN1-NEXT: s_setpc_b64 s[30:31]
910 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar:
912 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913 ; GCN2-NEXT: s_add_u32 s34, s4, 32
914 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
915 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
916 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
917 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
918 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
919 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
920 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
921 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
922 ; GCN2-NEXT: buffer_wbinvl1_vol
923 ; GCN2-NEXT: s_setpc_b64 s[30:31]
925 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar:
927 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
929 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
930 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
931 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
932 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
933 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
934 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
935 ; GCN3-NEXT: buffer_wbinvl1_vol
936 ; GCN3-NEXT: s_setpc_b64 s[30:31]
937 %gep = getelementptr i64, ptr %out, i64 4
938 %result = atomicrmw add ptr %gep, i64 %in seq_cst
942 ; ---------------------------------------------------------------------
944 ; ---------------------------------------------------------------------
; seq_cst i64 atomicrmw sub directly on %ptr with the result unused.
; All three targets are expected to emit a single flat_atomic_sub_x2 on the
; incoming v[0:1] address and v[2:3] value, without the glc modifier.
946 define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
947 ; GCN1-LABEL: flat_atomic_sub_i64_noret:
949 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
950 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
951 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
952 ; GCN1-NEXT: buffer_wbinvl1_vol
953 ; GCN1-NEXT: s_setpc_b64 s[30:31]
955 ; GCN2-LABEL: flat_atomic_sub_i64_noret:
957 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
958 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
959 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
960 ; GCN2-NEXT: buffer_wbinvl1_vol
961 ; GCN2-NEXT: s_setpc_b64 s[30:31]
963 ; GCN3-LABEL: flat_atomic_sub_i64_noret:
965 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
967 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
968 ; GCN3-NEXT: buffer_wbinvl1_vol
969 ; GCN3-NEXT: s_setpc_b64 s[30:31]
970 %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw sub at 4 x i64 (32 bytes) past %out with the result
; unused. The checks expect GCN1 (v_add_i32) and GCN2 (v_add_u32) to add 32 to
; the address in VGPRs with a carry into the high half, while GCN3 uses
; offset:32 on the atomic itself.
974 define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
975 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset:
977 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
979 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
980 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
981 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
982 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
983 ; GCN1-NEXT: buffer_wbinvl1_vol
984 ; GCN1-NEXT: s_setpc_b64 s[30:31]
986 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset:
988 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
990 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
991 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
992 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
993 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
994 ; GCN2-NEXT: buffer_wbinvl1_vol
995 ; GCN2-NEXT: s_setpc_b64 s[30:31]
997 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset:
999 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
1001 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1002 ; GCN3-NEXT: buffer_wbinvl1_vol
1003 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1004 %gep = getelementptr i64, ptr %out, i64 4
1005 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw sub directly on %ptr in a function declared to return
; i64. All three targets are expected to emit one flat_atomic_sub_x2 with the
; glc modifier, writing its destination to v[0:1].
1009 define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
1010 ; GCN1-LABEL: flat_atomic_sub_i64_ret:
1012 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1013 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1014 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1015 ; GCN1-NEXT: buffer_wbinvl1_vol
1016 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1018 ; GCN2-LABEL: flat_atomic_sub_i64_ret:
1020 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1021 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1022 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1023 ; GCN2-NEXT: buffer_wbinvl1_vol
1024 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1026 ; GCN3-LABEL: flat_atomic_sub_i64_ret:
1028 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1029 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1030 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1031 ; GCN3-NEXT: buffer_wbinvl1_vol
1032 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1033 %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw sub at 4 x i64 (32 bytes) past %out in a function
; declared to return i64. The checks expect the glc form on every target:
; GCN1 and GCN2 add 32 to the address in VGPRs first, while GCN3 uses
; offset:32 on the atomic.
1037 define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
1038 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset:
1040 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1041 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1042 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1043 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1044 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1045 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1046 ; GCN1-NEXT: buffer_wbinvl1_vol
1047 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1049 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset:
1051 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1052 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1053 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1054 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1055 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
1056 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1057 ; GCN2-NEXT: buffer_wbinvl1_vol
1058 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1060 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset:
1062 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1063 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
1064 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1065 ; GCN3-NEXT: buffer_wbinvl1_vol
1066 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1067 %gep = getelementptr i64, ptr %out, i64 4
1068 %result = atomicrmw sub ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw sub directly on %ptr with the result unused; the
; pointer and value are inreg arguments.
; All three targets are expected to copy s4-s7 into VGPRs and emit the same
; flat_atomic_sub_x2 without the glc modifier.
1072 define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
1073 ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar:
1075 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1077 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1078 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1079 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
1080 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1081 ; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1082 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1083 ; GCN1-NEXT: buffer_wbinvl1_vol
1084 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1086 ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar:
1088 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1089 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1090 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1091 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1092 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
1093 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1094 ; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1095 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1096 ; GCN2-NEXT: buffer_wbinvl1_vol
1097 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1099 ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar:
1101 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1103 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1104 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1105 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1106 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1107 ; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1108 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1109 ; GCN3-NEXT: buffer_wbinvl1_vol
1110 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1111 %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw sub whose result is unused; the pointer and value are
; inreg arguments and the address is 4 x i64 (32 bytes) past %out.
; The checks expect GCN1 and GCN2 to build the address with
; s_add_u32/s_addc_u32, while GCN3 keeps the base and uses offset:32.
1115 define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
1116 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
1118 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1119 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1120 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1121 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
1122 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1123 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1124 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
1125 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1126 ; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1127 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1128 ; GCN1-NEXT: buffer_wbinvl1_vol
1129 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1131 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
1133 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134 ; GCN2-NEXT: s_add_u32 s34, s4, 32
1135 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1136 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
1137 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1138 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1139 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
1140 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1141 ; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1142 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1143 ; GCN2-NEXT: buffer_wbinvl1_vol
1144 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1146 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
1148 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1149 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1150 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1151 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1152 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1153 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1154 ; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32
1155 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1156 ; GCN3-NEXT: buffer_wbinvl1_vol
1157 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1158 %gep = getelementptr i64, ptr %out, i64 4
1159 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw sub directly on %ptr in a function declared to return
; i64; the pointer and value are inreg arguments.
; All three targets are expected to copy s4-s7 into VGPRs and emit the same
; flat_atomic_sub_x2 with the glc modifier and v[0:1] as destination.
1163 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
1164 ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar:
1166 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1167 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1168 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1169 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1170 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
1171 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1172 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1173 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1174 ; GCN1-NEXT: buffer_wbinvl1_vol
1175 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1177 ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar:
1179 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1180 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1181 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1182 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1183 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
1184 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1185 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1186 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1187 ; GCN2-NEXT: buffer_wbinvl1_vol
1188 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1190 ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar:
1192 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1194 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1195 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1196 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1197 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1198 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1199 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1200 ; GCN3-NEXT: buffer_wbinvl1_vol
1201 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1202 %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw sub at 4 x i64 (32 bytes) past %out in a function
; declared to return i64; the pointer and value are inreg arguments.
; The checks expect the glc form on every target: GCN1 and GCN2 add 32 to the
; address with s_add_u32/s_addc_u32, while GCN3 uses offset:32.
1206 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
1207 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
1209 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1211 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1212 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
1213 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1214 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1215 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
1216 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1217 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1218 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1219 ; GCN1-NEXT: buffer_wbinvl1_vol
1220 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1222 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
1224 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1225 ; GCN2-NEXT: s_add_u32 s34, s4, 32
1226 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1227 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
1228 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1229 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1230 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
1231 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1232 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1233 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1234 ; GCN2-NEXT: buffer_wbinvl1_vol
1235 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1237 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
1239 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1241 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1242 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1243 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1244 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1245 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
1246 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1247 ; GCN3-NEXT: buffer_wbinvl1_vol
1248 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1249 %gep = getelementptr i64, ptr %out, i64 4
1250 %result = atomicrmw sub ptr %gep, i64 %in seq_cst
1254 ; ---------------------------------------------------------------------
1256 ; ---------------------------------------------------------------------
; seq_cst i64 atomicrmw and directly on %ptr with the result unused.
; All three targets are expected to emit a single flat_atomic_and_x2 on the
; incoming v[0:1] address and v[2:3] value, without the glc modifier.
1258 define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
1259 ; GCN1-LABEL: flat_atomic_and_i64_noret:
1261 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1262 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
1263 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1264 ; GCN1-NEXT: buffer_wbinvl1_vol
1265 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1267 ; GCN2-LABEL: flat_atomic_and_i64_noret:
1269 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1270 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
1271 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1272 ; GCN2-NEXT: buffer_wbinvl1_vol
1273 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1275 ; GCN3-LABEL: flat_atomic_and_i64_noret:
1277 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
1279 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1280 ; GCN3-NEXT: buffer_wbinvl1_vol
1281 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1282 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw and at 4 x i64 (32 bytes) past %out with the result
; unused. The checks expect GCN1 (v_add_i32) and GCN2 (v_add_u32) to add 32 to
; the address in VGPRs with a carry into the high half, while GCN3 uses
; offset:32 on the atomic itself.
1286 define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
1287 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset:
1289 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1291 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1292 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1293 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
1294 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1295 ; GCN1-NEXT: buffer_wbinvl1_vol
1296 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1298 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset:
1300 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1302 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1303 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1304 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
1305 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1306 ; GCN2-NEXT: buffer_wbinvl1_vol
1307 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1309 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset:
1311 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1312 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
1313 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1314 ; GCN3-NEXT: buffer_wbinvl1_vol
1315 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1316 %gep = getelementptr i64, ptr %out, i64 4
1317 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw and directly on %ptr in a function declared to return
; i64. All three targets are expected to emit one flat_atomic_and_x2 with the
; glc modifier, writing its destination to v[0:1].
1321 define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
1322 ; GCN1-LABEL: flat_atomic_and_i64_ret:
1324 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1325 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1326 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1327 ; GCN1-NEXT: buffer_wbinvl1_vol
1328 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1330 ; GCN2-LABEL: flat_atomic_and_i64_ret:
1332 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1333 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1334 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1335 ; GCN2-NEXT: buffer_wbinvl1_vol
1336 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1338 ; GCN3-LABEL: flat_atomic_and_i64_ret:
1340 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1341 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1342 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1343 ; GCN3-NEXT: buffer_wbinvl1_vol
1344 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1345 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw and at 4 x i64 (32 bytes) past %out in a function
; declared to return i64. The checks expect the glc form on every target:
; GCN1 and GCN2 add 32 to the address in VGPRs first, while GCN3 uses
; offset:32 on the atomic.
1349 define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
1350 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset:
1352 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1353 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1354 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1355 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1356 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1357 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1358 ; GCN1-NEXT: buffer_wbinvl1_vol
1359 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1361 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset:
1363 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1364 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1365 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1366 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1367 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
1368 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1369 ; GCN2-NEXT: buffer_wbinvl1_vol
1370 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1372 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset:
1374 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1375 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
1376 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1377 ; GCN3-NEXT: buffer_wbinvl1_vol
1378 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1379 %gep = getelementptr i64, ptr %out, i64 4
1380 %result = atomicrmw and ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw and directly on %ptr with the result unused; the
; pointer and value are inreg arguments.
; All three targets are expected to copy s4-s7 into VGPRs and emit the same
; flat_atomic_and_x2 without the glc modifier.
1384 define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
1385 ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar:
1387 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1388 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1389 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1390 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1391 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
1392 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1393 ; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
1394 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1395 ; GCN1-NEXT: buffer_wbinvl1_vol
1396 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1398 ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar:
1400 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1401 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1402 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1403 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1404 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
1405 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1406 ; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
1407 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1408 ; GCN2-NEXT: buffer_wbinvl1_vol
1409 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1411 ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar:
1413 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1415 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1416 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1417 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1418 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1419 ; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
1420 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1421 ; GCN3-NEXT: buffer_wbinvl1_vol
1422 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1423 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw and whose result is unused; the pointer and value are
; inreg arguments and the address is 4 x i64 (32 bytes) past %out.
; The checks expect GCN1 and GCN2 to build the address with
; s_add_u32/s_addc_u32, while GCN3 keeps the base and uses offset:32.
1427 define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
1428 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar:
1430 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1431 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1432 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1433 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
1434 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1435 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1436 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
1437 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1438 ; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
1439 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1440 ; GCN1-NEXT: buffer_wbinvl1_vol
1441 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1443 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar:
1445 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1446 ; GCN2-NEXT: s_add_u32 s34, s4, 32
1447 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1448 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
1449 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1450 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1451 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
1452 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1453 ; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
1454 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1455 ; GCN2-NEXT: buffer_wbinvl1_vol
1456 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1458 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar:
1460 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1461 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1462 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1463 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1464 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1465 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1466 ; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32
1467 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1468 ; GCN3-NEXT: buffer_wbinvl1_vol
1469 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1470 %gep = getelementptr i64, ptr %out, i64 4
1471 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
; seq_cst i64 atomicrmw and directly on %ptr in a function declared to return
; i64; the pointer and value are inreg arguments.
; All three targets are expected to copy s4-s7 into VGPRs and emit the same
; flat_atomic_and_x2 with the glc modifier and v[0:1] as destination.
1475 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
1476 ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar:
1478 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1479 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1480 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1481 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1482 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
1483 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1484 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
1485 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1486 ; GCN1-NEXT: buffer_wbinvl1_vol
1487 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1489 ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar:
1491 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1492 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1493 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1494 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1495 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
1496 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1497 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
1498 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1499 ; GCN2-NEXT: buffer_wbinvl1_vol
1500 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1502 ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar:
1504 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1505 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1506 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1507 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1508 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1509 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1510 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
1511 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1512 ; GCN3-NEXT: buffer_wbinvl1_vol
1513 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1514 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
; seq_cst i64 atomicrmw and at 4 x i64 (32 bytes) past %out in a function
; declared to return i64; the pointer and value are inreg arguments.
; The checks expect the glc form on every target: GCN1 and GCN2 add 32 to the
; address with s_add_u32/s_addc_u32, while GCN3 uses offset:32.
1518 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
1519 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar:
1521 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1522 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1523 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1524 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
1525 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1526 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1527 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
1528 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1529 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
1530 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1531 ; GCN1-NEXT: buffer_wbinvl1_vol
1532 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1534 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar:
1536 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1537 ; GCN2-NEXT: s_add_u32 s34, s4, 32
1538 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1539 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
1540 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1541 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1542 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
1543 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1544 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
1545 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1546 ; GCN2-NEXT: buffer_wbinvl1_vol
1547 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1549 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar:
1551 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1552 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1553 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1554 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1555 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
1556 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1557 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
1558 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1559 ; GCN3-NEXT: buffer_wbinvl1_vol
1560 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1561 %gep = getelementptr i64, ptr %out, i64 4
1562 %result = atomicrmw and ptr %gep, i64 %in seq_cst
1566 ; ---------------------------------------------------------------------
1568 ; ---------------------------------------------------------------------
; seq_cst i64 atomicrmw nand directly on %ptr with the result unused.
; Unlike the add/sub/and cases, the checks expect no single nand atomic.
; Each target loads the current value, then loops at %atomicrmw.start:
; it computes not(old and %in) with v_and_b32/v_not_b32, issues
; flat_atomic_cmpswap_x2, and repeats until the returned value equals the
; value it compared against (v_cmp_eq_u64 feeding the exec mask update).
; GCN1 and GCN2 do the initial load as two flat_load_dword, while GCN3 uses
; one flat_load_dwordx2.
1570 define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
1571 ; GCN1-LABEL: flat_atomic_nand_i64_noret:
1573 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
1575 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1576 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
1577 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
1578 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1579 ; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start
1580 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1581 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1582 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
1583 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
1584 ; GCN1-NEXT: v_not_b32_e32 v5, v4
1585 ; GCN1-NEXT: v_not_b32_e32 v4, v8
1586 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1587 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1588 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1589 ; GCN1-NEXT: buffer_wbinvl1_vol
1590 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1591 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
1592 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1593 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
1594 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1595 ; GCN1-NEXT: s_cbranch_execnz .LBB40_1
1596 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1597 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1598 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1600 ; GCN2-LABEL: flat_atomic_nand_i64_noret:
1602 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1603 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1604 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1605 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
1606 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
1607 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1608 ; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start
1609 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1610 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1611 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
1612 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
1613 ; GCN2-NEXT: v_not_b32_e32 v5, v4
1614 ; GCN2-NEXT: v_not_b32_e32 v4, v8
1615 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1616 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1617 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1618 ; GCN2-NEXT: buffer_wbinvl1_vol
1619 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1620 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
1621 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1622 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
1623 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1624 ; GCN2-NEXT: s_cbranch_execnz .LBB40_1
1625 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1626 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1627 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1629 ; GCN3-LABEL: flat_atomic_nand_i64_noret:
1631 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1632 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
1633 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1634 ; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start
1635 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1636 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1637 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
1638 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
1639 ; GCN3-NEXT: v_not_b32_e32 v5, v4
1640 ; GCN3-NEXT: v_not_b32_e32 v4, v8
1641 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1642 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1643 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1644 ; GCN3-NEXT: buffer_wbinvl1_vol
1645 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1646 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
1647 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1648 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
1649 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1650 ; GCN3-NEXT: s_cbranch_execnz .LBB40_1
1651 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1652 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1653 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1654 %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) on a flat pointer at %out + 4 x i64 (32 bytes).
; The function is declared void, so the fetched value is not returned.
; The checks expect a compare-and-swap retry loop (v_and + v_not feeding
; flat_atomic_cmpswap_x2) rather than a single atomic instruction.
; The bonaire and tonga checks add the 32/36 byte offsets explicitly and load the
; two halves with separate dword loads; the gfx900 checks fold the offset into
; `offset:32` and use one dwordx2 load.
1658 define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
1659 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset:
1661 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1662 ; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
1663 ; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1664 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
1665 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1666 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
1667 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
1668 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1669 ; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start
1670 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1671 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1672 ; GCN1-NEXT: v_and_b32_e32 v0, v7, v3
1673 ; GCN1-NEXT: v_and_b32_e32 v1, v6, v2
1674 ; GCN1-NEXT: v_not_b32_e32 v5, v0
1675 ; GCN1-NEXT: v_not_b32_e32 v4, v1
1676 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1677 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
1678 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1679 ; GCN1-NEXT: buffer_wbinvl1_vol
1680 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
1681 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
1682 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1683 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
1684 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1685 ; GCN1-NEXT: s_cbranch_execnz .LBB41_1
1686 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1687 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1688 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1690 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset:
1692 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1693 ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
1694 ; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
1695 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
1696 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1697 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
1698 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
1699 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1700 ; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start
1701 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1702 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1703 ; GCN2-NEXT: v_and_b32_e32 v0, v7, v3
1704 ; GCN2-NEXT: v_and_b32_e32 v1, v6, v2
1705 ; GCN2-NEXT: v_not_b32_e32 v5, v0
1706 ; GCN2-NEXT: v_not_b32_e32 v4, v1
1707 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1708 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
1709 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1710 ; GCN2-NEXT: buffer_wbinvl1_vol
1711 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
1712 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
1713 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1714 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
1715 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1716 ; GCN2-NEXT: s_cbranch_execnz .LBB41_1
1717 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1718 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1719 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1721 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset:
1723 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
1725 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1726 ; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start
1727 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1728 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1729 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
1730 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
1731 ; GCN3-NEXT: v_not_b32_e32 v5, v4
1732 ; GCN3-NEXT: v_not_b32_e32 v4, v8
1733 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1734 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
1735 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1736 ; GCN3-NEXT: buffer_wbinvl1_vol
1737 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1738 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
1739 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1740 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
1741 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1742 ; GCN3-NEXT: s_cbranch_execnz .LBB41_1
1743 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1744 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1745 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1746 %gep = getelementptr i64, ptr %out, i64 4
1747 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) directly on the flat pointer %ptr, in a function
; whose return type is i64; the atomic's result is named %result.
; The checks expect a compare-and-swap retry loop built from v_and + v_not and
; flat_atomic_cmpswap_x2. At the top of each iteration the last observed value is
; copied into v[6:7] so it can be compared against what cmpswap hands back, and
; after the loop the cmpswap result in v[4:5] is moved into v0/v1.
; The bonaire and tonga checks load the two halves with separate dword loads
; (second address = %ptr + 4); the gfx900 checks use a single dwordx2 load.
1751 define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
1752 ; GCN1-LABEL: flat_atomic_nand_i64_ret:
1754 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1755 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
1756 ; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
1757 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
1758 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
1759 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1760 ; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start
1761 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1762 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1763 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
1764 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
1765 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
1766 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
1767 ; GCN1-NEXT: v_not_b32_e32 v5, v4
1768 ; GCN1-NEXT: v_not_b32_e32 v4, v8
1769 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1770 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1771 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1772 ; GCN1-NEXT: buffer_wbinvl1_vol
1773 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1774 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1775 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1776 ; GCN1-NEXT: s_cbranch_execnz .LBB42_1
1777 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1778 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1779 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
1780 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
1781 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1783 ; GCN2-LABEL: flat_atomic_nand_i64_ret:
1785 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1786 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
1787 ; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
1788 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
1789 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
1790 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1791 ; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start
1792 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1793 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1794 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
1795 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
1796 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
1797 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
1798 ; GCN2-NEXT: v_not_b32_e32 v5, v4
1799 ; GCN2-NEXT: v_not_b32_e32 v4, v8
1800 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1801 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1802 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1803 ; GCN2-NEXT: buffer_wbinvl1_vol
1804 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1805 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1806 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1807 ; GCN2-NEXT: s_cbranch_execnz .LBB42_1
1808 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1809 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1810 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
1811 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
1812 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1814 ; GCN3-LABEL: flat_atomic_nand_i64_ret:
1816 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1817 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
1818 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1819 ; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start
1820 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1821 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1822 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
1823 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
1824 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
1825 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
1826 ; GCN3-NEXT: v_not_b32_e32 v5, v4
1827 ; GCN3-NEXT: v_not_b32_e32 v4, v8
1828 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1829 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
1830 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1831 ; GCN3-NEXT: buffer_wbinvl1_vol
1832 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1833 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1834 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1835 ; GCN3-NEXT: s_cbranch_execnz .LBB42_1
1836 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1837 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1838 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
1839 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
1840 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1841 %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) on a flat pointer at %out + 4 x i64 (32 bytes),
; in a function whose return type is i64; the atomic's result is named %result.
; The checks expect a compare-and-swap retry loop (v_and + v_not feeding
; flat_atomic_cmpswap_x2).
; In the bonaire and tonga checks the offset address is materialized in v[4:5]
; (plus a +36 address for the high dword load), and cmpswap writes straight into
; v[0:1], so no copies follow the loop. In the gfx900 checks the offset is
; encoded as `offset:32` and the cmpswap result is moved from v[4:5] to v0/v1
; after the loop.
1845 define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
1846 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset:
1848 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1849 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
1850 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1851 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
1852 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1853 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
1854 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
1855 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1856 ; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start
1857 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1858 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1859 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
1860 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
1861 ; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
1862 ; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
1863 ; GCN1-NEXT: v_not_b32_e32 v7, v0
1864 ; GCN1-NEXT: v_not_b32_e32 v6, v1
1865 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1866 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
1867 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1868 ; GCN1-NEXT: buffer_wbinvl1_vol
1869 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
1870 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1871 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1872 ; GCN1-NEXT: s_cbranch_execnz .LBB43_1
1873 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1874 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1875 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1877 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset:
1879 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1880 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
1881 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1882 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
1883 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1884 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
1885 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
1886 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1887 ; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start
1888 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1889 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1890 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
1891 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
1892 ; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
1893 ; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
1894 ; GCN2-NEXT: v_not_b32_e32 v7, v0
1895 ; GCN2-NEXT: v_not_b32_e32 v6, v1
1896 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1897 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
1898 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1899 ; GCN2-NEXT: buffer_wbinvl1_vol
1900 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
1901 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1902 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1903 ; GCN2-NEXT: s_cbranch_execnz .LBB43_1
1904 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1905 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1906 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1908 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset:
1910 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1911 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
1912 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1913 ; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start
1914 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1915 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1916 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
1917 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
1918 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
1919 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
1920 ; GCN3-NEXT: v_not_b32_e32 v5, v4
1921 ; GCN3-NEXT: v_not_b32_e32 v4, v8
1922 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1923 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
1924 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1925 ; GCN3-NEXT: buffer_wbinvl1_vol
1926 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
1927 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1928 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1929 ; GCN3-NEXT: s_cbranch_execnz .LBB43_1
1930 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1931 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1932 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
1933 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
1934 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1935 %gep = getelementptr i64, ptr %out, i64 4
1936 %result = atomicrmw nand ptr %gep, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) on %ptr, where both arguments are marked `inreg`
; and the function uses the amdgpu_gfx calling convention. It is declared void,
; so the fetched value is not returned.
; In the checks the pointer is read from s4/s5 and copied into VGPRs with
; v_mov_b32, while the operand is used directly from s6/s7 by the v_and
; instructions. The expected code is a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) whose exit mask is accumulated in s[34:35].
; The bonaire and tonga checks form a second address (%ptr + 4) with
; s_add_u32/s_addc_u32 for the high dword load; the gfx900 checks use one
; dwordx2 load.
1940 define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
1941 ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar:
1943 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1944 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1945 ; GCN1-NEXT: s_add_u32 s34, s4, 4
1946 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1947 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1948 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
1949 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
1950 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
1951 ; GCN1-NEXT: flat_load_dword v3, v[3:4]
1952 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
1953 ; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start
1954 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1955 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1956 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
1957 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
1958 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
1959 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
1960 ; GCN1-NEXT: v_not_b32_e32 v1, v0
1961 ; GCN1-NEXT: v_not_b32_e32 v0, v6
1962 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1963 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
1964 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1965 ; GCN1-NEXT: buffer_wbinvl1_vol
1966 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1967 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
1968 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
1969 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
1970 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
1971 ; GCN1-NEXT: s_cbranch_execnz .LBB44_1
1972 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1973 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
1974 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1976 ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar:
1978 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1979 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1980 ; GCN2-NEXT: s_add_u32 s34, s4, 4
1981 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1982 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1983 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
1984 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
1985 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
1986 ; GCN2-NEXT: flat_load_dword v3, v[3:4]
1987 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
1988 ; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start
1989 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1990 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1991 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
1992 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
1993 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
1994 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
1995 ; GCN2-NEXT: v_not_b32_e32 v1, v0
1996 ; GCN2-NEXT: v_not_b32_e32 v0, v6
1997 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1998 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
1999 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2000 ; GCN2-NEXT: buffer_wbinvl1_vol
2001 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2002 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
2003 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2004 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
2005 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2006 ; GCN2-NEXT: s_cbranch_execnz .LBB44_1
2007 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2008 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2009 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2011 ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar:
2013 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2014 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2015 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2016 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
2017 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2018 ; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start
2019 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2020 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2021 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
2022 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
2023 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
2024 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
2025 ; GCN3-NEXT: v_not_b32_e32 v1, v0
2026 ; GCN3-NEXT: v_not_b32_e32 v0, v6
2027 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2028 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2029 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2030 ; GCN3-NEXT: buffer_wbinvl1_vol
2031 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2032 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
2033 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2034 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
2035 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2036 ; GCN3-NEXT: s_cbranch_execnz .LBB44_1
2037 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2038 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2039 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2040 %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) at %out + 4 x i64 (32 bytes), where both
; arguments are marked `inreg` and the function uses the amdgpu_gfx calling
; convention. It is declared void, so the fetched value is not returned.
; The checks expect a compare-and-swap retry loop (flat_atomic_cmpswap_x2) with
; the operand taken directly from s6/s7.
; In the bonaire and tonga checks the +32 and +36 addresses are computed with
; scalar adds into s[34:35] and s[36:37], and s[36:37] is then reused as the loop
; exit mask. In the gfx900 checks the base pointer from s4/s5 is used with
; `offset:32` and the exit mask is kept in s[34:35].
2044 define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2045 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
2047 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2048 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2049 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2050 ; GCN1-NEXT: s_add_u32 s36, s4, 36
2051 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
2052 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
2053 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
2054 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
2055 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
2056 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
2057 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
2058 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
2059 ; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start
2060 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2061 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2062 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
2063 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
2064 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
2065 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
2066 ; GCN1-NEXT: v_not_b32_e32 v1, v0
2067 ; GCN1-NEXT: v_not_b32_e32 v0, v6
2068 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2069 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2070 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2071 ; GCN1-NEXT: buffer_wbinvl1_vol
2072 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2073 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
2074 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2075 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
2076 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
2077 ; GCN1-NEXT: s_cbranch_execnz .LBB45_1
2078 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2079 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
2080 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2082 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
2084 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2085 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2086 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2087 ; GCN2-NEXT: s_add_u32 s36, s4, 36
2088 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
2089 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
2090 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
2091 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
2092 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
2093 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
2094 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
2095 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
2096 ; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start
2097 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2098 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2099 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
2100 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
2101 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
2102 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
2103 ; GCN2-NEXT: v_not_b32_e32 v1, v0
2104 ; GCN2-NEXT: v_not_b32_e32 v0, v6
2105 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2106 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2107 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2108 ; GCN2-NEXT: buffer_wbinvl1_vol
2109 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2110 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
2111 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2112 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
2113 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
2114 ; GCN2-NEXT: s_cbranch_execnz .LBB45_1
2115 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2116 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
2117 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2119 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
2121 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2122 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2123 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2124 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
2125 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2126 ; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start
2127 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2128 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2129 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
2130 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
2131 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
2132 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
2133 ; GCN3-NEXT: v_not_b32_e32 v1, v0
2134 ; GCN3-NEXT: v_not_b32_e32 v0, v6
2135 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2136 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
2137 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2138 ; GCN3-NEXT: buffer_wbinvl1_vol
2139 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2140 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
2141 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2142 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
2143 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2144 ; GCN3-NEXT: s_cbranch_execnz .LBB45_1
2145 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2146 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2147 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2148 %gep = getelementptr i64, ptr %out, i64 4
2149 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) on %ptr, where both arguments are marked `inreg`
; and the function uses the amdgpu_gfx calling convention. The return type is i64
; and the atomic's result is named %result.
; The checks expect a compare-and-swap retry loop (flat_atomic_cmpswap_x2): the
; pointer from s4/s5 is re-copied into v[4:5] inside the loop, the operand is
; read from s6/s7, and the last observed value is saved to v[2:3] at the top of
; each iteration for the equality test. cmpswap writes into v[0:1], and no
; register copies follow the loop.
2153 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
2154 ; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar:
2156 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2157 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2158 ; GCN1-NEXT: s_add_u32 s34, s4, 4
2159 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2160 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2161 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2162 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2163 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
2164 ; GCN1-NEXT: flat_load_dword v1, v[2:3]
2165 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
2166 ; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start
2167 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2168 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2169 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
2170 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
2171 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
2172 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
2173 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
2174 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
2175 ; GCN1-NEXT: v_not_b32_e32 v1, v0
2176 ; GCN1-NEXT: v_not_b32_e32 v0, v6
2177 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2178 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2179 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2180 ; GCN1-NEXT: buffer_wbinvl1_vol
2181 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2182 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2183 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
2184 ; GCN1-NEXT: s_cbranch_execnz .LBB46_1
2185 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2186 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
2187 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2189 ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar:
2191 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2192 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2193 ; GCN2-NEXT: s_add_u32 s34, s4, 4
2194 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2195 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2196 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2197 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2198 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
2199 ; GCN2-NEXT: flat_load_dword v1, v[2:3]
2200 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
2201 ; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start
2202 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2203 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2204 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
2205 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
2206 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
2207 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
2208 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
2209 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
2210 ; GCN2-NEXT: v_not_b32_e32 v1, v0
2211 ; GCN2-NEXT: v_not_b32_e32 v0, v6
2212 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2213 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2214 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2215 ; GCN2-NEXT: buffer_wbinvl1_vol
2216 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2217 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2218 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2219 ; GCN2-NEXT: s_cbranch_execnz .LBB46_1
2220 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2221 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2222 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2224 ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar:
2226 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2227 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2228 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2229 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2230 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2231 ; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start
2232 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2233 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2234 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
2235 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
2236 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
2237 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
2238 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
2239 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
2240 ; GCN3-NEXT: v_not_b32_e32 v1, v0
2241 ; GCN3-NEXT: v_not_b32_e32 v0, v6
2242 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2243 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2244 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2245 ; GCN3-NEXT: buffer_wbinvl1_vol
2246 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2247 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2248 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2249 ; GCN3-NEXT: s_cbranch_execnz .LBB46_1
2250 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2251 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2252 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2253 %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
; i64 `atomicrmw nand` (seq_cst) at %out + 4 x i64 (32 bytes), where both
; arguments are marked `inreg` and the function uses the amdgpu_gfx calling
; convention. The return type is i64 and the atomic's result is named %result.
; The checks expect a compare-and-swap retry loop (flat_atomic_cmpswap_x2) that
; writes into v[0:1], with the operand read from s6/s7.
; In the bonaire and tonga checks the +32 and +36 addresses are computed with
; scalar adds into s[34:35] and s[36:37], and s[36:37] is then reused as the loop
; exit mask. In the gfx900 checks the base pointer from s4/s5 is used with
; `offset:32` and the exit mask is kept in s[34:35].
2257 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2258 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
2260 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2261 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2262 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2263 ; GCN1-NEXT: s_add_u32 s36, s4, 36
2264 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
2265 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
2266 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
2267 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2268 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2269 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
2270 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
2271 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
2272 ; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start
2273 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2274 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2275 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
2276 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
2277 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
2278 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
2279 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
2280 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
2281 ; GCN1-NEXT: v_not_b32_e32 v1, v0
2282 ; GCN1-NEXT: v_not_b32_e32 v0, v6
2283 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2284 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2285 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2286 ; GCN1-NEXT: buffer_wbinvl1_vol
2287 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2288 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2289 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
2290 ; GCN1-NEXT: s_cbranch_execnz .LBB47_1
2291 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2292 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
2293 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2295 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
2297 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2299 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2300 ; GCN2-NEXT: s_add_u32 s36, s4, 36
2301 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
2302 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
2303 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
2304 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2305 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2306 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
2307 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
2308 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
2309 ; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start
2310 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2312 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
2313 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
2314 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
2315 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
2316 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
2317 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
2318 ; GCN2-NEXT: v_not_b32_e32 v1, v0
2319 ; GCN2-NEXT: v_not_b32_e32 v0, v6
2320 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2321 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
2322 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2323 ; GCN2-NEXT: buffer_wbinvl1_vol
2324 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2325 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
2326 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
2327 ; GCN2-NEXT: s_cbranch_execnz .LBB47_1
2328 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2329 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
2330 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2332 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
2334 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2335 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2336 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2337 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
2338 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2339 ; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start
2340 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2341 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2342 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
2343 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
2344 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
2345 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
2346 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
2347 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
2348 ; GCN3-NEXT: v_not_b32_e32 v1, v0
2349 ; GCN3-NEXT: v_not_b32_e32 v0, v6
2350 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2351 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
2352 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2353 ; GCN3-NEXT: buffer_wbinvl1_vol
2354 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
2355 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2356 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2357 ; GCN3-NEXT: s_cbranch_execnz .LBB47_1
2358 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2359 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2360 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2361 %gep = getelementptr i64, ptr %out, i64 4
2362 %result = atomicrmw nand ptr %gep, i64 %in seq_cst
2366 ; ---------------------------------------------------------------------
2368 ; ---------------------------------------------------------------------
; i64 `atomicrmw or` (seq_cst) directly on the flat pointer %ptr. The function is
; declared void, so the fetched value is not returned.
; All three targets are expected to emit a single flat_atomic_or_x2 (address in
; v[0:1], operand in v[2:3], no glc), followed by an s_waitcnt and
; buffer_wbinvl1_vol.
2370 define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
2371 ; GCN1-LABEL: flat_atomic_or_i64_noret:
2373 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2374 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
2375 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2376 ; GCN1-NEXT: buffer_wbinvl1_vol
2377 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2379 ; GCN2-LABEL: flat_atomic_or_i64_noret:
2381 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2382 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
2383 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2384 ; GCN2-NEXT: buffer_wbinvl1_vol
2385 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2387 ; GCN3-LABEL: flat_atomic_or_i64_noret:
2389 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2390 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
2391 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2392 ; GCN3-NEXT: buffer_wbinvl1_vol
2393 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2394 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
; i64 `atomicrmw or` (seq_cst) on a flat pointer at %out + 4 x i64 (32 bytes).
; The function is declared void, so the fetched value is not returned.
; The expected code is a single flat_atomic_or_x2 without glc. The bonaire and
; tonga checks first add 32 to the address in v[0:1] with a v_add/v_addc pair;
; the gfx900 checks encode the displacement as `offset:32` on the atomic itself.
2398 define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
2399 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset:
2401 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2402 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2403 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2404 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2405 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
2406 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2407 ; GCN1-NEXT: buffer_wbinvl1_vol
2408 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2410 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset:
2412 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2413 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2414 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2415 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2416 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
2417 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2418 ; GCN2-NEXT: buffer_wbinvl1_vol
2419 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2421 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset:
2423 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
2425 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2426 ; GCN3-NEXT: buffer_wbinvl1_vol
2427 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2428 %gep = getelementptr i64, ptr %out, i64 4
2429 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
; i64 `atomicrmw or` (seq_cst) directly on the flat pointer %ptr, in a function
; whose return type is i64; the atomic's result is named %result.
; All three targets are expected to emit a single flat_atomic_or_x2 in its `glc`
; form with v[0:1] as both destination and address and v[2:3] as the operand,
; followed by an s_waitcnt and buffer_wbinvl1_vol.
2433 define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
2434 ; GCN1-LABEL: flat_atomic_or_i64_ret:
2436 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2437 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
2438 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2439 ; GCN1-NEXT: buffer_wbinvl1_vol
2440 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2442 ; GCN2-LABEL: flat_atomic_or_i64_ret:
2444 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2445 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
2446 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2447 ; GCN2-NEXT: buffer_wbinvl1_vol
2448 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2450 ; GCN3-LABEL: flat_atomic_or_i64_ret:
2452 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2453 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
2454 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2455 ; GCN3-NEXT: buffer_wbinvl1_vol
2456 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2457 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
; Returning variant of the offset test: seq_cst atomicrmw or on the i64 at
; element 4 of %out (byte offset 32), with the old value kept in %result.
; The GCN1 and GCN2 checks expect an explicit 64-bit address add before the
; glc atomic; the GCN3 checks expect offset:32 on the instruction instead.
2461 define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
2462 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset:
2464 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2465 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2466 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2467 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2468 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
2469 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2470 ; GCN1-NEXT: buffer_wbinvl1_vol
2471 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2473 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset:
2475 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2476 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2477 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2478 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2479 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
2480 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2481 ; GCN2-NEXT: buffer_wbinvl1_vol
2482 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2484 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset:
2486 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2487 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
2488 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2489 ; GCN3-NEXT: buffer_wbinvl1_vol
2490 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2491 %gep = getelementptr i64, ptr %out, i64 4
2492 %result = atomicrmw or ptr %gep, i64 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg. The checks expect the
; operand (s6, s7) and the pointer (s4, s5) to be copied into VGPRs with
; v_mov_b32 before the non-returning flat_atomic_or_x2; the old value %tmp0
; has no use in the lines shown.
2496 define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
2497 ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar:
2499 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2500 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2501 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2502 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2503 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
2504 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2505 ; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
2506 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2507 ; GCN1-NEXT: buffer_wbinvl1_vol
2508 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2510 ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar:
2512 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2513 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2514 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2515 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2516 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
2517 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2518 ; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
2519 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2520 ; GCN2-NEXT: buffer_wbinvl1_vol
2521 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2523 ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar:
2525 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2526 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2527 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2528 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2529 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2530 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2531 ; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
2532 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2533 ; GCN3-NEXT: buffer_wbinvl1_vol
2534 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2535 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
; amdgpu_gfx/inreg variant of the offset test (element 4 of %out, byte
; offset 32); the old value %tmp0 has no use in the lines shown. The GCN1
; and GCN2 checks expect the address to be formed on the scalar side
; (s_add_u32/s_addc_u32 into s34, s35) and then copied to VGPRs; the GCN3
; checks expect the unmodified pointer plus offset:32 on the instruction.
2539 define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2540 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar:
2542 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2543 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2544 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2545 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2546 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2547 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2548 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2549 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2550 ; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
2551 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2552 ; GCN1-NEXT: buffer_wbinvl1_vol
2553 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2555 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar:
2557 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2558 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2559 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2560 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2561 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2562 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2563 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2564 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2565 ; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
2566 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2567 ; GCN2-NEXT: buffer_wbinvl1_vol
2568 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2570 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar:
2572 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2573 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2574 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2575 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2576 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2577 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2578 ; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32
2579 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2580 ; GCN3-NEXT: buffer_wbinvl1_vol
2581 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2582 %gep = getelementptr i64, ptr %out, i64 4
2583 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
; amdgpu_gfx/inreg variant that keeps the old value in %result (presumably
; returned; the ret line is not visible in this excerpt). The checks expect
; the SGPR arguments to be copied into VGPRs and the returning (glc) form of
; flat_atomic_or_x2 to write its result to v[0:1].
2587 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
2588 ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar:
2590 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2591 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2592 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2593 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2594 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
2595 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2596 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
2597 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2598 ; GCN1-NEXT: buffer_wbinvl1_vol
2599 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2601 ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar:
2603 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2604 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2605 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2606 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2607 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
2608 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2609 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
2610 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2611 ; GCN2-NEXT: buffer_wbinvl1_vol
2612 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2614 ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar:
2616 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2617 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2618 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2619 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2620 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2621 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2622 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
2623 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2624 ; GCN3-NEXT: buffer_wbinvl1_vol
2625 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2626 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
; amdgpu_gfx/inreg, returning, offset variant: atomicrmw or on the i64 at
; element 4 of %out (byte offset 32), old value kept in %result. The GCN1
; and GCN2 checks expect a scalar 64-bit add (s_add_u32/s_addc_u32) to form
; the address; the GCN3 checks expect offset:32 together with glc on the
; instruction.
2630 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2631 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar:
2633 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2634 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2635 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2636 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2637 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2638 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2639 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2640 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2641 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
2642 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2643 ; GCN1-NEXT: buffer_wbinvl1_vol
2644 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2646 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar:
2648 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2649 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2650 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2651 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2652 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2653 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2654 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2655 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2656 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
2657 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2658 ; GCN2-NEXT: buffer_wbinvl1_vol
2659 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2661 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar:
2663 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2665 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2666 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2667 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2668 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2669 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
2670 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2671 ; GCN3-NEXT: buffer_wbinvl1_vol
2672 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2673 %gep = getelementptr i64, ptr %out, i64 4
2674 %result = atomicrmw or ptr %gep, i64 %in seq_cst
2678 ; ---------------------------------------------------------------------
2680 ; ---------------------------------------------------------------------
; seq_cst atomicrmw xor on the i64 at %ptr; the old value %tmp0 has no use
; in the lines shown. All three check prefixes expect the same sequence: the
; two-operand form of flat_atomic_xor_x2 without glc, followed by a wait and
; buffer_wbinvl1_vol.
2682 define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
2683 ; GCN1-LABEL: flat_atomic_xor_i64_noret:
2685 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2686 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
2687 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2688 ; GCN1-NEXT: buffer_wbinvl1_vol
2689 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2691 ; GCN2-LABEL: flat_atomic_xor_i64_noret:
2693 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2694 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
2695 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2696 ; GCN2-NEXT: buffer_wbinvl1_vol
2697 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2699 ; GCN3-LABEL: flat_atomic_xor_i64_noret:
2701 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
2703 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2704 ; GCN3-NEXT: buffer_wbinvl1_vol
2705 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2706 %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw xor on the i64 at element 4 of %out (byte offset 32);
; the old value %tmp0 has no use in the lines shown. The GCN1 and GCN2
; checks expect an explicit 64-bit address add (v_add + v_addc), while the
; GCN3 checks expect the offset folded into the instruction as offset:32.
2710 define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
2711 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset:
2713 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2714 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2715 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2716 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2717 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
2718 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2719 ; GCN1-NEXT: buffer_wbinvl1_vol
2720 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2722 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset:
2724 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2725 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2726 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2727 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2728 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
2729 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2730 ; GCN2-NEXT: buffer_wbinvl1_vol
2731 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2733 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset:
2735 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
2737 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2738 ; GCN3-NEXT: buffer_wbinvl1_vol
2739 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2740 %gep = getelementptr i64, ptr %out, i64 4
2741 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
; seq_cst atomicrmw xor on the i64 at %ptr, keeping the old value in %result
; (presumably returned; the ret line is not visible in this excerpt). All
; three check prefixes expect the returning form of flat_atomic_xor_x2 with
; glc set and the result written to v[0:1].
2745 define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
2746 ; GCN1-LABEL: flat_atomic_xor_i64_ret:
2748 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2749 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
2750 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2751 ; GCN1-NEXT: buffer_wbinvl1_vol
2752 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2754 ; GCN2-LABEL: flat_atomic_xor_i64_ret:
2756 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2757 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
2758 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2759 ; GCN2-NEXT: buffer_wbinvl1_vol
2760 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2762 ; GCN3-LABEL: flat_atomic_xor_i64_ret:
2764 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2765 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
2766 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2767 ; GCN3-NEXT: buffer_wbinvl1_vol
2768 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2769 %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
; Returning variant of the offset test: seq_cst atomicrmw xor on the i64 at
; element 4 of %out (byte offset 32), with the old value kept in %result.
; The GCN1 and GCN2 checks expect an explicit 64-bit address add before the
; glc atomic; the GCN3 checks expect offset:32 on the instruction instead.
2773 define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
2774 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset:
2776 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2777 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2778 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2779 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2780 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
2781 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2782 ; GCN1-NEXT: buffer_wbinvl1_vol
2783 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2785 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset:
2787 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2788 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2789 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2790 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2791 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
2792 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2793 ; GCN2-NEXT: buffer_wbinvl1_vol
2794 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2796 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset:
2798 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2799 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
2800 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2801 ; GCN3-NEXT: buffer_wbinvl1_vol
2802 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2803 %gep = getelementptr i64, ptr %out, i64 4
2804 %result = atomicrmw xor ptr %gep, i64 %in seq_cst
; amdgpu_gfx variant with both arguments marked inreg. The checks expect the
; operand (s6, s7) and the pointer (s4, s5) to be copied into VGPRs with
; v_mov_b32 before the non-returning flat_atomic_xor_x2; the old value %tmp0
; has no use in the lines shown.
2808 define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
2809 ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar:
2811 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2812 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2813 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2814 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2815 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
2816 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2817 ; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
2818 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2819 ; GCN1-NEXT: buffer_wbinvl1_vol
2820 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2822 ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar:
2824 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2825 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2826 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2827 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2828 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
2829 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2830 ; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
2831 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2832 ; GCN2-NEXT: buffer_wbinvl1_vol
2833 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2835 ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar:
2837 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2838 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2839 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2840 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2841 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2842 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2843 ; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
2844 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2845 ; GCN3-NEXT: buffer_wbinvl1_vol
2846 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2847 %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
; amdgpu_gfx/inreg variant of the offset test (element 4 of %out, byte
; offset 32); the old value %tmp0 has no use in the lines shown. The GCN1
; and GCN2 checks expect the address to be formed on the scalar side
; (s_add_u32/s_addc_u32 into s34, s35) and then copied to VGPRs; the GCN3
; checks expect the unmodified pointer plus offset:32 on the instruction.
2851 define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2852 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
2854 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2855 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2856 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2857 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2858 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2859 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2860 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2861 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2862 ; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
2863 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2864 ; GCN1-NEXT: buffer_wbinvl1_vol
2865 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2867 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
2869 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2871 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2872 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2873 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2874 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2875 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2876 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2877 ; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
2878 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2879 ; GCN2-NEXT: buffer_wbinvl1_vol
2880 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2882 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
2884 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2885 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2886 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2887 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2888 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2889 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2890 ; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32
2891 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2892 ; GCN3-NEXT: buffer_wbinvl1_vol
2893 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2894 %gep = getelementptr i64, ptr %out, i64 4
2895 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
; amdgpu_gfx/inreg variant that keeps the old value in %result (presumably
; returned; the ret line is not visible in this excerpt). The checks expect
; the SGPR arguments to be copied into VGPRs and the returning (glc) form of
; flat_atomic_xor_x2 to write its result to v[0:1].
2899 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
2900 ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar:
2902 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2903 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2904 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2905 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2906 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
2907 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2908 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
2909 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2910 ; GCN1-NEXT: buffer_wbinvl1_vol
2911 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2913 ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar:
2915 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2916 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2917 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2918 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2919 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
2920 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2921 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
2922 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2923 ; GCN2-NEXT: buffer_wbinvl1_vol
2924 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2926 ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar:
2928 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2929 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2930 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2931 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2932 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2933 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2934 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
2935 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2936 ; GCN3-NEXT: buffer_wbinvl1_vol
2937 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2938 %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
; amdgpu_gfx/inreg, returning, offset variant: atomicrmw xor on the i64 at
; element 4 of %out (byte offset 32), old value kept in %result. The GCN1
; and GCN2 checks expect a scalar 64-bit add (s_add_u32/s_addc_u32) to form
; the address; the GCN3 checks expect offset:32 together with glc on the
; instruction.
2942 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2943 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
2945 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2946 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2947 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2948 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2949 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2950 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2951 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
2952 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2953 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
2954 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2955 ; GCN1-NEXT: buffer_wbinvl1_vol
2956 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2958 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
2960 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2961 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2962 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2963 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2964 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2965 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2966 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
2967 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2968 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
2969 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2970 ; GCN2-NEXT: buffer_wbinvl1_vol
2971 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2973 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
2975 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2976 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2977 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2978 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2979 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
2980 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2981 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
2982 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2983 ; GCN3-NEXT: buffer_wbinvl1_vol
2984 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2985 %gep = getelementptr i64, ptr %out, i64 4
2986 %result = atomicrmw xor ptr %gep, i64 %in seq_cst
2990 ; ---------------------------------------------------------------------
2992 ; ---------------------------------------------------------------------
; seq_cst atomicrmw max on the i64 at %ptr; the old value %tmp0 has no use
; in the lines shown. Unlike the or/xor tests, the checks here expect no
; single max instruction but a compare-and-swap loop: load the current
; value, pick the larger of it and %in with a signed compare (v_cmp_gt_i64
; plus two v_cndmask), attempt flat_atomic_cmpswap_x2, and repeat until the
; value returned equals the value that was expected. The GCN1 and GCN2
; checks expect the initial load as two dword loads (the second at +4),
; while the GCN3 checks expect a single flat_load_dwordx2.
2994 define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
2995 ; GCN1-LABEL: flat_atomic_max_i64_noret:
2997 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2998 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
2999 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3000 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
3001 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
3002 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3003 ; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start
3004 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3005 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3006 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3007 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3008 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3009 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3010 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3011 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3012 ; GCN1-NEXT: buffer_wbinvl1_vol
3013 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3014 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
3015 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3016 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
3017 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3018 ; GCN1-NEXT: s_cbranch_execnz .LBB64_1
3019 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3020 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3021 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3023 ; GCN2-LABEL: flat_atomic_max_i64_noret:
3025 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3026 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
3027 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3028 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
3029 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
3030 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3031 ; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start
3032 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3033 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3034 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3035 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3036 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3037 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3038 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3039 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3040 ; GCN2-NEXT: buffer_wbinvl1_vol
3041 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3042 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
3043 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3044 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
3045 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3046 ; GCN2-NEXT: s_cbranch_execnz .LBB64_1
3047 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3048 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3049 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3051 ; GCN3-LABEL: flat_atomic_max_i64_noret:
3053 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3054 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
3055 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3056 ; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start
3057 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3058 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3059 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3060 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3061 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3062 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3063 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3064 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3065 ; GCN3-NEXT: buffer_wbinvl1_vol
3066 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3067 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
3068 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3069 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
3070 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3071 ; GCN3-NEXT: s_cbranch_execnz .LBB64_1
3072 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3073 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3074 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3075 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw max on the i64 at element 4 of %out (byte offset 32);
; the old value %tmp0 has no use in the lines shown. The checks expect a
; compare-and-swap loop (signed v_cmp_gt_i64 select, then
; flat_atomic_cmpswap_x2, repeated until the returned value matches the
; expected one). The GCN1 and GCN2 checks expect explicit address adds for
; +32 and +36 feeding two dword loads; the GCN3 checks expect offset:32 on
; both the flat_load_dwordx2 and the cmpswap.
3079 define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
3080 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset:
3082 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3083 ; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
3084 ; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
3085 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
3086 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3087 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
3088 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
3089 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3090 ; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start
3091 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3092 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3093 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3094 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3095 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3096 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3097 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
3098 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3099 ; GCN1-NEXT: buffer_wbinvl1_vol
3100 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
3101 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
3102 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3103 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
3104 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3105 ; GCN1-NEXT: s_cbranch_execnz .LBB65_1
3106 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3107 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3108 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3110 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset:
3112 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3113 ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
3114 ; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
3115 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
3116 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3117 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
3118 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
3119 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3120 ; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start
3121 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3122 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3123 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3124 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3125 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3126 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3127 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
3128 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3129 ; GCN2-NEXT: buffer_wbinvl1_vol
3130 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
3131 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
3132 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3133 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
3134 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3135 ; GCN2-NEXT: s_cbranch_execnz .LBB65_1
3136 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3137 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3138 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3140 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset:
3142 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3143 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
3144 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3145 ; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start
3146 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3147 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3148 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3149 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3150 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3151 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3152 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
3153 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3154 ; GCN3-NEXT: buffer_wbinvl1_vol
3155 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3156 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
3157 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3158 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
3159 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3160 ; GCN3-NEXT: s_cbranch_execnz .LBB65_1
3161 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3162 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3163 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3164 %gep = getelementptr i64, ptr %out, i64 4
3165 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; seq_cst atomicrmw max on the i64 at %ptr, keeping the old value in %result
; (presumably returned; the ret line is not visible in this excerpt). The
; checks expect a compare-and-swap loop: each iteration copies the last
; observed value into v[6:7], selects the larger of it and %in with a signed
; compare, and issues flat_atomic_cmpswap_x2 until the returned value equals
; the expected one. After the loop the checks expect v[4:5] to be copied
; into v[0:1]. The GCN1 and GCN2 checks expect two dword loads for the
; initial value; the GCN3 checks expect one flat_load_dwordx2.
3169 define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
3170 ; GCN1-LABEL: flat_atomic_max_i64_ret:
3172 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3173 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
3174 ; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
3175 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
3176 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
3177 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3178 ; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start
3179 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3180 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3181 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
3182 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
3183 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3184 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3185 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3186 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3187 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3188 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3189 ; GCN1-NEXT: buffer_wbinvl1_vol
3190 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3191 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3192 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3193 ; GCN1-NEXT: s_cbranch_execnz .LBB66_1
3194 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3195 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3196 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
3197 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
3198 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3200 ; GCN2-LABEL: flat_atomic_max_i64_ret:
3202 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3203 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
3204 ; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
3205 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
3206 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
3207 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3208 ; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start
3209 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3210 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3211 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
3212 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
3213 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3214 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3215 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3216 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3217 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3218 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3219 ; GCN2-NEXT: buffer_wbinvl1_vol
3220 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3221 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3222 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3223 ; GCN2-NEXT: s_cbranch_execnz .LBB66_1
3224 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3225 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3226 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
3227 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
3228 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3230 ; GCN3-LABEL: flat_atomic_max_i64_ret:
3232 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3233 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
3234 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3235 ; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start
3236 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3237 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3238 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
3239 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
3240 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3241 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3242 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3243 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3244 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
3245 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3246 ; GCN3-NEXT: buffer_wbinvl1_vol
3247 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3248 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3249 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3250 ; GCN3-NEXT: s_cbranch_execnz .LBB66_1
3251 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3252 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3253 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
3254 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
3255 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3256 %result = atomicrmw max ptr %ptr, i64 %in seq_cst
; Signed 64-bit `atomicrmw max` (seq_cst) through a flat pointer located
; 4 x i64 (32 bytes) past %out, in a function declared to return i64.
; Every prefix is expected to expand the operation into a compare-and-swap
; retry loop built around flat_atomic_cmpswap_x2. On the GCN1 and GCN2
; prefixes the 32-byte offset is added to the address with VALU adds and the
; initial value is read as two dword loads (+32 and +36); on the GCN3 prefix
; the offset is folded into the memory instructions as `offset:32`.
3260 define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
3261 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset:
3263 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3264 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
3265 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3266 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
3267 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3268 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
3269 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
3270 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3271 ; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start
3272 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3273 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3274 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
3275 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
3276 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
3277 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
3278 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
3279 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3280 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3281 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3282 ; GCN1-NEXT: buffer_wbinvl1_vol
3283 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3284 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3285 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3286 ; GCN1-NEXT: s_cbranch_execnz .LBB67_1
3287 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3288 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3289 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3291 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset:
3293 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3294 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
3295 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3296 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
3297 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3298 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
3299 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
3300 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3301 ; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start
3302 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3303 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3304 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
3305 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
3306 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
3307 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
3308 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
3309 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3310 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
3311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3312 ; GCN2-NEXT: buffer_wbinvl1_vol
3313 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
3314 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3315 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3316 ; GCN2-NEXT: s_cbranch_execnz .LBB67_1
3317 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3318 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3319 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3321 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset:
3323 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3324 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
3325 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3326 ; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start
3327 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3328 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3329 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
3330 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
3331 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
3332 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
3333 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3334 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3335 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
3336 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3337 ; GCN3-NEXT: buffer_wbinvl1_vol
3338 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
3339 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3340 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3341 ; GCN3-NEXT: s_cbranch_execnz .LBB67_1
3342 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3343 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3344 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
3345 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
3346 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3347 %gep = getelementptr i64, ptr %out, i64 4
3348 %result = atomicrmw max ptr %gep, i64 %in seq_cst
; Signed 64-bit `atomicrmw max` (seq_cst) directly on %ptr, using the
; amdgpu_gfx calling convention with both arguments marked `inreg`; the
; function is declared void. In the checks the pointer arrives in s[4:5] and
; the operand in s[6:7], and both are copied into VGPRs inside the expected
; compare-and-swap retry loop (flat_atomic_cmpswap_x2). The GCN1 and GCN2
; prefixes read the initial value as two dword loads (ptr and ptr+4); the
; GCN3 prefix uses a single flat_load_dwordx2.
3352 define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
3353 ; GCN1-LABEL: flat_atomic_max_i64_noret_scalar:
3355 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3356 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3357 ; GCN1-NEXT: s_add_u32 s34, s4, 4
3358 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3359 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3360 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
3361 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
3362 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
3363 ; GCN1-NEXT: flat_load_dword v3, v[3:4]
3364 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3365 ; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start
3366 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3367 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3368 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3369 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
3370 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
3371 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
3372 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
3373 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3374 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3375 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3376 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3377 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3378 ; GCN1-NEXT: buffer_wbinvl1_vol
3379 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3380 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3381 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3382 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3383 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3384 ; GCN1-NEXT: s_cbranch_execnz .LBB68_1
3385 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3386 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3387 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3389 ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar:
3391 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3392 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3393 ; GCN2-NEXT: s_add_u32 s34, s4, 4
3394 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3395 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3396 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
3397 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
3398 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
3399 ; GCN2-NEXT: flat_load_dword v3, v[3:4]
3400 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3401 ; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start
3402 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3403 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3404 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3405 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
3406 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
3407 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
3408 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
3409 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3410 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3411 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3412 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3413 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3414 ; GCN2-NEXT: buffer_wbinvl1_vol
3415 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3416 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3417 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3418 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3419 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3420 ; GCN2-NEXT: s_cbranch_execnz .LBB68_1
3421 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3422 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3423 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3425 ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar:
3427 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3428 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3429 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3430 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
3431 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3432 ; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start
3433 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3434 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3435 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3436 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
3437 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
3438 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
3439 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
3440 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3441 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3442 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3443 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3444 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3445 ; GCN3-NEXT: buffer_wbinvl1_vol
3446 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3447 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
3448 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3449 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
3450 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3451 ; GCN3-NEXT: s_cbranch_execnz .LBB68_1
3452 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3453 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3454 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3455 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; Signed 64-bit `atomicrmw max` (seq_cst) at 4 x i64 (32 bytes) past %out,
; using the amdgpu_gfx calling convention with `inreg` arguments; the
; function is declared void. The checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) on every prefix. On the GCN1 and GCN2 prefixes the
; +32 / +36 addresses are formed with scalar adds before the two initial
; dword loads; on the GCN3 prefix the offset is encoded as `offset:32` on
; both the initial load and the cmpswap.
3459 define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
3460 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset_scalar:
3462 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3463 ; GCN1-NEXT: s_add_u32 s34, s4, 32
3464 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3465 ; GCN1-NEXT: s_add_u32 s36, s4, 36
3466 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
3467 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
3468 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
3469 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
3470 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
3471 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
3472 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
3473 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
3474 ; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start
3475 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3476 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3477 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3478 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
3479 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
3480 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
3481 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
3482 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3483 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3484 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3485 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3486 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3487 ; GCN1-NEXT: buffer_wbinvl1_vol
3488 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3489 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3490 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3491 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3492 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
3493 ; GCN1-NEXT: s_cbranch_execnz .LBB69_1
3494 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3495 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
3496 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3498 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar:
3500 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3501 ; GCN2-NEXT: s_add_u32 s34, s4, 32
3502 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3503 ; GCN2-NEXT: s_add_u32 s36, s4, 36
3504 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
3505 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
3506 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
3507 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
3508 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
3509 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
3510 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
3511 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
3512 ; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start
3513 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3514 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3515 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3516 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
3517 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
3518 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
3519 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
3520 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3521 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3522 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3523 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3524 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3525 ; GCN2-NEXT: buffer_wbinvl1_vol
3526 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3527 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3528 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3529 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3530 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
3531 ; GCN2-NEXT: s_cbranch_execnz .LBB69_1
3532 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3533 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
3534 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3536 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar:
3538 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3539 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3540 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3541 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
3542 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3543 ; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start
3544 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3545 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3546 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3547 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
3548 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
3549 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
3550 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
3551 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3552 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3553 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3554 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
3555 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3556 ; GCN3-NEXT: buffer_wbinvl1_vol
3557 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3558 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
3559 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3560 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
3561 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3562 ; GCN3-NEXT: s_cbranch_execnz .LBB69_1
3563 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3564 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3565 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3566 %gep = getelementptr i64, ptr %out, i64 4
3567 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; Signed 64-bit `atomicrmw max` (seq_cst) directly on %ptr, using the
; amdgpu_gfx calling convention with `inreg` arguments, in a function
; declared to return i64. The checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) whose result register pair v[0:1] is still live
; at the s_setpc_b64 return on every prefix. The GCN1 and GCN2 prefixes
; read the initial value as two dword loads (ptr and ptr+4); the GCN3
; prefix uses a single flat_load_dwordx2.
3571 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
3572 ; GCN1-LABEL: flat_atomic_max_i64_ret_scalar:
3574 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3575 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3576 ; GCN1-NEXT: s_add_u32 s34, s4, 4
3577 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3578 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3579 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
3580 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
3581 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
3582 ; GCN1-NEXT: flat_load_dword v1, v[2:3]
3583 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3584 ; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start
3585 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3586 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3587 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3588 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3589 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3590 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
3591 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
3592 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
3593 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
3594 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3595 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3596 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3597 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3598 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3599 ; GCN1-NEXT: buffer_wbinvl1_vol
3600 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3601 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3602 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3603 ; GCN1-NEXT: s_cbranch_execnz .LBB70_1
3604 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3605 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3606 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3608 ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar:
3610 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3611 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3612 ; GCN2-NEXT: s_add_u32 s34, s4, 4
3613 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3614 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3615 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
3616 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
3617 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
3618 ; GCN2-NEXT: flat_load_dword v1, v[2:3]
3619 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3620 ; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start
3621 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3622 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3623 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3624 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3625 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3626 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
3627 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
3628 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
3629 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
3630 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3631 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3632 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3633 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3634 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3635 ; GCN2-NEXT: buffer_wbinvl1_vol
3636 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3637 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3638 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3639 ; GCN2-NEXT: s_cbranch_execnz .LBB70_1
3640 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3641 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3642 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3644 ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar:
3646 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3647 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3648 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3649 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
3650 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3651 ; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start
3652 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3653 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3654 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
3655 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
3656 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3657 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
3658 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
3659 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
3660 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
3661 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3662 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3663 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3664 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3665 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3666 ; GCN3-NEXT: buffer_wbinvl1_vol
3667 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3668 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3669 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3670 ; GCN3-NEXT: s_cbranch_execnz .LBB70_1
3671 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3672 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3673 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3674 %result = atomicrmw max ptr %ptr, i64 %in seq_cst
; Signed 64-bit `atomicrmw max` (seq_cst) at 4 x i64 (32 bytes) past %out,
; using the amdgpu_gfx calling convention with `inreg` arguments, in a
; function declared to return i64. The checks expect a compare-and-swap
; retry loop (flat_atomic_cmpswap_x2) on every prefix. On the GCN1 and GCN2
; prefixes the +32 / +36 addresses are formed with scalar adds before the
; two initial dword loads; on the GCN3 prefix the offset is encoded as
; `offset:32` on both the initial load and the cmpswap.
3678 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
3679 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset_scalar:
3681 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3682 ; GCN1-NEXT: s_add_u32 s34, s4, 32
3683 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3684 ; GCN1-NEXT: s_add_u32 s36, s4, 36
3685 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
3686 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
3687 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
3688 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
3689 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
3690 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
3691 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
3692 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
3693 ; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start
3694 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3695 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3696 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3697 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3698 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3699 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
3700 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
3701 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
3702 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
3703 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3704 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3705 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3706 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3707 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3708 ; GCN1-NEXT: buffer_wbinvl1_vol
3709 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3710 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3711 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
3712 ; GCN1-NEXT: s_cbranch_execnz .LBB71_1
3713 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3714 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
3715 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3717 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar:
3719 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3720 ; GCN2-NEXT: s_add_u32 s34, s4, 32
3721 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3722 ; GCN2-NEXT: s_add_u32 s36, s4, 36
3723 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
3724 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
3725 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
3726 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
3727 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
3728 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
3729 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
3730 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
3731 ; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start
3732 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3733 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3734 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3735 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3736 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3737 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
3738 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
3739 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
3740 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
3741 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3742 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3743 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3744 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3745 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3746 ; GCN2-NEXT: buffer_wbinvl1_vol
3747 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3748 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
3749 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
3750 ; GCN2-NEXT: s_cbranch_execnz .LBB71_1
3751 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3752 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
3753 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3755 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar:
3757 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3758 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3759 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3760 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
3761 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3762 ; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start
3763 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3764 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3765 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
3766 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
3767 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3768 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
3769 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
3770 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
3771 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
3772 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3773 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3774 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3775 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
3776 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3777 ; GCN3-NEXT: buffer_wbinvl1_vol
3778 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3779 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3780 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3781 ; GCN3-NEXT: s_cbranch_execnz .LBB71_1
3782 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3783 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3784 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3785 %gep = getelementptr i64, ptr %out, i64 4
3786 %result = atomicrmw max ptr %gep, i64 %in seq_cst
; Kernel variant: signed 64-bit `atomicrmw max` (seq_cst) at address
; %out + %index * 8 + 32 bytes (two chained i64 GEPs: %index, then 4).
; The checks expect the kernel arguments to be fetched with s_load_*, the
; index scaled with s_lshl_b64 by 3, and the atomic expanded into a
; compare-and-swap retry loop (flat_atomic_cmpswap_x2) ending in s_endpgm.
; On the GCN1 and GCN2 prefixes the constant 32 is added with scalar adds;
; on the GCN3 prefix it is folded into the memory instructions as
; `offset:32`.
3790 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
3791 ; GCN1-LABEL: atomic_max_i64_addr64_offset:
3792 ; GCN1: ; %bb.0: ; %entry
3793 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
3794 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3795 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3796 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
3797 ; GCN1-NEXT: s_add_u32 s0, s0, s4
3798 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
3799 ; GCN1-NEXT: s_add_u32 s0, s0, 32
3800 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3801 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3802 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3803 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
3804 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3805 ; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start
3806 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3807 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3808 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
3809 ; GCN1-NEXT: v_mov_b32_e32 v0, s3
3810 ; GCN1-NEXT: v_mov_b32_e32 v6, s2
3811 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
3812 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
3813 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3814 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3815 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3816 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3817 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3818 ; GCN1-NEXT: buffer_wbinvl1_vol
3819 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3820 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3821 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3822 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3823 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3824 ; GCN1-NEXT: s_cbranch_execnz .LBB72_1
3825 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3826 ; GCN1-NEXT: s_endpgm
3828 ; GCN2-LABEL: atomic_max_i64_addr64_offset:
3829 ; GCN2: ; %bb.0: ; %entry
3830 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
3831 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3832 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3833 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
3834 ; GCN2-NEXT: s_add_u32 s0, s0, s4
3835 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
3836 ; GCN2-NEXT: s_add_u32 s0, s0, 32
3837 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3838 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3839 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3840 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
3841 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3842 ; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start
3843 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3844 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3845 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
3846 ; GCN2-NEXT: v_mov_b32_e32 v0, s3
3847 ; GCN2-NEXT: v_mov_b32_e32 v6, s2
3848 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
3849 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
3850 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3851 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3852 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3853 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3854 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3855 ; GCN2-NEXT: buffer_wbinvl1_vol
3856 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3857 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3858 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3859 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3860 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3861 ; GCN2-NEXT: s_cbranch_execnz .LBB72_1
3862 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3863 ; GCN2-NEXT: s_endpgm
3865 ; GCN3-LABEL: atomic_max_i64_addr64_offset:
3866 ; GCN3: ; %bb.0: ; %entry
3867 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3868 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3869 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3870 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
3871 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3872 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3873 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3874 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3875 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
3876 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
3877 ; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start
3878 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3879 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3880 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
3881 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
3882 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
3883 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
3884 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
3885 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3886 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3887 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3888 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
3889 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3890 ; GCN3-NEXT: buffer_wbinvl1_vol
3891 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3892 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
3893 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3894 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
3895 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
3896 ; GCN3-NEXT: s_cbranch_execnz .LBB72_1
3897 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3898 ; GCN3-NEXT: s_endpgm
3900 %ptr = getelementptr i64, ptr %out, i64 %index
3901 %gep = getelementptr i64, ptr %ptr, i64 4
3902 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; Kernel variant that consumes the result: signed 64-bit `atomicrmw max`
; (seq_cst) at address %out + %index * 8 + 32 bytes, with the value yielded
; by the atomic then stored to %out2. The checks expect all kernel
; arguments to come from one s_load_dwordx8, a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2), and a final flat_store_dwordx2 of v[0:1] to the
; address held in s[2:3]. On the GCN1 and GCN2 prefixes the constant 32 is
; added with scalar adds; on the GCN3 prefix it is folded into the memory
; instructions as `offset:32`.
3906 define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
3907 ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
3908 ; GCN1: ; %bb.0: ; %entry
3909 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
3910 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3911 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
3912 ; GCN1-NEXT: s_add_u32 s0, s0, s6
3913 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
3914 ; GCN1-NEXT: s_add_u32 s0, s0, 32
3915 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3916 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3917 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3918 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
3919 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
3920 ; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start
3921 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3922 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3923 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
3924 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
3925 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
3926 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
3927 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
3928 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
3929 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
3930 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3931 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3932 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3933 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3934 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3935 ; GCN1-NEXT: buffer_wbinvl1_vol
3936 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3937 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
3938 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
3939 ; GCN1-NEXT: s_cbranch_execnz .LBB73_1
3940 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3941 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
3942 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
3943 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
3944 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
3945 ; GCN1-NEXT: s_endpgm
3947 ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
3948 ; GCN2: ; %bb.0: ; %entry
3949 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3950 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3951 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
3952 ; GCN2-NEXT: s_add_u32 s0, s0, s6
3953 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
3954 ; GCN2-NEXT: s_add_u32 s0, s0, 32
3955 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3956 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3957 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3958 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
3959 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
3960 ; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start
3961 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3962 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3963 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
3964 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
3965 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
3966 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
3967 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
3968 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
3969 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
3970 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
3971 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
3972 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3973 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
3974 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3975 ; GCN2-NEXT: buffer_wbinvl1_vol
3976 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
3977 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
3978 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
3979 ; GCN2-NEXT: s_cbranch_execnz .LBB73_1
3980 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3981 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
3982 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
3983 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
3984 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
3985 ; GCN2-NEXT: s_endpgm
3987 ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
3988 ; GCN3: ; %bb.0: ; %entry
3989 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3990 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3991 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
3992 ; GCN3-NEXT: s_add_u32 s0, s0, s6
3993 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
3994 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3995 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3996 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
3997 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
3998 ; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start
3999 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4000 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4001 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4002 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4003 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
4004 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
4005 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
4006 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
4007 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
4008 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4009 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4010 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4011 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
4012 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4013 ; GCN3-NEXT: buffer_wbinvl1_vol
4014 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4015 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4016 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
4017 ; GCN3-NEXT: s_cbranch_execnz .LBB73_1
4018 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4019 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
4020 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4021 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4022 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4023 ; GCN3-NEXT: s_endpgm
4025 %ptr = getelementptr i64, ptr %out, i64 %index
4026 %gep = getelementptr i64, ptr %ptr, i64 4
4027 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
4028 store i64 %tmp0, ptr %out2
; Kernel test: seq_cst `atomicrmw max` (signed) on the i64 element
; %out[%index]; the old value (%tmp0) is not used by any line shown here.
; The GCN1 (bonaire), GCN2 (tonga) and GCN3 (gfx900) assertions below are
; autogenerated by utils/update_llc_test_checks.py -- regenerate them instead
; of editing by hand. On all three targets they expect an initial
; flat_load_dwordx2 followed by a flat_atomic_cmpswap_x2 retry loop
; (.LBB74_1) that is left once the cmpswap result equals the value it was
; compared against (v_cmp_eq_u64 feeding the exec mask update).
4032 define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
4033 ; GCN1-LABEL: atomic_max_i64_addr64:
4034 ; GCN1: ; %bb.0: ; %entry
4035 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4036 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4037 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4038 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4039 ; GCN1-NEXT: s_add_u32 s0, s0, s4
4040 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
4041 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4042 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4043 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4044 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4045 ; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start
4046 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4047 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4048 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4049 ; GCN1-NEXT: v_mov_b32_e32 v0, s3
4050 ; GCN1-NEXT: v_mov_b32_e32 v6, s2
4051 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
4052 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
4053 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4054 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4055 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4056 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4057 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4058 ; GCN1-NEXT: buffer_wbinvl1_vol
4059 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4060 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4061 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4062 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4063 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4064 ; GCN1-NEXT: s_cbranch_execnz .LBB74_1
4065 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4066 ; GCN1-NEXT: s_endpgm
4068 ; GCN2-LABEL: atomic_max_i64_addr64:
4069 ; GCN2: ; %bb.0: ; %entry
4070 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
4071 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4072 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4073 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4074 ; GCN2-NEXT: s_add_u32 s0, s0, s4
4075 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
4076 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4077 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4078 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4079 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4080 ; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start
4081 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4082 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4083 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
4084 ; GCN2-NEXT: v_mov_b32_e32 v0, s3
4085 ; GCN2-NEXT: v_mov_b32_e32 v6, s2
4086 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
4087 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
4088 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4089 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4090 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4091 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4092 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4093 ; GCN2-NEXT: buffer_wbinvl1_vol
4094 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4095 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
4096 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4097 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
4098 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4099 ; GCN2-NEXT: s_cbranch_execnz .LBB74_1
4100 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4101 ; GCN2-NEXT: s_endpgm
4103 ; GCN3-LABEL: atomic_max_i64_addr64:
4104 ; GCN3: ; %bb.0: ; %entry
4105 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4106 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4107 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4108 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
4109 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4110 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4111 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4112 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4113 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4114 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
4115 ; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start
4116 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4117 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4118 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
4119 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
4120 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
4121 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
4122 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
4123 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4124 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4125 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4126 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4127 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4128 ; GCN3-NEXT: buffer_wbinvl1_vol
4129 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4130 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4131 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4132 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4133 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
4134 ; GCN3-NEXT: s_cbranch_execnz .LBB74_1
4135 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4136 ; GCN3-NEXT: s_endpgm
; Address of element %index in an i64 array at %out; this is the
; s_lshl_b64 ..., 3 plus 64-bit add expected in the assertions above.
4138 %ptr = getelementptr i64, ptr %out, i64 %index
; Signed max with sequentially consistent ordering, no syncscope given.
4139 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; Kernel test: seq_cst `atomicrmw max` (signed) on the i64 element
; %out[%index], with the old value stored to %out2.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit. They expect
; the same flat_atomic_cmpswap_x2 retry loop on every target (.LBB75_1), then
; a restore of exec (s_or_b64) and a flat_store_dwordx2 of the loop result
; v[0:1] to the address held in s[2:3].
4143 define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
4144 ; GCN1-LABEL: atomic_max_i64_ret_addr64:
4145 ; GCN1: ; %bb.0: ; %entry
4146 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4147 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4148 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
4149 ; GCN1-NEXT: s_add_u32 s0, s0, s6
4150 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
4151 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4152 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4153 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4154 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
4155 ; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start
4156 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4157 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4158 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4159 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4160 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
4161 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
4162 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
4163 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
4164 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
4165 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4166 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4167 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4168 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4169 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4170 ; GCN1-NEXT: buffer_wbinvl1_vol
4171 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4172 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4173 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
4174 ; GCN1-NEXT: s_cbranch_execnz .LBB75_1
4175 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4176 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
4177 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
4178 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
4179 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4180 ; GCN1-NEXT: s_endpgm
4182 ; GCN2-LABEL: atomic_max_i64_ret_addr64:
4183 ; GCN2: ; %bb.0: ; %entry
4184 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4185 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4186 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
4187 ; GCN2-NEXT: s_add_u32 s0, s0, s6
4188 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
4189 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4190 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4191 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4192 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
4193 ; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start
4194 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4195 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4196 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
4197 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
4198 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
4199 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
4200 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
4201 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
4202 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
4203 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4204 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4205 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4206 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4207 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4208 ; GCN2-NEXT: buffer_wbinvl1_vol
4209 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4210 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4211 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
4212 ; GCN2-NEXT: s_cbranch_execnz .LBB75_1
4213 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4214 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
4215 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
4216 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
4217 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4218 ; GCN2-NEXT: s_endpgm
4220 ; GCN3-LABEL: atomic_max_i64_ret_addr64:
4221 ; GCN3: ; %bb.0: ; %entry
4222 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4223 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4224 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
4225 ; GCN3-NEXT: s_add_u32 s0, s0, s6
4226 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
4227 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4228 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4229 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4230 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
4231 ; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start
4232 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4233 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4234 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4235 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4236 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3]
4237 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
4238 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
4239 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
4240 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
4241 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4242 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4243 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4244 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4245 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4246 ; GCN3-NEXT: buffer_wbinvl1_vol
4247 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4248 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4249 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
4250 ; GCN3-NEXT: s_cbranch_execnz .LBB75_1
4251 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4252 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
4253 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4254 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4255 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
4256 ; GCN3-NEXT: s_endpgm
; Address of element %index in an i64 array at %out.
4258 %ptr = getelementptr i64, ptr %out, i64 %index
; Signed max, seq_cst; %tmp0 receives the value that was in memory before.
4259 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; Using the result is what distinguishes this test from the no-return variant.
4260 store i64 %tmp0, ptr %out2
4264 ; ---------------------------------------------------------------------
4266 ; ---------------------------------------------------------------------
; Function test: seq_cst `atomicrmw umax` (unsigned) on an i64 directly at
; %ptr; the old value (%tmp0) is not used by any line shown here.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
; Expected shape: a flat_atomic_cmpswap_x2 retry loop (.LBB76_1) on every
; target. GCN1 and GCN2 seed the loop with two flat_load_dword (at %ptr and
; %ptr + 4), while GCN3 uses a single flat_load_dwordx2.
4268 define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
4269 ; GCN1-LABEL: flat_atomic_umax_i64_noret:
4271 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4272 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
4273 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4274 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
4275 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
4276 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4277 ; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start
4278 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4279 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4280 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4281 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4282 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4283 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4284 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4285 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4286 ; GCN1-NEXT: buffer_wbinvl1_vol
4287 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4288 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
4289 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4290 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
4291 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4292 ; GCN1-NEXT: s_cbranch_execnz .LBB76_1
4293 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4294 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4295 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4297 ; GCN2-LABEL: flat_atomic_umax_i64_noret:
4299 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4300 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
4301 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4302 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
4303 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
4304 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4305 ; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start
4306 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4307 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4308 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4309 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4310 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4312 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4313 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4314 ; GCN2-NEXT: buffer_wbinvl1_vol
4315 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4316 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
4317 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4318 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
4319 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4320 ; GCN2-NEXT: s_cbranch_execnz .LBB76_1
4321 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4322 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4323 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4325 ; GCN3-LABEL: flat_atomic_umax_i64_noret:
4327 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4328 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
4329 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4330 ; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start
4331 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4332 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4333 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4334 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4335 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4336 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4337 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4338 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4339 ; GCN3-NEXT: buffer_wbinvl1_vol
4340 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4341 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
4342 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4343 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
4344 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4345 ; GCN3-NEXT: s_cbranch_execnz .LBB76_1
4346 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4347 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4348 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Unsigned max, seq_cst, applied to %ptr with no offset.
4349 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Function test: seq_cst `atomicrmw umax` (unsigned) on the i64 located four
; elements (32 bytes) past %out; the old value (%tmp0) is not used by any
; line shown here.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
; Expected shape: GCN1 and GCN2 materialise the addresses %out + 32 and
; %out + 36 with explicit 64-bit adds, whereas GCN3 keeps %out in v[0:1] and
; uses `offset:32` on both the initial load and the cmpswap. All targets use
; a flat_atomic_cmpswap_x2 retry loop (.LBB77_1).
4353 define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
4354 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset:
4356 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4357 ; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
4358 ; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
4359 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
4360 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4361 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
4362 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
4363 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4364 ; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start
4365 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4366 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4367 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4368 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4369 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4370 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4371 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
4372 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4373 ; GCN1-NEXT: buffer_wbinvl1_vol
4374 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
4375 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
4376 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4377 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
4378 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4379 ; GCN1-NEXT: s_cbranch_execnz .LBB77_1
4380 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4381 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4382 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4384 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset:
4386 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4387 ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
4388 ; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
4389 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
4390 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4391 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
4392 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
4393 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4394 ; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start
4395 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4396 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4397 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4398 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4399 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4400 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4401 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
4402 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4403 ; GCN2-NEXT: buffer_wbinvl1_vol
4404 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
4405 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
4406 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4407 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
4408 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4409 ; GCN2-NEXT: s_cbranch_execnz .LBB77_1
4410 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4411 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4412 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4414 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset:
4416 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4417 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
4418 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4419 ; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start
4420 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4421 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4422 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4423 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4424 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4425 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4426 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
4427 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4428 ; GCN3-NEXT: buffer_wbinvl1_vol
4429 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4430 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
4431 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4432 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
4433 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4434 ; GCN3-NEXT: s_cbranch_execnz .LBB77_1
4435 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4436 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4437 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Constant index 4 on an i64 GEP: the 32-byte offset seen in the assertions.
4438 %gep = getelementptr i64, ptr %out, i64 4
; Unsigned max, seq_cst, applied at the offset address.
4439 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Function test: seq_cst `atomicrmw umax` (unsigned) on an i64 directly at
; %ptr, in a function declared to return i64.
; NOTE(review): the `ret` is not among the lines shown here; presumably
; %result is the returned value -- confirm against the full file.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
; Expected shape: a flat_atomic_cmpswap_x2 retry loop (.LBB78_1) whose result
; in v[4:5] is copied into v[0:1] just before s_setpc_b64 on every target.
4443 define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
4444 ; GCN1-LABEL: flat_atomic_umax_i64_ret:
4446 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4447 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
4448 ; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
4449 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
4450 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
4451 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4452 ; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start
4453 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4454 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4455 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
4456 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
4457 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4458 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4459 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4460 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4461 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4462 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4463 ; GCN1-NEXT: buffer_wbinvl1_vol
4464 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4465 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4466 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4467 ; GCN1-NEXT: s_cbranch_execnz .LBB78_1
4468 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4469 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4470 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
4471 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
4472 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4474 ; GCN2-LABEL: flat_atomic_umax_i64_ret:
4476 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4477 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
4478 ; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
4479 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
4480 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
4481 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4482 ; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start
4483 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4484 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4485 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
4486 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
4487 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4488 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4489 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4490 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4491 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4492 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4493 ; GCN2-NEXT: buffer_wbinvl1_vol
4494 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4495 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4496 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4497 ; GCN2-NEXT: s_cbranch_execnz .LBB78_1
4498 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4499 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4500 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
4501 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
4502 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4504 ; GCN3-LABEL: flat_atomic_umax_i64_ret:
4506 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4507 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
4508 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4509 ; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start
4510 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4511 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4512 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
4513 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
4514 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4515 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4516 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4517 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4518 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
4519 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4520 ; GCN3-NEXT: buffer_wbinvl1_vol
4521 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4522 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4523 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4524 ; GCN3-NEXT: s_cbranch_execnz .LBB78_1
4525 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4526 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4527 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
4528 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
4529 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Unsigned max, seq_cst; %result receives the value that was in memory before.
4530 %result = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Function test: seq_cst `atomicrmw umax` (unsigned) on the i64 located four
; elements (32 bytes) past %out, in a function declared to return i64.
; NOTE(review): the `ret` is not among the lines shown here; presumably
; %result is the returned value -- confirm against the full file.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
; Expected shape: GCN1 and GCN2 compute %out + 32 / %out + 36 explicitly and
; let the cmpswap write its result straight into v[0:1], so no copy precedes
; s_setpc_b64. GCN3 uses `offset:32` on the flat instructions and copies the
; result from v[4:5] into v[0:1] after the loop (.LBB79_1).
4534 define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
4535 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset:
4537 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4538 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
4539 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4540 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
4541 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4542 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
4543 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
4544 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4545 ; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start
4546 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4547 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4548 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
4549 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
4550 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
4551 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
4552 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
4553 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4554 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4555 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4556 ; GCN1-NEXT: buffer_wbinvl1_vol
4557 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4558 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4559 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4560 ; GCN1-NEXT: s_cbranch_execnz .LBB79_1
4561 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4562 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4563 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4565 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset:
4567 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4568 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
4569 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4570 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
4571 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4572 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
4573 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
4574 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4575 ; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start
4576 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4577 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4578 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
4579 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
4580 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
4581 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
4582 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
4583 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4584 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
4585 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4586 ; GCN2-NEXT: buffer_wbinvl1_vol
4587 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
4588 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4589 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4590 ; GCN2-NEXT: s_cbranch_execnz .LBB79_1
4591 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4592 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4593 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4595 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset:
4597 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4598 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
4599 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4600 ; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start
4601 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4602 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4603 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
4604 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
4605 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
4606 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
4607 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
4608 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4609 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
4610 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4611 ; GCN3-NEXT: buffer_wbinvl1_vol
4612 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
4613 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4614 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4615 ; GCN3-NEXT: s_cbranch_execnz .LBB79_1
4616 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4617 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4618 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
4619 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
4620 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Constant index 4 on an i64 GEP: the 32-byte offset seen in the assertions.
4621 %gep = getelementptr i64, ptr %out, i64 4
; Unsigned max, seq_cst; %result receives the value that was in memory before.
4622 %result = atomicrmw umax ptr %gep, i64 %in seq_cst
; amdgpu_gfx function test: seq_cst `atomicrmw umax` (unsigned) on an i64 at
; %ptr where both the pointer and the operand are `inreg` arguments; the old
; value (%tmp0) is not used by any line shown here.
; The GCN1, GCN2 and GCN3 assertions are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
; Expected shape: the address is taken from s[4:5] and the operand from
; s[6:7], each copied into VGPRs with v_mov_b32 before use. GCN1 and GCN2
; load the initial value with two flat_load_dword (the second address is
; s[4:5] + 4, built in s[34:35]); GCN3 uses one flat_load_dwordx2. All
; targets then run a flat_atomic_cmpswap_x2 retry loop (.LBB80_1) with the
; loop-exit mask accumulated in s[34:35].
4626 define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
4627 ; GCN1-LABEL: flat_atomic_umax_i64_noret_scalar:
4629 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4630 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4631 ; GCN1-NEXT: s_add_u32 s34, s4, 4
4632 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4633 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4634 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
4635 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
4636 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
4637 ; GCN1-NEXT: flat_load_dword v3, v[3:4]
4638 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4639 ; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start
4640 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4641 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4642 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4643 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
4644 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
4645 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
4646 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
4647 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4648 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4649 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4650 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4651 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4652 ; GCN1-NEXT: buffer_wbinvl1_vol
4653 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4654 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4655 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4656 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4657 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4658 ; GCN1-NEXT: s_cbranch_execnz .LBB80_1
4659 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4660 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4661 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4663 ; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar:
4665 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4666 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4667 ; GCN2-NEXT: s_add_u32 s34, s4, 4
4668 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4669 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4670 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
4671 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
4672 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
4673 ; GCN2-NEXT: flat_load_dword v3, v[3:4]
4674 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4675 ; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start
4676 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4677 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4678 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4679 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
4680 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
4681 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
4682 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
4683 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4684 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4685 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4686 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4687 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4688 ; GCN2-NEXT: buffer_wbinvl1_vol
4689 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4690 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
4691 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4692 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
4693 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4694 ; GCN2-NEXT: s_cbranch_execnz .LBB80_1
4695 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4696 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4697 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4699 ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar:
4701 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4702 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4703 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4704 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
4705 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4706 ; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start
4707 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4708 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4709 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4710 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
4711 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
4712 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
4713 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
4714 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4715 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4716 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4717 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4718 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4719 ; GCN3-NEXT: buffer_wbinvl1_vol
4720 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4721 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4722 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4723 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4724 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4725 ; GCN3-NEXT: s_cbranch_execnz .LBB80_1
4726 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4727 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4728 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Unsigned max, seq_cst, applied to the inreg pointer with no offset.
4729 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 in the flat address space, result
; unused, with the pointer and the operand both passed `inreg` under the
; amdgpu_gfx calling convention. The address is %out advanced by 4 i64
; elements (32 bytes).
;
; The expected code for all three RUN configurations is a compare-and-swap
; loop (flat_atomic_cmpswap_x2 inside %atomicrmw.start). The bonaire and tonga
; runs add the 32-byte offset with scalar adds and load the initial value as
; two dwords (low half at +32, high half at +36); the gfx900 run keeps the
; base pointer and uses `offset:32` on a single dwordx2 load and on the
; cmpswap.
4733 define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
4734 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
4736 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4737 ; GCN1-NEXT: s_add_u32 s34, s4, 32
4738 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4739 ; GCN1-NEXT: s_add_u32 s36, s4, 36
4740 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
4741 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
4742 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
4743 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
4744 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
4745 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4746 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
4747 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
4748 ; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start
4749 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4750 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4751 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4752 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
4753 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
4754 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
4755 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
4756 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4757 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4758 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4759 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4760 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4761 ; GCN1-NEXT: buffer_wbinvl1_vol
4762 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4763 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4764 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4765 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4766 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
4767 ; GCN1-NEXT: s_cbranch_execnz .LBB81_1
4768 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4769 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
4770 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4772 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
4774 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4775 ; GCN2-NEXT: s_add_u32 s34, s4, 32
4776 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4777 ; GCN2-NEXT: s_add_u32 s36, s4, 36
4778 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
4779 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
4780 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
4781 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
4782 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
4783 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
4784 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
4785 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
4786 ; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start
4787 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4788 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4789 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4790 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
4791 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
4792 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
4793 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
4794 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4795 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4796 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4797 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4798 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4799 ; GCN2-NEXT: buffer_wbinvl1_vol
4800 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4801 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
4802 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4803 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
4804 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
4805 ; GCN2-NEXT: s_cbranch_execnz .LBB81_1
4806 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4807 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
4808 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4810 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
4812 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4813 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4814 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4815 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
4816 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4817 ; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start
4818 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4819 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4820 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4821 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
4822 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
4823 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
4824 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
4825 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4826 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4827 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4828 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
4829 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4830 ; GCN3-NEXT: buffer_wbinvl1_vol
4831 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4832 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4833 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4834 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4835 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4836 ; GCN3-NEXT: s_cbranch_execnz .LBB81_1
4837 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4838 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4839 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 on an i64 gep is the 32-byte offset seen above; the
; atomicrmw result (%tmp0) is intentionally left unused.
4840 %gep = getelementptr i64, ptr %out, i64 4
4841 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 in the flat address space, directly
; on %ptr (no offset), in a function declared to return i64. The pointer and
; the operand are both passed `inreg` under the amdgpu_gfx calling convention.
;
; The expected code is a compare-and-swap loop in every configuration. Unlike
; the void variants, each iteration first copies the previous value from
; v[0:1] into v[2:3], so the cmpswap result is left in v[0:1] when the
; function returns through s_setpc_b64. The bonaire and tonga runs load the
; initial value as two dwords (high half at pointer + 4); the gfx900 run uses
; one dwordx2 load.
4845 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
4846 ; GCN1-LABEL: flat_atomic_umax_i64_ret_scalar:
4848 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4849 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4850 ; GCN1-NEXT: s_add_u32 s34, s4, 4
4851 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4852 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4853 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4854 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
4855 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
4856 ; GCN1-NEXT: flat_load_dword v1, v[2:3]
4857 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4858 ; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start
4859 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4860 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4861 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4862 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4863 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4864 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
4865 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
4866 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
4867 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
4868 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4869 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4870 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4871 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4872 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4873 ; GCN1-NEXT: buffer_wbinvl1_vol
4874 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4875 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4876 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4877 ; GCN1-NEXT: s_cbranch_execnz .LBB82_1
4878 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4879 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4880 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4882 ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar:
4884 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4885 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4886 ; GCN2-NEXT: s_add_u32 s34, s4, 4
4887 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4888 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4889 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
4890 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
4891 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
4892 ; GCN2-NEXT: flat_load_dword v1, v[2:3]
4893 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4894 ; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start
4895 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4896 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4897 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
4898 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
4899 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4900 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
4901 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
4902 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
4903 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
4904 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4905 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4906 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4907 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4908 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4909 ; GCN2-NEXT: buffer_wbinvl1_vol
4910 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4911 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4912 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4913 ; GCN2-NEXT: s_cbranch_execnz .LBB82_1
4914 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4915 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4916 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4918 ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar:
4920 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4921 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4922 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4923 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
4924 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4925 ; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start
4926 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4927 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4928 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
4929 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
4930 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4931 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
4932 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
4933 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
4934 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
4935 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4936 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4937 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4938 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4939 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4940 ; GCN3-NEXT: buffer_wbinvl1_vol
4941 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4942 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4943 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4944 ; GCN3-NEXT: s_cbranch_execnz .LBB82_1
4945 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4946 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4947 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomicrmw operates on %ptr itself, with no gep.
4948 %result = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 in the flat address space at %out
; advanced by 4 i64 elements (32 bytes), in a function declared to return i64.
; The pointer and the operand are both passed `inreg` under the amdgpu_gfx
; calling convention.
;
; The expected code is a compare-and-swap loop in every configuration, with
; the cmpswap result left in v[0:1] when the function returns through
; s_setpc_b64. The bonaire and tonga runs add the offset with scalar adds and
; load the initial value as two dwords (+32 and +36); the gfx900 run uses
; `offset:32` on a single dwordx2 load and on the cmpswap.
4952 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
4953 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
4955 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4956 ; GCN1-NEXT: s_add_u32 s34, s4, 32
4957 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4958 ; GCN1-NEXT: s_add_u32 s36, s4, 36
4959 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
4960 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
4961 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
4962 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4963 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
4964 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
4965 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
4966 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
4967 ; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start
4968 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4969 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4970 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
4971 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
4972 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
4973 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
4974 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
4975 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
4976 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
4977 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
4978 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
4979 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4980 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
4981 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4982 ; GCN1-NEXT: buffer_wbinvl1_vol
4983 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
4984 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
4985 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
4986 ; GCN1-NEXT: s_cbranch_execnz .LBB83_1
4987 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4988 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
4989 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4991 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
4993 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4994 ; GCN2-NEXT: s_add_u32 s34, s4, 32
4995 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4996 ; GCN2-NEXT: s_add_u32 s36, s4, 36
4997 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
4998 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
4999 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
5000 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
5001 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
5002 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
5003 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
5004 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
5005 ; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start
5006 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5007 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5008 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5009 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5010 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5011 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
5012 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
5013 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
5014 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
5015 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5016 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5017 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5018 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5019 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5020 ; GCN2-NEXT: buffer_wbinvl1_vol
5021 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5022 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5023 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
5024 ; GCN2-NEXT: s_cbranch_execnz .LBB83_1
5025 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5026 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
5027 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5029 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
5031 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5032 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5033 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5034 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
5035 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5036 ; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start
5037 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5038 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5039 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5040 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5041 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5042 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
5043 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
5044 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
5045 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
5046 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5047 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5048 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5049 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
5050 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5051 ; GCN3-NEXT: buffer_wbinvl1_vol
5052 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5053 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5054 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5055 ; GCN3-NEXT: s_cbranch_execnz .LBB83_1
5056 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5057 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
5058 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 on an i64 gep is the 32-byte offset seen above.
5059 %gep = getelementptr i64, ptr %out, i64 4
5060 %result = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 from an amdgpu_kernel, result
; unused. The address is %out indexed by the runtime %index (i64 elements)
; and then advanced by a further 4 elements (32 bytes).
;
; The expected code loads the kernel arguments with s_load_dwordx2/x4 from
; s[0:1] (at 0xd/0x9 for bonaire, 0x34/0x24 for tonga and gfx900), scales the
; index with a shift left by 3, and then runs a compare-and-swap loop. The
; bonaire and tonga runs add the constant 32 with scalar adds; the gfx900 run
; uses `offset:32` on the load and on the cmpswap. After the loop the checks
; go straight to s_endpgm without an exec-restoring s_or_b64.
5064 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
5065 ; GCN1-LABEL: atomic_umax_i64_addr64_offset:
5066 ; GCN1: ; %bb.0: ; %entry
5067 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5068 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5069 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5070 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5071 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5072 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5073 ; GCN1-NEXT: s_add_u32 s0, s0, 32
5074 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5075 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5076 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5077 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5078 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5079 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
5080 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5081 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5082 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
5083 ; GCN1-NEXT: v_mov_b32_e32 v0, s3
5084 ; GCN1-NEXT: v_mov_b32_e32 v6, s2
5085 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
5086 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
5087 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5088 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5089 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5090 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5091 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5092 ; GCN1-NEXT: buffer_wbinvl1_vol
5093 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5094 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
5095 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5096 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
5097 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5098 ; GCN1-NEXT: s_cbranch_execnz .LBB84_1
5099 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5100 ; GCN1-NEXT: s_endpgm
5102 ; GCN2-LABEL: atomic_umax_i64_addr64_offset:
5103 ; GCN2: ; %bb.0: ; %entry
5104 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5105 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5106 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5107 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5108 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5109 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5110 ; GCN2-NEXT: s_add_u32 s0, s0, 32
5111 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5112 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5113 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5114 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5115 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5116 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
5117 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5118 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5119 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
5120 ; GCN2-NEXT: v_mov_b32_e32 v0, s3
5121 ; GCN2-NEXT: v_mov_b32_e32 v6, s2
5122 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
5123 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
5124 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5125 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5126 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5127 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5128 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5129 ; GCN2-NEXT: buffer_wbinvl1_vol
5130 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5131 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5132 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5133 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5134 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5135 ; GCN2-NEXT: s_cbranch_execnz .LBB84_1
5136 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5137 ; GCN2-NEXT: s_endpgm
5139 ; GCN3-LABEL: atomic_umax_i64_addr64_offset:
5140 ; GCN3: ; %bb.0: ; %entry
5141 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5142 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5143 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5144 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5145 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5146 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5147 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5148 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5149 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
5150 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
5151 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
5152 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5153 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5154 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
5155 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
5156 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
5157 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
5158 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
5159 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5160 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5161 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5162 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
5163 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5164 ; GCN3-NEXT: buffer_wbinvl1_vol
5165 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5166 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5167 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5168 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5169 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
5170 ; GCN3-NEXT: s_cbranch_execnz .LBB84_1
5171 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5172 ; GCN3-NEXT: s_endpgm
; IR under test: a variable-index gep followed by a constant gep of 4 i64
; elements; the atomicrmw result (%tmp0) is intentionally left unused.
5174 %ptr = getelementptr i64, ptr %out, i64 %index
5175 %gep = getelementptr i64, ptr %ptr, i64 4
5176 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 from an amdgpu_kernel, with the
; value produced by the atomicrmw stored to %out2. The address is %out
; indexed by the runtime %index (i64 elements) and then advanced by a further
; 4 elements (32 bytes).
;
; The expected code loads all kernel arguments with one s_load_dwordx8,
; scales the index with a shift left by 3, and runs a compare-and-swap loop.
; After the loop it restores exec with s_or_b64 and stores v[0:1] (the
; cmpswap result) through the pointer held in s[2:3]. The bonaire and tonga
; runs add the constant 32 with scalar adds; the gfx900 run uses `offset:32`
; on the load and on the cmpswap.
5180 define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
5181 ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
5182 ; GCN1: ; %bb.0: ; %entry
5183 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5184 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5185 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5186 ; GCN1-NEXT: s_add_u32 s0, s0, s6
5187 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
5188 ; GCN1-NEXT: s_add_u32 s0, s0, 32
5189 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5190 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5191 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5192 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5193 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
5194 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
5195 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5196 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5197 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
5198 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
5199 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5200 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
5201 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
5202 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
5203 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
5204 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5205 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5206 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5207 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5208 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5209 ; GCN1-NEXT: buffer_wbinvl1_vol
5210 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5211 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5212 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
5213 ; GCN1-NEXT: s_cbranch_execnz .LBB85_1
5214 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5215 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
5216 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
5217 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
5218 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5219 ; GCN1-NEXT: s_endpgm
5221 ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
5222 ; GCN2: ; %bb.0: ; %entry
5223 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5224 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5225 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5226 ; GCN2-NEXT: s_add_u32 s0, s0, s6
5227 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
5228 ; GCN2-NEXT: s_add_u32 s0, s0, 32
5229 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5230 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5231 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5232 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5233 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
5234 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
5235 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5236 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5237 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5238 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5239 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5240 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
5241 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
5242 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
5243 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
5244 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5245 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5246 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5247 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5248 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5249 ; GCN2-NEXT: buffer_wbinvl1_vol
5250 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5251 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5252 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
5253 ; GCN2-NEXT: s_cbranch_execnz .LBB85_1
5254 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5255 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
5256 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
5257 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
5258 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5259 ; GCN2-NEXT: s_endpgm
5261 ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
5262 ; GCN3: ; %bb.0: ; %entry
5263 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5264 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5265 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5266 ; GCN3-NEXT: s_add_u32 s0, s0, s6
5267 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
5268 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5269 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5270 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
5271 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
5272 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
5273 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5274 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5275 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5276 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5277 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5278 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
5279 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
5280 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
5281 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
5282 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5283 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5284 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5285 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
5286 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5287 ; GCN3-NEXT: buffer_wbinvl1_vol
5288 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5289 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5290 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
5291 ; GCN3-NEXT: s_cbranch_execnz .LBB85_1
5292 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5293 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
5294 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
5295 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
5296 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5297 ; GCN3-NEXT: s_endpgm
; IR under test: a variable-index gep, a constant gep of 4 i64 elements, the
; atomicrmw, and a store of its result so the returning form is exercised.
5299 %ptr = getelementptr i64, ptr %out, i64 %index
5300 %gep = getelementptr i64, ptr %ptr, i64 4
5301 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
5302 store i64 %tmp0, ptr %out2
; Test: seq_cst `atomicrmw umax` on an i64 from an amdgpu_kernel, with the
; value produced by the atomicrmw stored to %out2. The address is %out
; indexed by the runtime %index (i64 elements), with no constant offset.
;
; The expected code loads all kernel arguments with one s_load_dwordx8,
; scales the index with a shift left by 3, and runs a compare-and-swap loop.
; After the loop it restores exec with s_or_b64 and stores v[0:1] (the
; cmpswap result) through the pointer held in s[2:3]. With no constant offset
; to fold, the three configurations differ only in the kernel-argument load
; offset (0x9 for bonaire, 0x24 for tonga and gfx900).
5306 define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
5307 ; GCN1-LABEL: atomic_umax_i64_ret_addr64:
5308 ; GCN1: ; %bb.0: ; %entry
5309 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5310 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5311 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5312 ; GCN1-NEXT: s_add_u32 s0, s0, s6
5313 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
5314 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5315 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5316 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5317 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
5318 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
5319 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5320 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5321 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
5322 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
5323 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5324 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
5325 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
5326 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
5327 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
5328 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5329 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5330 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5331 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5332 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5333 ; GCN1-NEXT: buffer_wbinvl1_vol
5334 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5335 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5336 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
5337 ; GCN1-NEXT: s_cbranch_execnz .LBB86_1
5338 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5339 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
5340 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
5341 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
5342 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5343 ; GCN1-NEXT: s_endpgm
5345 ; GCN2-LABEL: atomic_umax_i64_ret_addr64:
5346 ; GCN2: ; %bb.0: ; %entry
5347 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5348 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5349 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5350 ; GCN2-NEXT: s_add_u32 s0, s0, s6
5351 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
5352 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5353 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5354 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5355 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
5356 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
5357 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5358 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5359 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5360 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5361 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5362 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
5363 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
5364 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
5365 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
5366 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5367 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5368 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5369 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5370 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5371 ; GCN2-NEXT: buffer_wbinvl1_vol
5372 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5373 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5374 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
5375 ; GCN2-NEXT: s_cbranch_execnz .LBB86_1
5376 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5377 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
5378 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
5379 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
5380 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5381 ; GCN2-NEXT: s_endpgm
5383 ; GCN3-LABEL: atomic_umax_i64_ret_addr64:
5384 ; GCN3: ; %bb.0: ; %entry
5385 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5386 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5387 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
5388 ; GCN3-NEXT: s_add_u32 s0, s0, s6
5389 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
5390 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5391 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5392 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
5393 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
5394 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
5395 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5396 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5397 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5398 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5399 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
5400 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
5401 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
5402 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
5403 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
5404 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5405 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5406 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5407 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5408 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5409 ; GCN3-NEXT: buffer_wbinvl1_vol
5410 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5411 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5412 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
5413 ; GCN3-NEXT: s_cbranch_execnz .LBB86_1
5414 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5415 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
5416 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
5417 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
5418 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5419 ; GCN3-NEXT: s_endpgm
; IR under test: a variable-index gep only, the atomicrmw, and a store of its
; result so the returning form is exercised.
5421 %ptr = getelementptr i64, ptr %out, i64 %index
5422 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
5423 store i64 %tmp0, ptr %out2
5427 ; ---------------------------------------------------------------------
5429 ; ---------------------------------------------------------------------
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer, in a
; function declared to return void. The checks use v[0:1] as the address and
; v[2:3] as the %in operand.
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2. GCN1 and GCN2 read the
; initial value with two flat_load_dword (base and base+4); GCN3 uses one
; flat_load_dwordx2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5431 define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
5432 ; GCN1-LABEL: flat_atomic_umin_i64_noret:
5434 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5435 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
5436 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5437 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
5438 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
5439 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5440 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start
5441 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5442 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5443 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5444 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5445 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5446 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5447 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5448 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5449 ; GCN1-NEXT: buffer_wbinvl1_vol
5450 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5451 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
5452 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5453 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
5454 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5455 ; GCN1-NEXT: s_cbranch_execnz .LBB87_1
5456 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5457 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5458 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5460 ; GCN2-LABEL: flat_atomic_umin_i64_noret:
5462 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5463 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
5464 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5465 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
5466 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
5467 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5468 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start
5469 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5470 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5471 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5472 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5473 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5474 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5475 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5476 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5477 ; GCN2-NEXT: buffer_wbinvl1_vol
5478 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5479 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
5480 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5481 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
5482 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5483 ; GCN2-NEXT: s_cbranch_execnz .LBB87_1
5484 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5485 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5486 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5488 ; GCN3-LABEL: flat_atomic_umin_i64_noret:
5490 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5491 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
5492 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5493 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
5494 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5495 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5496 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5497 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5498 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5499 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5500 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5501 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5502 ; GCN3-NEXT: buffer_wbinvl1_vol
5503 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5504 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
5505 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5506 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
5507 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5508 ; GCN3-NEXT: s_cbranch_execnz .LBB87_1
5509 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5510 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5511 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr (no offset).
5512 %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 at %out + 4 elements (a 32-byte
; offset, since the GEP indexes i64), in a function declared to return void.
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2.
; GCN1 and GCN2 materialize the addresses with VALU adds (+32 for the low
; dword and the cmpswap, +36 for the high dword of the initial load); GCN3
; instead encodes the displacement as `offset:32` on the flat instructions.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5516 define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
5517 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset:
5519 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5520 ; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
5521 ; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
5522 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
5523 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5524 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
5525 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
5526 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5527 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start
5528 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5529 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5530 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5531 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5532 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5533 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5534 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
5535 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5536 ; GCN1-NEXT: buffer_wbinvl1_vol
5537 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
5538 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
5539 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5540 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
5541 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5542 ; GCN1-NEXT: s_cbranch_execnz .LBB88_1
5543 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5544 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5545 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5547 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset:
5549 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5550 ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
5551 ; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
5552 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
5553 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5554 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
5555 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
5556 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5557 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
5558 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5559 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5560 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5561 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5562 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5563 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5564 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
5565 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5566 ; GCN2-NEXT: buffer_wbinvl1_vol
5567 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
5568 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
5569 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5570 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
5571 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5572 ; GCN2-NEXT: s_cbranch_execnz .LBB88_1
5573 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5574 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5575 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5577 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset:
5579 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5580 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
5581 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5582 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
5583 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5584 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5585 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5586 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5587 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5588 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5589 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
5590 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5591 ; GCN3-NEXT: buffer_wbinvl1_vol
5592 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5593 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
5594 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5595 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
5596 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5597 ; GCN3-NEXT: s_cbranch_execnz .LBB88_1
5598 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5599 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5600 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: element index 4 of an i64 array, i.e. byte offset 32.
5601 %gep = getelementptr i64, ptr %out, i64 4
5602 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer, in a
; function declared to return i64. The checks use v[0:1] as the address and
; v[2:3] as the %in operand.
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2. After the loop the value
; produced by the last cmpswap (v[4:5]) is copied into v[0:1] before the
; s_setpc_b64 return.
; GCN1 and GCN2 read the initial value with two flat_load_dword (base and
; base+4); GCN3 uses one flat_load_dwordx2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5606 define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
5607 ; GCN1-LABEL: flat_atomic_umin_i64_ret:
5609 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5610 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
5611 ; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
5612 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
5613 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
5614 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5615 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start
5616 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5617 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5618 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
5619 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
5620 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5621 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5622 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5623 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5624 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5625 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5626 ; GCN1-NEXT: buffer_wbinvl1_vol
5627 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5628 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5629 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5630 ; GCN1-NEXT: s_cbranch_execnz .LBB89_1
5631 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5632 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5633 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
5634 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
5635 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5637 ; GCN2-LABEL: flat_atomic_umin_i64_ret:
5639 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5640 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
5641 ; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
5642 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
5643 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
5644 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5645 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
5646 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5647 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5648 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
5649 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
5650 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5651 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5652 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5653 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5654 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5655 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5656 ; GCN2-NEXT: buffer_wbinvl1_vol
5657 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5658 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5659 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5660 ; GCN2-NEXT: s_cbranch_execnz .LBB89_1
5661 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5662 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5663 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
5664 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
5665 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5667 ; GCN3-LABEL: flat_atomic_umin_i64_ret:
5669 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5670 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
5671 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5672 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
5673 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5674 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5675 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
5676 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
5677 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5678 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5679 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5680 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5681 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
5682 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5683 ; GCN3-NEXT: buffer_wbinvl1_vol
5684 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5685 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5686 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5687 ; GCN3-NEXT: s_cbranch_execnz .LBB89_1
5688 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5689 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5690 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
5691 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
5692 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr (no offset).
5693 %result = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 at %out + 4 elements (a 32-byte
; offset, since the GEP indexes i64), in a function declared to return i64.
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2.
; GCN1 and GCN2 compute base+32 / base+36 with VALU adds and have the cmpswap
; write its result straight into v[0:1], so no copy follows the loop. GCN3
; encodes the displacement as `offset:32` and copies v[4:5] into v[0:1] after
; the loop.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5697 define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
5698 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset:
5700 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5701 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
5702 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5703 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
5704 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5705 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
5706 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
5707 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5708 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start
5709 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5710 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5711 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
5712 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
5713 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
5714 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
5715 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
5716 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5717 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
5718 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5719 ; GCN1-NEXT: buffer_wbinvl1_vol
5720 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
5721 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5722 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5723 ; GCN1-NEXT: s_cbranch_execnz .LBB90_1
5724 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5725 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5726 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5728 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset:
5730 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5731 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
5732 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5733 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
5734 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5735 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
5736 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
5737 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5738 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
5739 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5740 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5741 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
5742 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
5743 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
5744 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
5745 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
5746 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5747 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
5748 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5749 ; GCN2-NEXT: buffer_wbinvl1_vol
5750 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
5751 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5752 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5753 ; GCN2-NEXT: s_cbranch_execnz .LBB90_1
5754 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5755 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5756 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5758 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset:
5760 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5761 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
5762 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5763 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
5764 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5765 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5766 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
5767 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
5768 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
5769 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
5770 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
5771 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5772 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
5773 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5774 ; GCN3-NEXT: buffer_wbinvl1_vol
5775 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
5776 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5777 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5778 ; GCN3-NEXT: s_cbranch_execnz .LBB90_1
5779 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5780 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5781 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
5782 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
5783 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: element index 4 of an i64 array, i.e. byte offset 32.
5784 %gep = getelementptr i64, ptr %out, i64 4
5785 %result = atomicrmw umin ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer, with both
; arguments marked `inreg` under the amdgpu_gfx calling convention, in a
; function declared to return void. In the checks the pointer arrives in
; s[4:5] and %in in s[6:7]; both are copied into VGPRs for the flat
; instructions and the v_cndmask selects.
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2, with s[34:35] holding
; the loop's exit mask. GCN1 and GCN2 read the initial value with two
; flat_load_dword (base and base+4, the latter computed with s_add_u32 /
; s_addc_u32); GCN3 uses one flat_load_dwordx2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5789 define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
5790 ; GCN1-LABEL: flat_atomic_umin_i64_noret_scalar:
5792 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5793 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5794 ; GCN1-NEXT: s_add_u32 s34, s4, 4
5795 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5796 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5797 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
5798 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
5799 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
5800 ; GCN1-NEXT: flat_load_dword v3, v[3:4]
5801 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
5802 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start
5803 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5804 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5805 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5806 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
5807 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
5808 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
5809 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
5810 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5811 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5812 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5813 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5814 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5815 ; GCN1-NEXT: buffer_wbinvl1_vol
5816 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5817 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
5818 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5819 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
5820 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
5821 ; GCN1-NEXT: s_cbranch_execnz .LBB91_1
5822 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5823 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
5824 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5826 ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar:
5828 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5829 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5830 ; GCN2-NEXT: s_add_u32 s34, s4, 4
5831 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5832 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5833 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
5834 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
5835 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
5836 ; GCN2-NEXT: flat_load_dword v3, v[3:4]
5837 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
5838 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
5839 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5840 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5841 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5842 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
5843 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
5844 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
5845 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
5846 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5847 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5848 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5849 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5850 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5851 ; GCN2-NEXT: buffer_wbinvl1_vol
5852 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5853 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5854 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5855 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5856 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
5857 ; GCN2-NEXT: s_cbranch_execnz .LBB91_1
5858 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5859 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
5860 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5862 ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar:
5864 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5865 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5866 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5867 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
5868 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5869 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
5870 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5871 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5872 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5873 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
5874 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
5875 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
5876 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
5877 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5878 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5879 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5880 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5881 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5882 ; GCN3-NEXT: buffer_wbinvl1_vol
5883 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5884 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5885 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5886 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5887 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5888 ; GCN3-NEXT: s_cbranch_execnz .LBB91_1
5889 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5890 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
5891 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr (no offset).
5892 %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 at %out + 4 elements (a 32-byte
; offset, since the GEP indexes i64), with both arguments marked `inreg` under
; the amdgpu_gfx calling convention, in a function declared to return void.
; In the checks the pointer arrives in s[4:5] and %in in s[6:7].
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2.
; GCN1 and GCN2 compute base+32 into s[34:35] and base+36 into s[36:37] with
; scalar adds, then reuse s[36:37] as the loop's exit mask. GCN3 encodes the
; displacement as `offset:32` and keeps the exit mask in s[34:35].
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
5896 define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
5897 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
5899 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5900 ; GCN1-NEXT: s_add_u32 s34, s4, 32
5901 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5902 ; GCN1-NEXT: s_add_u32 s36, s4, 36
5903 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
5904 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
5905 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
5906 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
5907 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
5908 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
5909 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
5910 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
5911 ; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start
5912 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5913 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5914 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5915 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
5916 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
5917 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
5918 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
5919 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5920 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5921 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5922 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5923 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5924 ; GCN1-NEXT: buffer_wbinvl1_vol
5925 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5926 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
5927 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5928 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
5929 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
5930 ; GCN1-NEXT: s_cbranch_execnz .LBB92_1
5931 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5932 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
5933 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5935 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
5937 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5938 ; GCN2-NEXT: s_add_u32 s34, s4, 32
5939 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5940 ; GCN2-NEXT: s_add_u32 s36, s4, 36
5941 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
5942 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
5943 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
5944 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
5945 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
5946 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
5947 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
5948 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
5949 ; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start
5950 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5951 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5952 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5953 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
5954 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
5955 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
5956 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
5957 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5958 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5959 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5960 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5961 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5962 ; GCN2-NEXT: buffer_wbinvl1_vol
5963 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5964 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
5965 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
5966 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
5967 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
5968 ; GCN2-NEXT: s_cbranch_execnz .LBB92_1
5969 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5970 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
5971 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5973 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
5975 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5976 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5977 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5978 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
5979 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5980 ; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start
5981 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5982 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5983 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
5984 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
5985 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
5986 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
5987 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
5988 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
5989 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
5990 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5991 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
5992 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5993 ; GCN3-NEXT: buffer_wbinvl1_vol
5994 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
5995 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
5996 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5997 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
5998 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5999 ; GCN3-NEXT: s_cbranch_execnz .LBB92_1
6000 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6001 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6002 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: element index 4 of an i64 array, i.e. byte offset 32.
6003 %gep = getelementptr i64, ptr %out, i64 4
6004 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer, with both
; arguments marked `inreg` under the amdgpu_gfx calling convention, in a
; function declared to return i64. In the checks the pointer arrives in
; s[4:5] and %in in s[6:7].
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2. The cmpswap writes its
; result into v[0:1], so no copy is needed after the loop; the previous value
; is saved to v[2:3] at the top of each iteration for the comparison.
; GCN1 and GCN2 read the initial value with two flat_load_dword (base and
; base+4); GCN3 uses one flat_load_dwordx2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
6008 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
6009 ; GCN1-LABEL: flat_atomic_umin_i64_ret_scalar:
6011 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6012 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
6013 ; GCN1-NEXT: s_add_u32 s34, s4, 4
6014 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
6015 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6016 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
6017 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
6018 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
6019 ; GCN1-NEXT: flat_load_dword v1, v[2:3]
6020 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6021 ; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start
6022 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6023 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6024 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6025 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6026 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6027 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6028 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6029 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
6030 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
6031 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6032 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6033 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6034 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6035 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6036 ; GCN1-NEXT: buffer_wbinvl1_vol
6037 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6038 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6039 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6040 ; GCN1-NEXT: s_cbranch_execnz .LBB93_1
6041 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6042 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6043 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6045 ; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar:
6047 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6048 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
6049 ; GCN2-NEXT: s_add_u32 s34, s4, 4
6050 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
6051 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6052 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
6053 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
6054 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
6055 ; GCN2-NEXT: flat_load_dword v1, v[2:3]
6056 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6057 ; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start
6058 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6059 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6060 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6061 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6062 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6063 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6064 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6065 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
6066 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
6067 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6068 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6069 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6070 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6071 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6072 ; GCN2-NEXT: buffer_wbinvl1_vol
6073 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6074 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6075 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6076 ; GCN2-NEXT: s_cbranch_execnz .LBB93_1
6077 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6078 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6079 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6081 ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar:
6083 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6084 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6085 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6086 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6087 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6088 ; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start
6089 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6090 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6091 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
6092 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
6093 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6094 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
6095 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
6096 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
6097 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
6098 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6099 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6100 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6101 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6102 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6103 ; GCN3-NEXT: buffer_wbinvl1_vol
6104 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6105 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6106 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6107 ; GCN3-NEXT: s_cbranch_execnz .LBB93_1
6108 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6109 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6110 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr (no offset).
6111 %result = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umin` of an i64 at %out + 4 elements (a 32-byte
; offset, since the GEP indexes i64), with both arguments marked `inreg` under
; the amdgpu_gfx calling convention, in a function declared to return i64.
; In the checks the pointer arrives in s[4:5] and %in in s[6:7].
; All three prefixes (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900, per the RUN
; lines at the top of the file) expect a compare-and-swap loop
; (%atomicrmw.start) built on flat_atomic_cmpswap_x2 whose result lands in
; v[0:1], so no copy follows the loop.
; GCN1 and GCN2 compute base+32 into s[34:35] and base+36 into s[36:37] with
; scalar adds, then reuse s[36:37] as the loop's exit mask. GCN3 encodes the
; displacement as `offset:32` and keeps the exit mask in s[34:35].
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
6115 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
6116 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
6118 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6119 ; GCN1-NEXT: s_add_u32 s34, s4, 32
6120 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6121 ; GCN1-NEXT: s_add_u32 s36, s4, 36
6122 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
6123 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
6124 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
6125 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
6126 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
6127 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
6128 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
6129 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
6130 ; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start
6131 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6132 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6133 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6134 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6135 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6136 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6137 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6138 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
6139 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
6140 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6141 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6142 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6143 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6144 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6145 ; GCN1-NEXT: buffer_wbinvl1_vol
6146 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6147 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6148 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
6149 ; GCN1-NEXT: s_cbranch_execnz .LBB94_1
6150 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6151 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
6152 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6154 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
6156 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6157 ; GCN2-NEXT: s_add_u32 s34, s4, 32
6158 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6159 ; GCN2-NEXT: s_add_u32 s36, s4, 36
6160 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
6161 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
6162 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
6163 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
6164 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
6165 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
6166 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
6167 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
6168 ; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start
6169 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6170 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6171 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6172 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6173 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6174 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6175 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6176 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
6177 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
6178 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6179 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6180 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6181 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6182 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6183 ; GCN2-NEXT: buffer_wbinvl1_vol
6184 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6185 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6186 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
6187 ; GCN2-NEXT: s_cbranch_execnz .LBB94_1
6188 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6189 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
6190 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6192 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
6194 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6195 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6196 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6197 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
6198 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6199 ; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start
6200 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6201 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6202 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
6203 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
6204 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
6205 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
6206 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
6207 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
6208 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
6209 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6210 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6211 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6212 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
6213 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6214 ; GCN3-NEXT: buffer_wbinvl1_vol
6215 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6216 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6217 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6218 ; GCN3-NEXT: s_cbranch_execnz .LBB94_1
6219 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6220 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6221 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: element index 4 of an i64 array, i.e. byte offset 32.
6222 %gep = getelementptr i64, ptr %out, i64 4
6223 %result = atomicrmw umin ptr %gep, i64 %in seq_cst
6227 ; ---------------------------------------------------------------------
6229 ; ---------------------------------------------------------------------
; Signed 64-bit `atomicrmw min` (seq_cst) through a flat pointer passed in
; VGPRs; the function is declared void and %tmp0 has no visible use here.
; All three check prefixes expect a compare-and-swap loop (%atomicrmw.start):
; v_cmp_le_i64 plus two v_cndmask_b32 pick the smaller of the current memory
; value and %in, flat_atomic_cmpswap_x2 attempts the store, and lanes whose
; returned value equals the expected one are masked out of exec until no lane
; is left.
; GCN1 and GCN2 read the initial value with two flat_load_dword (ptr and
; ptr+4); GCN3 uses a single flat_load_dwordx2.
6231 define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
6232 ; GCN1-LABEL: flat_atomic_min_i64_noret:
6234 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6235 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
6236 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6237 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
6238 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
6239 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6240 ; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start
6241 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6242 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6243 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6244 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6245 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6246 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6247 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6248 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6249 ; GCN1-NEXT: buffer_wbinvl1_vol
6250 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6251 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
6252 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6253 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
6254 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6255 ; GCN1-NEXT: s_cbranch_execnz .LBB95_1
6256 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6257 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6258 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6260 ; GCN2-LABEL: flat_atomic_min_i64_noret:
6262 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6263 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
6264 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6265 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
6266 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
6267 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6268 ; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start
6269 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6270 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6271 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6272 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6273 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6274 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6275 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6276 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6277 ; GCN2-NEXT: buffer_wbinvl1_vol
6278 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6279 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
6280 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6281 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
6282 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6283 ; GCN2-NEXT: s_cbranch_execnz .LBB95_1
6284 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6285 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6286 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6288 ; GCN3-LABEL: flat_atomic_min_i64_noret:
6290 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6291 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
6292 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6293 ; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start
6294 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6295 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6296 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6297 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6298 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6299 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6300 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6301 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6302 ; GCN3-NEXT: buffer_wbinvl1_vol
6303 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6304 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6305 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6306 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6307 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6308 ; GCN3-NEXT: s_cbranch_execnz .LBB95_1
6309 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6310 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6311 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6312 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
; No-return signed 64-bit `atomicrmw min` (seq_cst) on %out advanced by 4 i64
; elements, i.e. a byte offset of 32.
; GCN1 and GCN2 materialize the addresses with 32-bit add/addc pairs (+32 for
; the low dword and the cmpswap address, +36 for the high dword of the initial
; load); GCN3 folds the displacement into the memory instructions as offset:32.
; The expected loop is otherwise a compare-and-swap loop that retries until
; every lane's cmpswap returned the value it expected.
6316 define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
6317 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset:
6319 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6320 ; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0
6321 ; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
6322 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
6323 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6324 ; GCN1-NEXT: flat_load_dword v7, v[0:1]
6325 ; GCN1-NEXT: flat_load_dword v6, v[8:9]
6326 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6327 ; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start
6328 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6329 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6330 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6331 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6332 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6333 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6334 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
6335 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6336 ; GCN1-NEXT: buffer_wbinvl1_vol
6337 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
6338 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
6339 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6340 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
6341 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6342 ; GCN1-NEXT: s_cbranch_execnz .LBB96_1
6343 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6344 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6345 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6347 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset:
6349 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6350 ; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0
6351 ; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
6352 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
6353 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6354 ; GCN2-NEXT: flat_load_dword v7, v[0:1]
6355 ; GCN2-NEXT: flat_load_dword v6, v[8:9]
6356 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6357 ; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start
6358 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6359 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6360 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6361 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6362 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6363 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6364 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
6365 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6366 ; GCN2-NEXT: buffer_wbinvl1_vol
6367 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
6368 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
6369 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6370 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
6371 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6372 ; GCN2-NEXT: s_cbranch_execnz .LBB96_1
6373 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6374 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6375 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6377 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset:
6379 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6380 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
6381 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6382 ; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start
6383 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6384 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6385 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6386 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6387 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6388 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6389 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
6390 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6391 ; GCN3-NEXT: buffer_wbinvl1_vol
6392 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6393 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6394 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6395 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6396 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6397 ; GCN3-NEXT: s_cbranch_execnz .LBB96_1
6398 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6399 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6400 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6401 %gep = getelementptr i64, ptr %out, i64 4
6402 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
; Returning form of the signed 64-bit `atomicrmw min` (seq_cst) on a flat
; pointer in VGPRs: the function is declared to return i64 and %result holds
; the atomicrmw value.
; The checks expect a compare-and-swap loop in which the expected value is
; copied into v[6:7] at the top of every iteration; once the loop exits, the
; value returned by the last flat_atomic_cmpswap_x2 is moved from v[4:5] into
; v[0:1] ahead of s_setpc_b64.
; GCN1 and GCN2 split the initial read into two flat_load_dword; GCN3 uses one
; flat_load_dwordx2.
6406 define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
6407 ; GCN1-LABEL: flat_atomic_min_i64_ret:
6409 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6410 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0
6411 ; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
6412 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
6413 ; GCN1-NEXT: flat_load_dword v5, v[5:6]
6414 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6415 ; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start
6416 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6417 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6418 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
6419 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
6420 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6421 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6422 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6423 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6424 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6425 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6426 ; GCN1-NEXT: buffer_wbinvl1_vol
6427 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6428 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6429 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6430 ; GCN1-NEXT: s_cbranch_execnz .LBB97_1
6431 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6432 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6433 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
6434 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
6435 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6437 ; GCN2-LABEL: flat_atomic_min_i64_ret:
6439 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6440 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0
6441 ; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
6442 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
6443 ; GCN2-NEXT: flat_load_dword v5, v[5:6]
6444 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6445 ; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start
6446 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6447 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6448 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
6449 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
6450 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6451 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6452 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6453 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6454 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6455 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6456 ; GCN2-NEXT: buffer_wbinvl1_vol
6457 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6458 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6459 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6460 ; GCN2-NEXT: s_cbranch_execnz .LBB97_1
6461 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6462 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6463 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
6464 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
6465 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6467 ; GCN3-LABEL: flat_atomic_min_i64_ret:
6469 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6470 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
6471 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6472 ; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start
6473 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6474 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6475 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6476 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6477 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6478 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6479 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6480 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6481 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6482 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6483 ; GCN3-NEXT: buffer_wbinvl1_vol
6484 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6485 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6486 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6487 ; GCN3-NEXT: s_cbranch_execnz .LBB97_1
6488 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6489 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6490 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
6491 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
6492 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6493 %result = atomicrmw min ptr %ptr, i64 %in seq_cst
; Returning signed 64-bit `atomicrmw min` (seq_cst) on %out advanced by 4 i64
; elements (byte offset 32); the function is declared to return i64.
; On GCN1 and GCN2 the addresses are built with add/addc pairs (+32, +36) and
; the cmpswap writes its result directly into v[0:1], so no copies follow the
; loop. GCN3 uses offset:32 on the load and the cmpswap, keeps the result in
; v[4:5], and moves it to v[0:1] after the loop.
6497 define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
6498 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset:
6500 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6501 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
6502 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6503 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0
6504 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6505 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
6506 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
6507 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6508 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start
6509 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6510 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6511 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
6512 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
6513 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
6514 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
6515 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
6516 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6517 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6518 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6519 ; GCN1-NEXT: buffer_wbinvl1_vol
6520 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6521 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6522 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6523 ; GCN1-NEXT: s_cbranch_execnz .LBB98_1
6524 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6525 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6526 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6528 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset:
6530 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6531 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
6532 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6533 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0
6534 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6535 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
6536 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
6537 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6538 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start
6539 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6540 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6541 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
6542 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
6543 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
6544 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
6545 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
6546 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6547 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6548 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6549 ; GCN2-NEXT: buffer_wbinvl1_vol
6550 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6551 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6552 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6553 ; GCN2-NEXT: s_cbranch_execnz .LBB98_1
6554 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6555 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6556 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6558 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset:
6560 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6561 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
6562 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6563 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start
6564 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6565 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6566 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6567 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6568 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
6569 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
6570 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
6571 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6572 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
6573 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6574 ; GCN3-NEXT: buffer_wbinvl1_vol
6575 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6576 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6577 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6578 ; GCN3-NEXT: s_cbranch_execnz .LBB98_1
6579 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6580 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6581 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
6582 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
6583 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6584 %gep = getelementptr i64, ptr %out, i64 4
6585 %result = atomicrmw min ptr %gep, i64 %in seq_cst
; No-return signed 64-bit `atomicrmw min` (seq_cst) where both arguments are
; marked inreg under the amdgpu_gfx calling convention.
; In the checks the pointer is taken from s[4:5] and %in from s[6:7]; every
; loop iteration copies them into VGPRs with v_mov_b32 before the cmpswap, and
; the comparison is emitted as v_cmp_ge_i64 with the SGPR pair as its first
; operand. The loop's exit mask is accumulated in s[34:35].
; GCN1 and GCN2 read the initial value with two flat_load_dword (base and
; base+4, the latter computed with s_add_u32/s_addc_u32); GCN3 uses a single
; flat_load_dwordx2.
6589 define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
6590 ; GCN1-LABEL: flat_atomic_min_i64_noret_scalar:
6592 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6593 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
6594 ; GCN1-NEXT: s_add_u32 s34, s4, 4
6595 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
6596 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6597 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
6598 ; GCN1-NEXT: v_mov_b32_e32 v4, s35
6599 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
6600 ; GCN1-NEXT: flat_load_dword v3, v[3:4]
6601 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6602 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start
6603 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6604 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6605 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6606 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6607 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6608 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
6609 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
6610 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6611 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6612 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6613 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6614 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6615 ; GCN1-NEXT: buffer_wbinvl1_vol
6616 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6617 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6618 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6619 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6620 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6621 ; GCN1-NEXT: s_cbranch_execnz .LBB99_1
6622 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6623 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6624 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6626 ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar:
6628 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6629 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
6630 ; GCN2-NEXT: s_add_u32 s34, s4, 4
6631 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
6632 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6633 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
6634 ; GCN2-NEXT: v_mov_b32_e32 v4, s35
6635 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
6636 ; GCN2-NEXT: flat_load_dword v3, v[3:4]
6637 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6638 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start
6639 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6640 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6641 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6642 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6643 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6644 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
6645 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
6646 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6647 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6648 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6649 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6650 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6651 ; GCN2-NEXT: buffer_wbinvl1_vol
6652 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6653 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6654 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6655 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6656 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6657 ; GCN2-NEXT: s_cbranch_execnz .LBB99_1
6658 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6659 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6660 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6662 ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar:
6664 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6665 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6666 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6667 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
6668 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6669 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start
6670 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6671 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6672 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6673 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
6674 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
6675 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
6676 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
6677 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6678 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6679 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6680 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6681 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6682 ; GCN3-NEXT: buffer_wbinvl1_vol
6683 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6684 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
6685 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6686 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
6687 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6688 ; GCN3-NEXT: s_cbranch_execnz .LBB99_1
6689 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6690 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6691 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6692 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
; No-return signed 64-bit `atomicrmw min` (seq_cst) on an inreg pointer
; advanced by 4 i64 elements (byte offset 32), amdgpu_gfx calling convention.
; GCN1 and GCN2 compute base+32 into s[34:35] and base+36 into s[36:37] with
; s_add_u32/s_addc_u32, then reuse s[36:37] as the loop's exit mask once the
; initial loads are issued. GCN3 keeps the base in s[4:5], applies offset:32
; on the load and the cmpswap, and uses s[34:35] as the exit mask.
6696 define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
6697 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset_scalar:
6699 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6700 ; GCN1-NEXT: s_add_u32 s34, s4, 32
6701 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6702 ; GCN1-NEXT: s_add_u32 s36, s4, 36
6703 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
6704 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
6705 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
6706 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
6707 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
6708 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6709 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
6710 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
6711 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
6712 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6713 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6714 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6715 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6716 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6717 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
6718 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
6719 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6720 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6721 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6722 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6723 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6724 ; GCN1-NEXT: buffer_wbinvl1_vol
6725 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6726 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6727 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6728 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6729 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
6730 ; GCN1-NEXT: s_cbranch_execnz .LBB100_1
6731 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6732 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
6733 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6735 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar:
6737 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6738 ; GCN2-NEXT: s_add_u32 s34, s4, 32
6739 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6740 ; GCN2-NEXT: s_add_u32 s36, s4, 36
6741 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
6742 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
6743 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
6744 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
6745 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
6746 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
6747 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
6748 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
6749 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
6750 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6751 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6752 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6753 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6754 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6755 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
6756 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
6757 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6758 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6759 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6760 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6761 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6762 ; GCN2-NEXT: buffer_wbinvl1_vol
6763 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6764 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6765 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6766 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6767 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
6768 ; GCN2-NEXT: s_cbranch_execnz .LBB100_1
6769 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6770 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
6771 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6773 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar:
6775 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6776 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6777 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6778 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
6779 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6780 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
6781 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6782 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6783 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6784 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
6785 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
6786 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
6787 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
6788 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6789 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6790 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6791 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
6792 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6793 ; GCN3-NEXT: buffer_wbinvl1_vol
6794 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6795 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
6796 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6797 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
6798 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6799 ; GCN3-NEXT: s_cbranch_execnz .LBB100_1
6800 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6801 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6802 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6803 %gep = getelementptr i64, ptr %out, i64 4
6804 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
; Returning signed 64-bit `atomicrmw min` (seq_cst) with the pointer and the
; value both marked inreg, amdgpu_gfx calling convention; the function is
; declared to return i64 and %result holds the atomicrmw value.
; The checks read the pointer from s[4:5] and %in from s[6:7]. The expected
; value is copied into v[2:3] at the top of each iteration, and the cmpswap
; writes its result into v[0:1], which is where the checks leave it at
; s_setpc_b64, so no copy follows the loop.
6808 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
6809 ; GCN1-LABEL: flat_atomic_min_i64_ret_scalar:
6811 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6812 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
6813 ; GCN1-NEXT: s_add_u32 s34, s4, 4
6814 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
6815 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6816 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
6817 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
6818 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
6819 ; GCN1-NEXT: flat_load_dword v1, v[2:3]
6820 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6821 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start
6822 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6823 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6824 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6825 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6826 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6827 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6828 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6829 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
6830 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
6831 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6832 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6833 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6834 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6835 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6836 ; GCN1-NEXT: buffer_wbinvl1_vol
6837 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6838 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6839 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6840 ; GCN1-NEXT: s_cbranch_execnz .LBB101_1
6841 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6842 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6843 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6845 ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar:
6847 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6848 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
6849 ; GCN2-NEXT: s_add_u32 s34, s4, 4
6850 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
6851 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6852 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
6853 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
6854 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
6855 ; GCN2-NEXT: flat_load_dword v1, v[2:3]
6856 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6857 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start
6858 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6859 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6860 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6861 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6862 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6863 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6864 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6865 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
6866 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
6867 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6868 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6869 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6870 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6871 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6872 ; GCN2-NEXT: buffer_wbinvl1_vol
6873 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6874 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6875 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6876 ; GCN2-NEXT: s_cbranch_execnz .LBB101_1
6877 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6878 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6879 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6881 ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar:
6883 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6884 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6885 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6886 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
6887 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6888 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
6889 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6890 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6891 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
6892 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
6893 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6894 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
6895 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
6896 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
6897 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
6898 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6899 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6900 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6901 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6902 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6903 ; GCN3-NEXT: buffer_wbinvl1_vol
6904 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6905 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6906 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6907 ; GCN3-NEXT: s_cbranch_execnz .LBB101_1
6908 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6909 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6910 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6911 %result = atomicrmw min ptr %ptr, i64 %in seq_cst
; Signed 64-bit atomicrmw min (seq_cst) on a flat pointer, amdgpu_gfx calling
; convention with both the pointer and the operand marked inreg. The checks
; read them from s[4:5] and s[6:7]. The address is %out plus a constant 32 bytes.
; All three check prefixes expect a flat_atomic_cmpswap_x2 retry loop
; (v_cmp_ge_i64 + v_cndmask pick the value to store). The GCN1 and GCN2 checks
; materialize the +32 with s_add_u32/s_addc_u32 and load the initial value as
; two dwords; the GCN3 checks use offset:32 on a single dwordx2 load and on the
; cmpswap itself.
6915 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
6916 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset_scalar:
6918 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6919 ; GCN1-NEXT: s_add_u32 s34, s4, 32
6920 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6921 ; GCN1-NEXT: s_add_u32 s36, s4, 36
6922 ; GCN1-NEXT: s_addc_u32 s37, s5, 0
6923 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
6924 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
6925 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
6926 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
6927 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
6928 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
6929 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
6930 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
6931 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6932 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6933 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
6934 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
6935 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6936 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
6937 ; GCN1-NEXT: v_mov_b32_e32 v6, s6
6938 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
6939 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
6940 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6941 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6942 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6943 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6944 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6945 ; GCN1-NEXT: buffer_wbinvl1_vol
6946 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6947 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6948 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
6949 ; GCN1-NEXT: s_cbranch_execnz .LBB102_1
6950 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6951 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
6952 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6954 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar:
6956 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6957 ; GCN2-NEXT: s_add_u32 s34, s4, 32
6958 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6959 ; GCN2-NEXT: s_add_u32 s36, s4, 36
6960 ; GCN2-NEXT: s_addc_u32 s37, s5, 0
6961 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
6962 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
6963 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
6964 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
6965 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
6966 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
6967 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
6968 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
6969 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6970 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6971 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
6972 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
6973 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
6974 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
6975 ; GCN2-NEXT: v_mov_b32_e32 v6, s6
6976 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
6977 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
6978 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
6979 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
6980 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6981 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
6982 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6983 ; GCN2-NEXT: buffer_wbinvl1_vol
6984 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
6985 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
6986 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
6987 ; GCN2-NEXT: s_cbranch_execnz .LBB102_1
6988 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6989 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
6990 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6992 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar:
6994 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6995 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6996 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6997 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
6998 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6999 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
7000 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7001 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7002 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7003 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7004 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7005 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
7006 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
7007 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
7008 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
7009 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7010 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7011 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7012 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
7013 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7014 ; GCN3-NEXT: buffer_wbinvl1_vol
7015 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7016 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7017 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
7018 ; GCN3-NEXT: s_cbranch_execnz .LBB102_1
7019 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7020 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
7021 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep = %out + 4 * 8 bytes (four i64 elements), i.e. the 32-byte offset seen in the checks.
7022 %gep = getelementptr i64, ptr %out, i64 4
7023 %result = atomicrmw min ptr %gep, i64 %in seq_cst
; Kernel variant of the signed 64-bit atomicrmw min (seq_cst) whose result is
; not used. The address is %out + %index * 8 + 32: the checks show the index
; scaled with s_lshl_b64 by 3. The GCN1 and GCN2 checks add the 32 with scalar
; adds; the GCN3 checks fold it into offset:32. All three expect a
; flat_atomic_cmpswap_x2 retry loop that ends in s_endpgm.
7027 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
7028 ; GCN1-LABEL: atomic_min_i64_addr64_offset:
7029 ; GCN1: ; %bb.0: ; %entry
7030 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
7031 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7032 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7033 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
7034 ; GCN1-NEXT: s_add_u32 s0, s0, s4
7035 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
7036 ; GCN1-NEXT: s_add_u32 s0, s0, 32
7037 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7038 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7039 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7040 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7041 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
7042 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
7043 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7044 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7045 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
7046 ; GCN1-NEXT: v_mov_b32_e32 v0, s3
7047 ; GCN1-NEXT: v_mov_b32_e32 v6, s2
7048 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
7049 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
7050 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7051 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7052 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7053 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7054 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7055 ; GCN1-NEXT: buffer_wbinvl1_vol
7056 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7057 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7058 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7059 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7060 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
7061 ; GCN1-NEXT: s_cbranch_execnz .LBB103_1
7062 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7063 ; GCN1-NEXT: s_endpgm
7065 ; GCN2-LABEL: atomic_min_i64_addr64_offset:
7066 ; GCN2: ; %bb.0: ; %entry
7067 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
7068 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7069 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7070 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
7071 ; GCN2-NEXT: s_add_u32 s0, s0, s4
7072 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
7073 ; GCN2-NEXT: s_add_u32 s0, s0, 32
7074 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7075 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7076 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7077 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7078 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
7079 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
7080 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7081 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7082 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
7083 ; GCN2-NEXT: v_mov_b32_e32 v0, s3
7084 ; GCN2-NEXT: v_mov_b32_e32 v6, s2
7085 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
7086 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
7087 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7088 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7089 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7090 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7091 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7092 ; GCN2-NEXT: buffer_wbinvl1_vol
7093 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7094 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7095 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7096 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7097 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
7098 ; GCN2-NEXT: s_cbranch_execnz .LBB103_1
7099 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7100 ; GCN2-NEXT: s_endpgm
7102 ; GCN3-LABEL: atomic_min_i64_addr64_offset:
7103 ; GCN3: ; %bb.0: ; %entry
7104 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7105 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7106 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7107 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
7108 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7109 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7110 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7111 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7112 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
7113 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
7114 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
7115 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7116 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7117 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
7118 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
7119 ; GCN3-NEXT: v_mov_b32_e32 v6, s6
7120 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
7121 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
7122 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7123 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7124 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7125 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
7126 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7127 ; GCN3-NEXT: buffer_wbinvl1_vol
7128 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7129 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7130 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
7131 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7132 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
7133 ; GCN3-NEXT: s_cbranch_execnz .LBB103_1
7134 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7135 ; GCN3-NEXT: s_endpgm
; %ptr = %out + %index * 8 bytes; %gep adds a further 4 * 8 = 32 bytes.
7137 %ptr = getelementptr i64, ptr %out, i64 %index
7138 %gep = getelementptr i64, ptr %ptr, i64 4
7139 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
; Kernel variant of the signed 64-bit atomicrmw min (seq_cst) at
; %out + %index * 8 + 32 whose result is stored to %out2. All three check
; prefixes expect a flat_atomic_cmpswap_x2 retry loop followed by
; flat_store_dwordx2 of the loop result. Only the GCN3 checks fold the 32-byte
; offset into offset:32.
7143 define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
7144 ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
7145 ; GCN1: ; %bb.0: ; %entry
7146 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
7147 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7148 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7149 ; GCN1-NEXT: s_add_u32 s0, s0, s6
7150 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
7151 ; GCN1-NEXT: s_add_u32 s0, s0, 32
7152 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7153 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7154 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7155 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7156 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
7157 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
7158 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7159 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7160 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7161 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7162 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7163 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
7164 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
7165 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
7166 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
7167 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7168 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7169 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7170 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7171 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7172 ; GCN1-NEXT: buffer_wbinvl1_vol
7173 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7174 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7175 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
7176 ; GCN1-NEXT: s_cbranch_execnz .LBB104_1
7177 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7178 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
7179 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
7180 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
7181 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7182 ; GCN1-NEXT: s_endpgm
7184 ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
7185 ; GCN2: ; %bb.0: ; %entry
7186 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
7187 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7188 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7189 ; GCN2-NEXT: s_add_u32 s0, s0, s6
7190 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
7191 ; GCN2-NEXT: s_add_u32 s0, s0, 32
7192 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7193 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7194 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7195 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7196 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
7197 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
7198 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7199 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7200 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7201 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7202 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7203 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
7204 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
7205 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
7206 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
7207 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7208 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7209 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7210 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7211 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7212 ; GCN2-NEXT: buffer_wbinvl1_vol
7213 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7214 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7215 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
7216 ; GCN2-NEXT: s_cbranch_execnz .LBB104_1
7217 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7218 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
7219 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
7220 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
7221 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7222 ; GCN2-NEXT: s_endpgm
7224 ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
7225 ; GCN3: ; %bb.0: ; %entry
7226 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
7227 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7228 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7229 ; GCN3-NEXT: s_add_u32 s0, s0, s6
7230 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
7231 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7232 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7233 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
7234 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
7235 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
7236 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7237 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7238 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7239 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7240 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7241 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
7242 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
7243 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
7244 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
7245 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7246 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7247 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7248 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
7249 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7250 ; GCN3-NEXT: buffer_wbinvl1_vol
7251 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7252 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7253 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
7254 ; GCN3-NEXT: s_cbranch_execnz .LBB104_1
7255 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7256 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
7257 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
7258 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
7259 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7260 ; GCN3-NEXT: s_endpgm
; %ptr = %out + %index * 8 bytes; %gep adds a further 4 * 8 = 32 bytes.
7262 %ptr = getelementptr i64, ptr %out, i64 %index
7263 %gep = getelementptr i64, ptr %ptr, i64 4
7264 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
7265 store i64 %tmp0, ptr %out2
; Kernel variant of the signed 64-bit atomicrmw min (seq_cst) applied directly
; to %out, with no index and no constant offset; the result is not used. All
; three check prefixes expect the same flat_atomic_cmpswap_x2 retry loop and
; differ only in the kernel-argument load offset (0x9 vs 0x24).
7269 define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
7270 ; GCN1-LABEL: atomic_min_i64:
7271 ; GCN1: ; %bb.0: ; %entry
7272 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7273 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
7274 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7275 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7276 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7277 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7278 ; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
7279 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7280 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7281 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
7282 ; GCN1-NEXT: v_mov_b32_e32 v0, s3
7283 ; GCN1-NEXT: v_mov_b32_e32 v6, s2
7284 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
7285 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
7286 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7287 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7288 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7289 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7290 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7291 ; GCN1-NEXT: buffer_wbinvl1_vol
7292 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7293 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7294 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7295 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7296 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
7297 ; GCN1-NEXT: s_cbranch_execnz .LBB105_1
7298 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7299 ; GCN1-NEXT: s_endpgm
7301 ; GCN2-LABEL: atomic_min_i64:
7302 ; GCN2: ; %bb.0: ; %entry
7303 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7304 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
7305 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7306 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7307 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7308 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7309 ; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start
7310 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7312 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
7313 ; GCN2-NEXT: v_mov_b32_e32 v0, s3
7314 ; GCN2-NEXT: v_mov_b32_e32 v6, s2
7315 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
7316 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
7317 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7318 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7319 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7320 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7321 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7322 ; GCN2-NEXT: buffer_wbinvl1_vol
7323 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7324 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7325 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7326 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7327 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
7328 ; GCN2-NEXT: s_cbranch_execnz .LBB105_1
7329 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7330 ; GCN2-NEXT: s_endpgm
7332 ; GCN3-LABEL: atomic_min_i64:
7333 ; GCN3: ; %bb.0: ; %entry
7334 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7335 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
7336 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7337 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7338 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7339 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
7340 ; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start
7341 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7342 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7343 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
7344 ; GCN3-NEXT: v_mov_b32_e32 v0, s3
7345 ; GCN3-NEXT: v_mov_b32_e32 v6, s2
7346 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
7347 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
7348 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7349 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7350 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7351 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7352 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7353 ; GCN3-NEXT: buffer_wbinvl1_vol
7354 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7355 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7356 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7357 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7358 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
7359 ; GCN3-NEXT: s_cbranch_execnz .LBB105_1
7360 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7361 ; GCN3-NEXT: s_endpgm
7363 %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst
; Kernel variant of the signed 64-bit atomicrmw min (seq_cst) at
; %out + %index * 8 with no constant offset; the result is stored to %out2.
; With no offset to fold, the three check prefixes expect the same
; flat_atomic_cmpswap_x2 retry loop and the same flat_store_dwordx2 of the
; result, differing only in the kernel-argument load offset.
7367 define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
7368 ; GCN1-LABEL: atomic_min_i64_ret_addr64:
7369 ; GCN1: ; %bb.0: ; %entry
7370 ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
7371 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7372 ; GCN1-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7373 ; GCN1-NEXT: s_add_u32 s0, s0, s6
7374 ; GCN1-NEXT: s_addc_u32 s1, s1, s7
7375 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7376 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7377 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7378 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
7379 ; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start
7380 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7381 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7382 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7383 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7384 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7385 ; GCN1-NEXT: v_mov_b32_e32 v0, s5
7386 ; GCN1-NEXT: v_mov_b32_e32 v6, s4
7387 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
7388 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
7389 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7390 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7391 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7392 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7393 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7394 ; GCN1-NEXT: buffer_wbinvl1_vol
7395 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7396 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7397 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
7398 ; GCN1-NEXT: s_cbranch_execnz .LBB106_1
7399 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7400 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
7401 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
7402 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
7403 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7404 ; GCN1-NEXT: s_endpgm
7406 ; GCN2-LABEL: atomic_min_i64_ret_addr64:
7407 ; GCN2: ; %bb.0: ; %entry
7408 ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
7409 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7410 ; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7411 ; GCN2-NEXT: s_add_u32 s0, s0, s6
7412 ; GCN2-NEXT: s_addc_u32 s1, s1, s7
7413 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7414 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7415 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7416 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
7417 ; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start
7418 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7419 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7420 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7421 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7422 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7423 ; GCN2-NEXT: v_mov_b32_e32 v0, s5
7424 ; GCN2-NEXT: v_mov_b32_e32 v6, s4
7425 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
7426 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
7427 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7428 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7429 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7430 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7431 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7432 ; GCN2-NEXT: buffer_wbinvl1_vol
7433 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7434 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7435 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
7436 ; GCN2-NEXT: s_cbranch_execnz .LBB106_1
7437 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7438 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
7439 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
7440 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
7441 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7442 ; GCN2-NEXT: s_endpgm
7444 ; GCN3-LABEL: atomic_min_i64_ret_addr64:
7445 ; GCN3: ; %bb.0: ; %entry
7446 ; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
7447 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7448 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
7449 ; GCN3-NEXT: s_add_u32 s0, s0, s6
7450 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
7451 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7452 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7453 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
7454 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
7455 ; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start
7456 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7457 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7458 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7459 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7460 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3]
7461 ; GCN3-NEXT: v_mov_b32_e32 v0, s5
7462 ; GCN3-NEXT: v_mov_b32_e32 v6, s4
7463 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
7464 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
7465 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc
7466 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
7467 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7468 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7469 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7470 ; GCN3-NEXT: buffer_wbinvl1_vol
7471 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7472 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7473 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
7474 ; GCN3-NEXT: s_cbranch_execnz .LBB106_1
7475 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7476 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
7477 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
7478 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
7479 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
7480 ; GCN3-NEXT: s_endpgm
; %ptr = %out + %index * 8 bytes (i64 elements).
7482 %ptr = getelementptr i64, ptr %out, i64 %index
7483 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
7484 store i64 %tmp0, ptr %out2
7488 ; ---------------------------------------------------------------------
7489 ; atomicrmw uinc_wrap
7490 ; ---------------------------------------------------------------------
; 64-bit atomicrmw uinc_wrap (seq_cst) on a flat pointer, with the result
; unused. All three check prefixes expect a single flat_atomic_inc_x2 without
; glc and with no compare-and-swap loop.
7492 define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
7493 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret:
7495 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7496 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
7497 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7498 ; GCN1-NEXT: buffer_wbinvl1_vol
7499 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7501 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret:
7503 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7504 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
7505 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7506 ; GCN2-NEXT: buffer_wbinvl1_vol
7507 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7509 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret:
7511 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7512 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
7513 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7514 ; GCN3-NEXT: buffer_wbinvl1_vol
7515 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7516 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; 64-bit atomicrmw uinc_wrap (seq_cst) at %out + 32 bytes, with the result
; unused. The GCN1 and GCN2 checks add the 32 to the address with a vector
; add/addc pair before flat_atomic_inc_x2; the GCN3 checks fold it into
; offset:32 on the instruction.
7520 define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
7521 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
7523 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7524 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
7525 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7526 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7527 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
7528 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7529 ; GCN1-NEXT: buffer_wbinvl1_vol
7530 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7532 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
7534 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7535 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7536 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7537 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7538 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
7539 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7540 ; GCN2-NEXT: buffer_wbinvl1_vol
7541 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7543 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
7545 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7546 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
7547 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7548 ; GCN3-NEXT: buffer_wbinvl1_vol
7549 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep = %out + 4 * 8 bytes (four i64 elements), i.e. the 32-byte offset seen in the checks.
7550 %gep = getelementptr i64, ptr %out, i64 4
7551 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; 64-bit atomicrmw uinc_wrap (seq_cst) on a flat pointer in a function
; declared to return i64. All three check prefixes expect the returning form
; of flat_atomic_inc_x2: a v[0:1] destination operand and the glc modifier.
7555 define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
7556 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret:
7558 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7559 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
7560 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7561 ; GCN1-NEXT: buffer_wbinvl1_vol
7562 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7564 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret:
7566 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7567 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
7568 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7569 ; GCN2-NEXT: buffer_wbinvl1_vol
7570 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7572 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret:
7574 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7575 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
7576 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7577 ; GCN3-NEXT: buffer_wbinvl1_vol
7578 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7579 %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; 64-bit atomicrmw uinc_wrap (seq_cst) at %out + 32 bytes in a function
; declared to return i64. All three check prefixes expect the glc form of
; flat_atomic_inc_x2 with a v[0:1] destination. The GCN1 and GCN2 checks add
; the 32 to the address with a vector add/addc pair; the GCN3 checks use
; offset:32.
7583 define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
7584 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
7586 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7587 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
7588 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7589 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7590 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
7591 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7592 ; GCN1-NEXT: buffer_wbinvl1_vol
7593 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7595 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
7597 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7598 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7599 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7600 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7601 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
7602 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7603 ; GCN2-NEXT: buffer_wbinvl1_vol
7604 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7606 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
7608 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7609 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
7610 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7611 ; GCN3-NEXT: buffer_wbinvl1_vol
7612 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep = %out + 4 * 8 bytes (four i64 elements), i.e. the 32-byte offset seen in the checks.
7613 %gep = getelementptr i64, ptr %out, i64 4
7614 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; 64-bit atomicrmw uinc_wrap (seq_cst), amdgpu_gfx calling convention with the
; pointer and the operand both marked inreg; the result is unused. All three
; check prefixes expect the SGPR inputs (s[4:5] pointer, s[6:7] operand) to be
; copied into VGPRs with v_mov_b32 before a single non-glc flat_atomic_inc_x2.
7618 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
7619 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
7621 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7622 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7623 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7624 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7625 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
7626 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7627 ; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
7628 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7629 ; GCN1-NEXT: buffer_wbinvl1_vol
7630 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7632 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
7634 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7635 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7636 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7637 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7638 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
7639 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7640 ; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
7641 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7642 ; GCN2-NEXT: buffer_wbinvl1_vol
7643 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7645 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
7647 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7648 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7649 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7650 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7651 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7652 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7653 ; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
7654 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7655 ; GCN3-NEXT: buffer_wbinvl1_vol
7656 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7657 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; 64-bit atomicrmw uinc_wrap (seq_cst) at %out + 32 bytes, amdgpu_gfx calling
; convention with inreg arguments; the result is unused. The GCN1 and GCN2
; checks compute the address with s_add_u32/s_addc_u32 into s[34:35] before
; copying it to VGPRs; the GCN3 checks copy s[4:5] unchanged and use offset:32
; on flat_atomic_inc_x2.
7661 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
7662 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
7664 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7665 ; GCN1-NEXT: s_add_u32 s34, s4, 32
7666 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7667 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7668 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7669 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7670 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
7671 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7672 ; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
7673 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7674 ; GCN1-NEXT: buffer_wbinvl1_vol
7675 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7677 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
7679 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7680 ; GCN2-NEXT: s_add_u32 s34, s4, 32
7681 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7682 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7683 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7684 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7685 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
7686 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7687 ; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
7688 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7689 ; GCN2-NEXT: buffer_wbinvl1_vol
7690 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7692 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
7694 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7695 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7696 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7697 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7698 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7699 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7700 ; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
7701 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7702 ; GCN3-NEXT: buffer_wbinvl1_vol
7703 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep = %out + 4 * 8 bytes (four i64 elements), i.e. the 32-byte offset seen in the checks.
7704 %gep = getelementptr i64, ptr %out, i64 4
7705 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; Test: i64 atomicrmw uinc_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, amdgpu_gfx calling convention with both arguments
; marked inreg, no address offset.
; All three assertion prefixes expect the same sequence: the operand is copied
; from s[6:7] into v[0:1], the pointer from s[4:5] into v[2:3], and
; flat_atomic_inc_x2 carries the glc modifier with v[0:1] as its destination.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7709 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
7710 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
7712 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7713 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7714 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7715 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7716 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
7717 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7718 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
7719 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7720 ; GCN1-NEXT: buffer_wbinvl1_vol
7721 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7723 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
7725 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7726 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7727 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7728 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7729 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
7730 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7731 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
7732 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7733 ; GCN2-NEXT: buffer_wbinvl1_vol
7734 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7736 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
7738 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7739 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7740 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7741 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7742 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7743 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7744 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
7745 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7746 ; GCN3-NEXT: buffer_wbinvl1_vol
7747 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Wrapping increment directly at %ptr; %result names the value the atomicrmw
; yields (the memory contents from before the operation).
7748 %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; Test: i64 atomicrmw uinc_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, amdgpu_gfx calling convention with both arguments
; marked inreg, and the address taken 4 i64 elements (32 bytes) past %out.
; For the 32-byte offset, the GCN1 and GCN2 assertions expect an
; s_add_u32/s_addc_u32 pair on s[4:5] before the copy into v[2:3], while the
; GCN3 assertions expect offset:32 folded into the instruction. Every prefix
; expects the glc modifier with v[0:1] as the destination.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7752 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
7753 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
7755 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7756 ; GCN1-NEXT: s_add_u32 s34, s4, 32
7757 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7758 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7759 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7760 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7761 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
7762 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7763 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
7764 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7765 ; GCN1-NEXT: buffer_wbinvl1_vol
7766 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7768 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
7770 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7771 ; GCN2-NEXT: s_add_u32 s34, s4, 32
7772 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7773 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7774 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7775 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7776 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
7777 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7778 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
7779 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7780 ; GCN2-NEXT: buffer_wbinvl1_vol
7781 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7783 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
7785 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7786 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7787 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7788 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7789 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7790 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7791 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
7792 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7793 ; GCN3-NEXT: buffer_wbinvl1_vol
7794 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Step 4 i64 elements (32 bytes) past %out, then do the wrapping increment
; there; %result names the value the atomicrmw yields.
7795 %gep = getelementptr i64, ptr %out, i64 4
7796 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
7800 ; ---------------------------------------------------------------------
7801 ; atomicrmw udec_wrap
7802 ; ---------------------------------------------------------------------
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, result unused,
; default calling convention, no address offset.
; All three assertion prefixes expect the same sequence: flat_atomic_dec_x2
; takes v[0:1] as the address and v[2:3] as the operand, with no glc modifier.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7804 define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
7805 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret:
7807 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7808 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
7809 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7810 ; GCN1-NEXT: buffer_wbinvl1_vol
7811 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7813 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret:
7815 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7816 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
7817 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7818 ; GCN2-NEXT: buffer_wbinvl1_vol
7819 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7821 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret:
7823 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7824 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
7825 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7826 ; GCN3-NEXT: buffer_wbinvl1_vol
7827 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Wrapping decrement directly at %ptr; %tmp0 is never read, which is what
; makes this the "noret" case.
7828 %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, result unused,
; default calling convention, address taken 4 i64 elements (32 bytes) past
; %out.
; For the 32-byte offset, the GCN1 assertions expect v_add_i32_e32 plus
; v_addc_u32_e32 on v[0:1], the GCN2 assertions expect v_add_u32_e32 plus
; v_addc_u32_e32, and the GCN3 assertions expect offset:32 folded into
; flat_atomic_dec_x2 with no separate add.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7832 define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
7833 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
7835 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7836 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
7837 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7838 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7839 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
7840 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7841 ; GCN1-NEXT: buffer_wbinvl1_vol
7842 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7844 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
7846 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7847 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7848 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7849 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7850 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
7851 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7852 ; GCN2-NEXT: buffer_wbinvl1_vol
7853 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7855 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
7857 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7858 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
7859 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7860 ; GCN3-NEXT: buffer_wbinvl1_vol
7861 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Step 4 i64 elements (32 bytes) past %out, then do the wrapping decrement
; there; %tmp0 is never read.
7862 %gep = getelementptr i64, ptr %out, i64 4
7863 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, default calling convention, no address offset.
; All three assertion prefixes expect the same sequence: flat_atomic_dec_x2
; takes v[0:1] as the address and v[2:3] as the operand, carries the glc
; modifier, and writes its destination back into v[0:1].
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7867 define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
7868 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret:
7870 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7871 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
7872 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7873 ; GCN1-NEXT: buffer_wbinvl1_vol
7874 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7876 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret:
7878 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7879 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
7880 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7881 ; GCN2-NEXT: buffer_wbinvl1_vol
7882 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7884 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret:
7886 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7887 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
7888 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7889 ; GCN3-NEXT: buffer_wbinvl1_vol
7890 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Wrapping decrement directly at %ptr; %result names the value the atomicrmw
; yields (the memory contents from before the operation).
7891 %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, default calling convention, address taken 4 i64
; elements (32 bytes) past %out.
; For the 32-byte offset, the GCN1 assertions expect v_add_i32_e32 plus
; v_addc_u32_e32 on v[0:1], the GCN2 assertions expect v_add_u32_e32 plus
; v_addc_u32_e32, and the GCN3 assertions expect offset:32 folded into the
; instruction. Every prefix expects the glc modifier with v[0:1] as the
; destination.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7895 define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
7896 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
7898 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7899 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
7900 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7901 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7902 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
7903 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7904 ; GCN1-NEXT: buffer_wbinvl1_vol
7905 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7907 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
7909 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7910 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7911 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7912 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7913 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
7914 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7915 ; GCN2-NEXT: buffer_wbinvl1_vol
7916 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7918 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
7920 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7921 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
7922 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7923 ; GCN3-NEXT: buffer_wbinvl1_vol
7924 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Step 4 i64 elements (32 bytes) past %out, then do the wrapping decrement
; there; %result names the value the atomicrmw yields.
7925 %gep = getelementptr i64, ptr %out, i64 4
7926 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, result unused,
; amdgpu_gfx calling convention with both arguments marked inreg, no address
; offset.
; All three assertion prefixes expect the same sequence: the operand is copied
; from s[6:7] into v[0:1], the pointer from s[4:5] into v[2:3], and
; flat_atomic_dec_x2 is issued without the glc modifier.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7930 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
7931 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
7933 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7934 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7935 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7936 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7937 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
7938 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7939 ; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
7940 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7941 ; GCN1-NEXT: buffer_wbinvl1_vol
7942 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7944 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
7946 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7947 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7948 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7949 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7950 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
7951 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7952 ; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
7953 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7954 ; GCN2-NEXT: buffer_wbinvl1_vol
7955 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7957 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
7959 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7960 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7961 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7962 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7963 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7964 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7965 ; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
7966 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7967 ; GCN3-NEXT: buffer_wbinvl1_vol
7968 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Wrapping decrement directly at %ptr; %tmp0 is never read, which is what
; makes this the "noret" case.
7969 %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, result unused,
; amdgpu_gfx calling convention with both arguments marked inreg, and the
; address taken 4 i64 elements (32 bytes) past %out.
; The assertions read the pointer from s[4:5] and the operand from s[6:7] and
; copy them into VGPRs before the atomic. For the 32-byte offset, the GCN1 and
; GCN2 assertions expect an s_add_u32/s_addc_u32 pair, while the GCN3
; assertions expect offset:32 folded into flat_atomic_dec_x2.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
7973 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
7974 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
7976 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7977 ; GCN1-NEXT: s_add_u32 s34, s4, 32
7978 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7979 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7980 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7981 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7982 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
7983 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7984 ; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
7985 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7986 ; GCN1-NEXT: buffer_wbinvl1_vol
7987 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7989 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
7991 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7992 ; GCN2-NEXT: s_add_u32 s34, s4, 32
7993 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7994 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7995 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7996 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7997 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
7998 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7999 ; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
8000 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8001 ; GCN2-NEXT: buffer_wbinvl1_vol
8002 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8004 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
8006 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8007 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
8008 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
8009 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
8010 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
8011 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8012 ; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
8013 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8014 ; GCN3-NEXT: buffer_wbinvl1_vol
8015 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Step 4 i64 elements (32 bytes) past %out, then do the wrapping decrement
; there; %tmp0 is never read.
8016 %gep = getelementptr i64, ptr %out, i64 4
8017 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, amdgpu_gfx calling convention with both arguments
; marked inreg, no address offset.
; All three assertion prefixes expect the same sequence: the operand is copied
; from s[6:7] into v[0:1], the pointer from s[4:5] into v[2:3], and
; flat_atomic_dec_x2 carries the glc modifier with v[0:1] as its destination.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
8021 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
8022 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
8024 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8025 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
8026 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
8027 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
8028 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
8029 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8030 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
8031 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8032 ; GCN1-NEXT: buffer_wbinvl1_vol
8033 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8035 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
8037 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8038 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
8039 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
8040 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
8041 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
8042 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8043 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
8044 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8045 ; GCN2-NEXT: buffer_wbinvl1_vol
8046 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8048 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
8050 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8051 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
8052 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
8053 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
8054 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
8055 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8056 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
8057 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8058 ; GCN3-NEXT: buffer_wbinvl1_vol
8059 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Wrapping decrement directly at %ptr; %result names the value the atomicrmw
; yields (the memory contents from before the operation).
8060 %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test: i64 atomicrmw udec_wrap (seq_cst) through a flat pointer, in a function
; declared to return i64, amdgpu_gfx calling convention with both arguments
; marked inreg, and the address taken 4 i64 elements (32 bytes) past %out.
; For the 32-byte offset, the GCN1 and GCN2 assertions expect an
; s_add_u32/s_addc_u32 pair on s[4:5] before the copy into v[2:3], while the
; GCN3 assertions expect offset:32 folded into the instruction. Every prefix
; expects the glc modifier with v[0:1] as the destination.
; The assertion lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script instead of editing them by hand.
8064 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
8065 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
8067 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8068 ; GCN1-NEXT: s_add_u32 s34, s4, 32
8069 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
8070 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
8071 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
8072 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
8073 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
8074 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8075 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
8076 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8077 ; GCN1-NEXT: buffer_wbinvl1_vol
8078 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8080 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
8082 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8083 ; GCN2-NEXT: s_add_u32 s34, s4, 32
8084 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
8085 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
8086 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
8087 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
8088 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
8089 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8090 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
8091 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8092 ; GCN2-NEXT: buffer_wbinvl1_vol
8093 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8095 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
8097 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8098 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
8099 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
8100 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
8101 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
8102 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8103 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
8104 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8105 ; GCN3-NEXT: buffer_wbinvl1_vol
8106 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Step 4 i64 elements (32 bytes) past %out, then do the wrapping decrement
; there; %result names the value the atomicrmw yields.
8107 %gep = getelementptr i64, ptr %out, i64 4
8108 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst