1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
; i32 seq_cst atomicrmw xchg through a default-address-space pointer held in
; VGPRs; the function is declared void. All three run configurations (bonaire,
; tonga, gfx900) are expected to select flat_atomic_swap without glc, then wait
; on vmcnt/lgkmcnt and issue buffer_wbinvl1_vol before returning.
10 define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
11 ; GCN1-LABEL: flat_atomic_xchg_i32_noret:
13 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
15 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16 ; GCN1-NEXT: buffer_wbinvl1_vol
17 ; GCN1-NEXT: s_setpc_b64 s[30:31]
19 ; GCN2-LABEL: flat_atomic_xchg_i32_noret:
21 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
23 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24 ; GCN2-NEXT: buffer_wbinvl1_vol
25 ; GCN2-NEXT: s_setpc_b64 s[30:31]
27 ; GCN3-LABEL: flat_atomic_xchg_i32_noret:
29 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
31 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
32 ; GCN3-NEXT: buffer_wbinvl1_vol
33 ; GCN3-NEXT: s_setpc_b64 s[30:31]
34 %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
; i32 seq_cst atomicrmw xchg (void function) at %out + 4 i32 elements, i.e. a
; 16-byte offset. The bonaire and tonga checks expect the 16 to be added to the
; 64-bit address with a v_add/v_addc pair; the gfx900 checks expect it folded
; into the instruction as offset:16.
38 define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) {
39 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset:
41 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
43 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
44 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
45 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
46 ; GCN1-NEXT: buffer_wbinvl1_vol
47 ; GCN1-NEXT: s_setpc_b64 s[30:31]
49 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset:
51 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
53 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
54 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
55 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
56 ; GCN2-NEXT: buffer_wbinvl1_vol
57 ; GCN2-NEXT: s_setpc_b64 s[30:31]
59 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset:
61 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
63 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
64 ; GCN3-NEXT: buffer_wbinvl1_vol
65 ; GCN3-NEXT: s_setpc_b64 s[30:31]
66 %gep = getelementptr i32, ptr %out, i32 4
67 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
; Value-returning form (function type i32) of the i32 seq_cst atomicrmw xchg.
; All three run configurations are expected to select the glc form of
; flat_atomic_swap with v0 as the destination register, followed by the
; vmcnt/lgkmcnt wait and buffer_wbinvl1_vol.
71 define i32 @flat_atomic_xchg_i32_ret(ptr %ptr, i32 %in) {
72 ; GCN1-LABEL: flat_atomic_xchg_i32_ret:
74 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
76 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
77 ; GCN1-NEXT: buffer_wbinvl1_vol
78 ; GCN1-NEXT: s_setpc_b64 s[30:31]
80 ; GCN2-LABEL: flat_atomic_xchg_i32_ret:
82 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
84 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
85 ; GCN2-NEXT: buffer_wbinvl1_vol
86 ; GCN2-NEXT: s_setpc_b64 s[30:31]
88 ; GCN3-LABEL: flat_atomic_xchg_i32_ret:
90 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
92 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
93 ; GCN3-NEXT: buffer_wbinvl1_vol
94 ; GCN3-NEXT: s_setpc_b64 s[30:31]
95 %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
; Value-returning i32 seq_cst atomicrmw xchg at %out + 4 i32 elements (16
; bytes). The bonaire and tonga checks expect a v_add/v_addc pair to form the
; address; the gfx900 checks expect offset:16 on the instruction. Every
; configuration expects the glc form writing v0.
99 define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) {
100 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset:
102 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
104 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
105 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
106 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
107 ; GCN1-NEXT: buffer_wbinvl1_vol
108 ; GCN1-NEXT: s_setpc_b64 s[30:31]
110 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset:
112 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
114 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
115 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
116 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117 ; GCN2-NEXT: buffer_wbinvl1_vol
118 ; GCN2-NEXT: s_setpc_b64 s[30:31]
120 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset:
122 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
124 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
125 ; GCN3-NEXT: buffer_wbinvl1_vol
126 ; GCN3-NEXT: s_setpc_b64 s[30:31]
127 %gep = getelementptr i32, ptr %out, i32 4
128 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
; amdgpu_gfx variant of the void i32 xchg test with both arguments marked
; inreg. The checks expect the pointer in s4/s5 and the value in s6, each
; copied to a VGPR with v_mov_b32 before the non-glc flat_atomic_swap; the
; expected sequence is the same for all three run configurations.
132 define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
133 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_scalar:
135 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
137 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
138 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
139 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
140 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
141 ; GCN1-NEXT: buffer_wbinvl1_vol
142 ; GCN1-NEXT: s_setpc_b64 s[30:31]
144 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_scalar:
146 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
148 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
149 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
150 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
151 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
152 ; GCN2-NEXT: buffer_wbinvl1_vol
153 ; GCN2-NEXT: s_setpc_b64 s[30:31]
155 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_scalar:
157 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
159 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
160 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
161 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
162 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
163 ; GCN3-NEXT: buffer_wbinvl1_vol
164 ; GCN3-NEXT: s_setpc_b64 s[30:31]
165 %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
; amdgpu_gfx/inreg variant of the void i32 xchg test at a 16-byte offset
; (4 i32 elements past %out). The bonaire and tonga checks expect the offset
; to be added on the scalar side (s_add_u32/s_addc_u32 into s34/s35) before
; the copy to VGPRs; the gfx900 checks copy s4/s5 unchanged and use offset:16.
169 define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
170 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
172 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173 ; GCN1-NEXT: s_add_u32 s34, s4, 16
174 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
175 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
176 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
177 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
178 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
179 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
180 ; GCN1-NEXT: buffer_wbinvl1_vol
181 ; GCN1-NEXT: s_setpc_b64 s[30:31]
183 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
185 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186 ; GCN2-NEXT: s_add_u32 s34, s4, 16
187 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
188 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
189 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
190 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
191 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
192 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
193 ; GCN2-NEXT: buffer_wbinvl1_vol
194 ; GCN2-NEXT: s_setpc_b64 s[30:31]
196 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
198 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
200 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
201 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
202 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
203 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
204 ; GCN3-NEXT: buffer_wbinvl1_vol
205 ; GCN3-NEXT: s_setpc_b64 s[30:31]
206 %gep = getelementptr i32, ptr %out, i32 4
207 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
; amdgpu_gfx/inreg variant of the value-returning i32 xchg test. The checks
; expect s4/s5 (pointer) and s6 (value) to be copied into v0..v2, followed by
; the glc form of flat_atomic_swap writing v0; the expected sequence is the
; same for all three run configurations.
211 define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
212 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_scalar:
214 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
216 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
217 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
218 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
219 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
220 ; GCN1-NEXT: buffer_wbinvl1_vol
221 ; GCN1-NEXT: s_setpc_b64 s[30:31]
223 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_scalar:
225 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
227 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
228 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
229 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
230 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
231 ; GCN2-NEXT: buffer_wbinvl1_vol
232 ; GCN2-NEXT: s_setpc_b64 s[30:31]
234 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_scalar:
236 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
238 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
239 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
240 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
241 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
242 ; GCN3-NEXT: buffer_wbinvl1_vol
243 ; GCN3-NEXT: s_setpc_b64 s[30:31]
244 %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
; amdgpu_gfx/inreg variant of the value-returning i32 xchg test at a 16-byte
; offset (4 i32 elements past %out). The bonaire and tonga checks expect a
; scalar s_add_u32/s_addc_u32 into s34/s35 before the VGPR copies; the gfx900
; checks expect offset:16 on the instruction. All expect the glc form.
248 define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
249 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
251 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GCN1-NEXT: s_add_u32 s34, s4, 16
253 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
254 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
255 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
256 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
257 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
258 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
259 ; GCN1-NEXT: buffer_wbinvl1_vol
260 ; GCN1-NEXT: s_setpc_b64 s[30:31]
262 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
264 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; GCN2-NEXT: s_add_u32 s34, s4, 16
266 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
267 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
268 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
269 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
270 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
271 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
272 ; GCN2-NEXT: buffer_wbinvl1_vol
273 ; GCN2-NEXT: s_setpc_b64 s[30:31]
275 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
277 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
278 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
279 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
280 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
281 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
282 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
283 ; GCN3-NEXT: buffer_wbinvl1_vol
284 ; GCN3-NEXT: s_setpc_b64 s[30:31]
285 %gep = getelementptr i32, ptr %out, i32 4
286 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
; Void i32 xchg at a 16-byte offset where the atomicrmw carries
; !amdgpu.no.remote.memory metadata (node !0 is defined outside this part of
; the file) and the gep index is i64. The expected instruction sequences match
; those of flat_atomic_xchg_i32_noret_offset, which has no such metadata.
290 define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
291 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
293 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
295 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
296 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
297 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GCN1-NEXT: buffer_wbinvl1_vol
299 ; GCN1-NEXT: s_setpc_b64 s[30:31]
301 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
303 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
305 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
306 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
307 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
308 ; GCN2-NEXT: buffer_wbinvl1_vol
309 ; GCN2-NEXT: s_setpc_b64 s[30:31]
311 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory:
313 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
315 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
316 ; GCN3-NEXT: buffer_wbinvl1_vol
317 ; GCN3-NEXT: s_setpc_b64 s[30:31]
318 %gep = getelementptr i32, ptr %out, i64 4
319 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning i32 xchg at a 16-byte offset where the atomicrmw carries
; !amdgpu.no.remote.memory metadata (node !0 is defined outside this part of
; the file) and the gep index is i64. The expected instruction sequences match
; those of flat_atomic_xchg_i32_ret_offset, which has no such metadata.
323 define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
324 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
326 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
328 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
329 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
330 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
331 ; GCN1-NEXT: buffer_wbinvl1_vol
332 ; GCN1-NEXT: s_setpc_b64 s[30:31]
334 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
336 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
338 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
339 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
340 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
341 ; GCN2-NEXT: buffer_wbinvl1_vol
342 ; GCN2-NEXT: s_setpc_b64 s[30:31]
344 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory:
346 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
348 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
349 ; GCN3-NEXT: buffer_wbinvl1_vol
350 ; GCN3-NEXT: s_setpc_b64 s[30:31]
351 %gep = getelementptr i32, ptr %out, i64 4
352 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
356 ; ---------------------------------------------------------------------
358 ; ---------------------------------------------------------------------
; float seq_cst atomicrmw xchg through a default-address-space pointer in a
; void function. All three run configurations are expected to select the
; non-glc flat_atomic_swap (the same instruction the i32 xchg tests expect),
; followed by the vmcnt/lgkmcnt wait and buffer_wbinvl1_vol.
360 define void @flat_atomic_xchg_f32_noret(ptr %ptr, float %in) {
361 ; GCN1-LABEL: flat_atomic_xchg_f32_noret:
363 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
365 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
366 ; GCN1-NEXT: buffer_wbinvl1_vol
367 ; GCN1-NEXT: s_setpc_b64 s[30:31]
369 ; GCN2-LABEL: flat_atomic_xchg_f32_noret:
371 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
373 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
374 ; GCN2-NEXT: buffer_wbinvl1_vol
375 ; GCN2-NEXT: s_setpc_b64 s[30:31]
377 ; GCN3-LABEL: flat_atomic_xchg_f32_noret:
379 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
381 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
382 ; GCN3-NEXT: buffer_wbinvl1_vol
383 ; GCN3-NEXT: s_setpc_b64 s[30:31]
384 %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
; float seq_cst atomicrmw xchg (void function) at %out + 4 float elements,
; i.e. a 16-byte offset. The bonaire and tonga checks expect a v_add/v_addc
; pair to add 16 to the address; the gfx900 checks expect offset:16 on the
; flat_atomic_swap instruction.
388 define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) {
389 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset:
391 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
393 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
394 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
395 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
396 ; GCN1-NEXT: buffer_wbinvl1_vol
397 ; GCN1-NEXT: s_setpc_b64 s[30:31]
399 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset:
401 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
402 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
403 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
404 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
405 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
406 ; GCN2-NEXT: buffer_wbinvl1_vol
407 ; GCN2-NEXT: s_setpc_b64 s[30:31]
409 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset:
411 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
413 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
414 ; GCN3-NEXT: buffer_wbinvl1_vol
415 ; GCN3-NEXT: s_setpc_b64 s[30:31]
416 %gep = getelementptr float, ptr %out, i32 4
417 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
; Value-returning form (function type float) of the float seq_cst atomicrmw
; xchg. All three run configurations are expected to select the glc form of
; flat_atomic_swap with v0 as the destination register, followed by the
; vmcnt/lgkmcnt wait and buffer_wbinvl1_vol.
421 define float @flat_atomic_xchg_f32_ret(ptr %ptr, float %in) {
422 ; GCN1-LABEL: flat_atomic_xchg_f32_ret:
424 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
426 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
427 ; GCN1-NEXT: buffer_wbinvl1_vol
428 ; GCN1-NEXT: s_setpc_b64 s[30:31]
430 ; GCN2-LABEL: flat_atomic_xchg_f32_ret:
432 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
434 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435 ; GCN2-NEXT: buffer_wbinvl1_vol
436 ; GCN2-NEXT: s_setpc_b64 s[30:31]
438 ; GCN3-LABEL: flat_atomic_xchg_f32_ret:
440 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
442 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
443 ; GCN3-NEXT: buffer_wbinvl1_vol
444 ; GCN3-NEXT: s_setpc_b64 s[30:31]
445 %result = atomicrmw xchg ptr %ptr, float %in seq_cst
; Value-returning float seq_cst atomicrmw xchg at %out + 4 float elements (16
; bytes). The bonaire and tonga checks expect a v_add/v_addc pair to form the
; address; the gfx900 checks expect offset:16 on the instruction. Every
; configuration expects the glc form writing v0.
449 define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) {
450 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset:
452 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
454 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
455 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
456 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
457 ; GCN1-NEXT: buffer_wbinvl1_vol
458 ; GCN1-NEXT: s_setpc_b64 s[30:31]
460 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset:
462 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
464 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
465 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
466 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
467 ; GCN2-NEXT: buffer_wbinvl1_vol
468 ; GCN2-NEXT: s_setpc_b64 s[30:31]
470 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset:
472 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
474 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
475 ; GCN3-NEXT: buffer_wbinvl1_vol
476 ; GCN3-NEXT: s_setpc_b64 s[30:31]
477 %gep = getelementptr float, ptr %out, i32 4
478 %result = atomicrmw xchg ptr %gep, float %in seq_cst
; amdgpu_gfx variant of the void float xchg test with both arguments marked
; inreg. The checks expect the pointer in s4/s5 and the value in s6, each
; copied to a VGPR with v_mov_b32 before the non-glc flat_atomic_swap; the
; expected sequence is the same for all three run configurations.
482 define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float inreg %in) {
483 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_scalar:
485 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
487 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
488 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
489 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
490 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
491 ; GCN1-NEXT: buffer_wbinvl1_vol
492 ; GCN1-NEXT: s_setpc_b64 s[30:31]
494 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_scalar:
496 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
498 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
499 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
500 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
501 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
502 ; GCN2-NEXT: buffer_wbinvl1_vol
503 ; GCN2-NEXT: s_setpc_b64 s[30:31]
505 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_scalar:
507 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
509 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
510 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
511 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
512 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
513 ; GCN3-NEXT: buffer_wbinvl1_vol
514 ; GCN3-NEXT: s_setpc_b64 s[30:31]
515 %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
; amdgpu_gfx/inreg variant of the void float xchg test at a 16-byte offset
; (4 float elements past %out). The bonaire and tonga checks expect the offset
; to be added on the scalar side (s_add_u32/s_addc_u32 into s34/s35) before
; the copy to VGPRs; the gfx900 checks copy s4/s5 unchanged and use offset:16.
519 define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out, float inreg %in) {
520 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
522 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523 ; GCN1-NEXT: s_add_u32 s34, s4, 16
524 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
525 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
526 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
527 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
528 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
529 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
530 ; GCN1-NEXT: buffer_wbinvl1_vol
531 ; GCN1-NEXT: s_setpc_b64 s[30:31]
533 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
535 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536 ; GCN2-NEXT: s_add_u32 s34, s4, 16
537 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
538 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
539 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
540 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
541 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
542 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
543 ; GCN2-NEXT: buffer_wbinvl1_vol
544 ; GCN2-NEXT: s_setpc_b64 s[30:31]
546 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
548 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
550 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
551 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
552 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
553 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
554 ; GCN3-NEXT: buffer_wbinvl1_vol
555 ; GCN3-NEXT: s_setpc_b64 s[30:31]
556 %gep = getelementptr float, ptr %out, i32 4
557 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
; amdgpu_gfx/inreg variant of the value-returning float xchg test. The checks
; expect s4/s5 (pointer) and s6 (value) to be copied into v0..v2, followed by
; the glc form of flat_atomic_swap writing v0; the expected sequence is the
; same for all three run configurations.
561 define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float inreg %in) {
562 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_scalar:
564 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
566 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
567 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
568 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
569 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
570 ; GCN1-NEXT: buffer_wbinvl1_vol
571 ; GCN1-NEXT: s_setpc_b64 s[30:31]
573 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_scalar:
575 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
577 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
578 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
579 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
580 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
581 ; GCN2-NEXT: buffer_wbinvl1_vol
582 ; GCN2-NEXT: s_setpc_b64 s[30:31]
584 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_scalar:
586 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
588 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
589 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
590 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
591 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
592 ; GCN3-NEXT: buffer_wbinvl1_vol
593 ; GCN3-NEXT: s_setpc_b64 s[30:31]
594 %result = atomicrmw xchg ptr %ptr, float %in seq_cst
; amdgpu_gfx/inreg variant of the value-returning float xchg test at a 16-byte
; offset (4 float elements past %out). The bonaire and tonga checks expect a
; scalar s_add_u32/s_addc_u32 into s34/s35 before the VGPR copies; the gfx900
; checks expect offset:16 on the instruction. All expect the glc form.
598 define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, float inreg %in) {
599 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
601 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602 ; GCN1-NEXT: s_add_u32 s34, s4, 16
603 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
604 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
605 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
606 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
607 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
608 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
609 ; GCN1-NEXT: buffer_wbinvl1_vol
610 ; GCN1-NEXT: s_setpc_b64 s[30:31]
612 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
614 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615 ; GCN2-NEXT: s_add_u32 s34, s4, 16
616 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
617 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
618 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
619 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
620 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
621 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
622 ; GCN2-NEXT: buffer_wbinvl1_vol
623 ; GCN2-NEXT: s_setpc_b64 s[30:31]
625 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
627 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
629 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
630 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
631 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
632 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
633 ; GCN3-NEXT: buffer_wbinvl1_vol
634 ; GCN3-NEXT: s_setpc_b64 s[30:31]
635 %gep = getelementptr float, ptr %out, i32 4
636 %result = atomicrmw xchg ptr %gep, float %in seq_cst
; Void float xchg at a 16-byte offset where the atomicrmw carries
; !amdgpu.no.remote.memory metadata (node !0 is defined outside this part of
; the file) and the gep index is i64. The expected instruction sequences match
; those of flat_atomic_xchg_f32_noret_offset, which has no such metadata.
640 define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
641 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
643 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
644 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
645 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
646 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
647 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
648 ; GCN1-NEXT: buffer_wbinvl1_vol
649 ; GCN1-NEXT: s_setpc_b64 s[30:31]
651 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
653 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
655 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
656 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
657 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
658 ; GCN2-NEXT: buffer_wbinvl1_vol
659 ; GCN2-NEXT: s_setpc_b64 s[30:31]
661 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory:
663 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
664 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
665 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
666 ; GCN3-NEXT: buffer_wbinvl1_vol
667 ; GCN3-NEXT: s_setpc_b64 s[30:31]
668 %gep = getelementptr float, ptr %out, i64 4
669 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning float xchg at a 16-byte offset where the atomicrmw carries
; !amdgpu.no.remote.memory metadata (node !0 is defined outside this part of
; the file) and the gep index is i64. The expected instruction sequences match
; those of flat_atomic_xchg_f32_ret_offset, which has no such metadata.
673 define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out, float %in) {
674 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
676 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
678 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
679 ; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
680 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
681 ; GCN1-NEXT: buffer_wbinvl1_vol
682 ; GCN1-NEXT: s_setpc_b64 s[30:31]
684 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
686 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
687 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
688 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
689 ; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
690 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
691 ; GCN2-NEXT: buffer_wbinvl1_vol
692 ; GCN2-NEXT: s_setpc_b64 s[30:31]
694 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory:
696 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc
698 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
699 ; GCN3-NEXT: buffer_wbinvl1_vol
700 ; GCN3-NEXT: s_setpc_b64 s[30:31]
701 %gep = getelementptr float, ptr %out, i64 4
702 %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
706 ; ---------------------------------------------------------------------
708 ; ---------------------------------------------------------------------
; i32 seq_cst atomicrmw add through a default-address-space pointer held in
; VGPRs; the function is declared void. All three run configurations (bonaire,
; tonga, gfx900) are expected to select flat_atomic_add without glc, then wait
; on vmcnt/lgkmcnt and issue buffer_wbinvl1_vol before returning.
710 define void @flat_atomic_add_i32_noret(ptr %ptr, i32 %in) {
711 ; GCN1-LABEL: flat_atomic_add_i32_noret:
713 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
714 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
715 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
716 ; GCN1-NEXT: buffer_wbinvl1_vol
717 ; GCN1-NEXT: s_setpc_b64 s[30:31]
719 ; GCN2-LABEL: flat_atomic_add_i32_noret:
721 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
722 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
723 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
724 ; GCN2-NEXT: buffer_wbinvl1_vol
725 ; GCN2-NEXT: s_setpc_b64 s[30:31]
727 ; GCN3-LABEL: flat_atomic_add_i32_noret:
729 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; GCN3-NEXT: flat_atomic_add v[0:1], v2
731 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
732 ; GCN3-NEXT: buffer_wbinvl1_vol
733 ; GCN3-NEXT: s_setpc_b64 s[30:31]
734 %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
; i32 seq_cst atomicrmw add (void function) at %out + 4 i32 elements, i.e. a
; 16-byte offset. The bonaire and tonga checks expect the 16 to be added to the
; 64-bit address with a v_add/v_addc pair; the gfx900 checks expect it folded
; into flat_atomic_add as offset:16.
738 define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) {
739 ; GCN1-LABEL: flat_atomic_add_i32_noret_offset:
741 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
743 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
744 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
745 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
746 ; GCN1-NEXT: buffer_wbinvl1_vol
747 ; GCN1-NEXT: s_setpc_b64 s[30:31]
749 ; GCN2-LABEL: flat_atomic_add_i32_noret_offset:
751 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
753 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
754 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
755 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
756 ; GCN2-NEXT: buffer_wbinvl1_vol
757 ; GCN2-NEXT: s_setpc_b64 s[30:31]
759 ; GCN3-LABEL: flat_atomic_add_i32_noret_offset:
761 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
763 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
764 ; GCN3-NEXT: buffer_wbinvl1_vol
765 ; GCN3-NEXT: s_setpc_b64 s[30:31]
766 %gep = getelementptr i32, ptr %out, i32 4
767 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
; Value-returning form (function type i32) of the i32 seq_cst atomicrmw add.
; All three run configurations are expected to select the glc form of
; flat_atomic_add with v0 as the destination register, followed by the
; vmcnt/lgkmcnt wait and buffer_wbinvl1_vol.
771 define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) {
772 ; GCN1-LABEL: flat_atomic_add_i32_ret:
774 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775 ; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc
776 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
777 ; GCN1-NEXT: buffer_wbinvl1_vol
778 ; GCN1-NEXT: s_setpc_b64 s[30:31]
780 ; GCN2-LABEL: flat_atomic_add_i32_ret:
782 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783 ; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc
784 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
785 ; GCN2-NEXT: buffer_wbinvl1_vol
786 ; GCN2-NEXT: s_setpc_b64 s[30:31]
788 ; GCN3-LABEL: flat_atomic_add_i32_ret:
790 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
791 ; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc
792 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
793 ; GCN3-NEXT: buffer_wbinvl1_vol
794 ; GCN3-NEXT: s_setpc_b64 s[30:31]
795 %result = atomicrmw add ptr %ptr, i32 %in seq_cst
; Value-returning i32 seq_cst atomicrmw add at %out + 4 i32 elements (16
; bytes). The bonaire and tonga checks expect a v_add/v_addc pair to form the
; address; the gfx900 checks expect offset:16 on the instruction. Every
; configuration expects the glc form of flat_atomic_add writing v0.
799 define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) {
800 ; GCN1-LABEL: flat_atomic_add_i32_ret_offset:
802 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
803 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
804 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
805 ; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc
806 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
807 ; GCN1-NEXT: buffer_wbinvl1_vol
808 ; GCN1-NEXT: s_setpc_b64 s[30:31]
810 ; GCN2-LABEL: flat_atomic_add_i32_ret_offset:
812 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
813 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
814 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
815 ; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc
816 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
817 ; GCN2-NEXT: buffer_wbinvl1_vol
818 ; GCN2-NEXT: s_setpc_b64 s[30:31]
820 ; GCN3-LABEL: flat_atomic_add_i32_ret_offset:
822 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823 ; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc
824 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
825 ; GCN3-NEXT: buffer_wbinvl1_vol
826 ; GCN3-NEXT: s_setpc_b64 s[30:31]
827 %gep = getelementptr i32, ptr %out, i32 4
828 %result = atomicrmw add ptr %gep, i32 %in seq_cst
; amdgpu_gfx variant of the void i32 add test with both arguments marked
; inreg. The checks expect the pointer in s4/s5 and the value in s6, each
; copied to a VGPR with v_mov_b32 before the non-glc flat_atomic_add; the
; expected sequence is the same for all three run configurations.
832 define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
833 ; GCN1-LABEL: flat_atomic_add_i32_noret_scalar:
835 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
836 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
837 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
838 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
839 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
840 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
841 ; GCN1-NEXT: buffer_wbinvl1_vol
842 ; GCN1-NEXT: s_setpc_b64 s[30:31]
844 ; GCN2-LABEL: flat_atomic_add_i32_noret_scalar:
846 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
847 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
848 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
849 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
850 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
851 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
852 ; GCN2-NEXT: buffer_wbinvl1_vol
853 ; GCN2-NEXT: s_setpc_b64 s[30:31]
855 ; GCN3-LABEL: flat_atomic_add_i32_noret_scalar:
857 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
858 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
859 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
860 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
861 ; GCN3-NEXT: flat_atomic_add v[0:1], v2
862 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
863 ; GCN3-NEXT: buffer_wbinvl1_vol
864 ; GCN3-NEXT: s_setpc_b64 s[30:31]
865 %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
; amdgpu_gfx/inreg variant of the void i32 add test at a 16-byte offset
; (4 i32 elements past %out). The bonaire and tonga checks expect the offset
; to be added on the scalar side (s_add_u32/s_addc_u32 into s34/s35) before
; the copy to VGPRs; the gfx900 checks copy s4/s5 unchanged and use offset:16.
869 define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
870 ; GCN1-LABEL: flat_atomic_add_i32_noret_offset_scalar:
872 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873 ; GCN1-NEXT: s_add_u32 s34, s4, 16
874 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
875 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
876 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
877 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
878 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
879 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
880 ; GCN1-NEXT: buffer_wbinvl1_vol
881 ; GCN1-NEXT: s_setpc_b64 s[30:31]
883 ; GCN2-LABEL: flat_atomic_add_i32_noret_offset_scalar:
885 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886 ; GCN2-NEXT: s_add_u32 s34, s4, 16
887 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
888 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
889 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
890 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
891 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
892 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
893 ; GCN2-NEXT: buffer_wbinvl1_vol
894 ; GCN2-NEXT: s_setpc_b64 s[30:31]
896 ; GCN3-LABEL: flat_atomic_add_i32_noret_offset_scalar:
898 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
900 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
901 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
902 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
903 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
904 ; GCN3-NEXT: buffer_wbinvl1_vol
905 ; GCN3-NEXT: s_setpc_b64 s[30:31]
906 %gep = getelementptr i32, ptr %out, i32 4
907 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw add` with `inreg` arguments under the amdgpu_gfx
; calling convention, in a function returning i32; the atomic's result is
; bound to %result.
; Expected on bonaire, tonga and gfx900 alike: s4/s5/s6 are copied into
; v0/v1/v2, then the glc form of flat_atomic_add writes its result to v0.
911 define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
912 ; GCN1-LABEL: flat_atomic_add_i32_ret_scalar:
914 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
915 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
916 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
917 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
918 ; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc
919 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
920 ; GCN1-NEXT: buffer_wbinvl1_vol
921 ; GCN1-NEXT: s_setpc_b64 s[30:31]
923 ; GCN2-LABEL: flat_atomic_add_i32_ret_scalar:
925 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
926 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
927 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
928 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
929 ; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc
930 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
931 ; GCN2-NEXT: buffer_wbinvl1_vol
932 ; GCN2-NEXT: s_setpc_b64 s[30:31]
934 ; GCN3-LABEL: flat_atomic_add_i32_ret_scalar:
936 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
938 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
939 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
940 ; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc
941 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
942 ; GCN3-NEXT: buffer_wbinvl1_vol
943 ; GCN3-NEXT: s_setpc_b64 s[30:31]
944 %result = atomicrmw add ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw add` at %out + 4 x i32 (16 bytes), with `inreg`
; arguments under the amdgpu_gfx calling convention, in a function returning
; i32; the atomic's result is bound to %result.
; Expected on bonaire and tonga: s_add_u32/s_addc_u32 form the address in
; s34/s35, followed by the glc form of flat_atomic_add writing v0.
; Expected on gfx900: s4/s5 are used unchanged with offset:16 glc.
948 define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
949 ; GCN1-LABEL: flat_atomic_add_i32_ret_offset_scalar:
951 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
952 ; GCN1-NEXT: s_add_u32 s34, s4, 16
953 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
954 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
955 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
956 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
957 ; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc
958 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959 ; GCN1-NEXT: buffer_wbinvl1_vol
960 ; GCN1-NEXT: s_setpc_b64 s[30:31]
962 ; GCN2-LABEL: flat_atomic_add_i32_ret_offset_scalar:
964 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
965 ; GCN2-NEXT: s_add_u32 s34, s4, 16
966 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
967 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
968 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
969 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
970 ; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc
971 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
972 ; GCN2-NEXT: buffer_wbinvl1_vol
973 ; GCN2-NEXT: s_setpc_b64 s[30:31]
975 ; GCN3-LABEL: flat_atomic_add_i32_ret_offset_scalar:
977 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
979 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
980 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
981 ; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc
982 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
983 ; GCN3-NEXT: buffer_wbinvl1_vol
984 ; GCN3-NEXT: s_setpc_b64 s[30:31]
985 %gep = getelementptr i32, ptr %out, i32 4
986 %result = atomicrmw add ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw add` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata; the function returns void.
; (Metadata node !0 is defined outside this function.)
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; a flat_atomic_add without glc. Expected on gfx900: a single flat_atomic_add
; with offset:16.
990 define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
991 ; GCN1-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
993 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
995 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
996 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
997 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
998 ; GCN1-NEXT: buffer_wbinvl1_vol
999 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1001 ; GCN2-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1003 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1004 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1005 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1006 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
1007 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1008 ; GCN2-NEXT: buffer_wbinvl1_vol
1009 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1011 ; GCN3-LABEL: flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory:
1013 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1014 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
1015 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1016 ; GCN3-NEXT: buffer_wbinvl1_vol
1017 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1018 %gep = getelementptr i32, ptr %out, i64 4
1019 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: seq_cst `atomicrmw add` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata, in a function returning i32;
; the atomic's result is bound to %result. (Metadata node !0 is defined
; outside this function.)
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; the glc form of flat_atomic_add writing v0. Expected on gfx900: a single
; flat_atomic_add with offset:16 glc.
1023 define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1024 ; GCN1-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1026 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1027 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1028 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1029 ; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1030 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1031 ; GCN1-NEXT: buffer_wbinvl1_vol
1032 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1034 ; GCN2-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1036 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1038 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1039 ; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc
1040 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1041 ; GCN2-NEXT: buffer_wbinvl1_vol
1042 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1044 ; GCN3-LABEL: flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory:
1046 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047 ; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc
1048 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1049 ; GCN3-NEXT: buffer_wbinvl1_vol
1050 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1051 %gep = getelementptr i32, ptr %out, i64 4
1052 %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1056 ; ---------------------------------------------------------------------
1058 ; ---------------------------------------------------------------------
; Test: seq_cst `atomicrmw sub` directly on the incoming flat pointer; the
; function returns void.
; Expected on bonaire, tonga and gfx900 alike: one flat_atomic_sub on v[0:1]
; with v2 as operand and no glc.
1060 define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
1061 ; GCN1-LABEL: flat_atomic_sub_i32_noret:
1063 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1065 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1066 ; GCN1-NEXT: buffer_wbinvl1_vol
1067 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1069 ; GCN2-LABEL: flat_atomic_sub_i32_noret:
1071 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1073 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1074 ; GCN2-NEXT: buffer_wbinvl1_vol
1075 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1077 ; GCN3-LABEL: flat_atomic_sub_i32_noret:
1079 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2
1081 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1082 ; GCN3-NEXT: buffer_wbinvl1_vol
1083 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1084 %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes); the function
; returns void.
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; a flat_atomic_sub without glc. Expected on gfx900: a single flat_atomic_sub
; with offset:16.
1088 define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
1089 ; GCN1-LABEL: flat_atomic_sub_i32_noret_offset:
1091 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1092 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1093 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1094 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1095 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1096 ; GCN1-NEXT: buffer_wbinvl1_vol
1097 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1099 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset:
1101 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1103 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1104 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1105 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1106 ; GCN2-NEXT: buffer_wbinvl1_vol
1107 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1109 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset:
1111 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
1113 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1114 ; GCN3-NEXT: buffer_wbinvl1_vol
1115 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1116 %gep = getelementptr i32, ptr %out, i32 4
1117 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` directly on the incoming flat pointer, in a
; function returning i32; the atomic's result is bound to %result.
; Expected on bonaire, tonga and gfx900 alike: the glc form of
; flat_atomic_sub, writing its result to v0.
1121 define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
1122 ; GCN1-LABEL: flat_atomic_sub_i32_ret:
1124 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125 ; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1126 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1127 ; GCN1-NEXT: buffer_wbinvl1_vol
1128 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1130 ; GCN2-LABEL: flat_atomic_sub_i32_ret:
1132 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1133 ; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1134 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1135 ; GCN2-NEXT: buffer_wbinvl1_vol
1136 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1138 ; GCN3-LABEL: flat_atomic_sub_i32_ret:
1140 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1141 ; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1142 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1143 ; GCN3-NEXT: buffer_wbinvl1_vol
1144 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1145 %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes), in a function
; returning i32; the atomic's result is bound to %result.
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; the glc form of flat_atomic_sub writing v0. Expected on gfx900: a single
; flat_atomic_sub with offset:16 glc.
1149 define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
1150 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset:
1152 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1153 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1154 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1155 ; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1156 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1157 ; GCN1-NEXT: buffer_wbinvl1_vol
1158 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1160 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset:
1162 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1163 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1164 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1165 ; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1166 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1167 ; GCN2-NEXT: buffer_wbinvl1_vol
1168 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1170 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset:
1172 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1173 ; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1174 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1175 ; GCN3-NEXT: buffer_wbinvl1_vol
1176 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1177 %gep = getelementptr i32, ptr %out, i32 4
1178 %result = atomicrmw sub ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` on a flat pointer, with both the pointer and
; the operand passed `inreg` under the amdgpu_gfx calling convention; the
; function returns void.
; Expected on bonaire, tonga and gfx900 alike: s4/s5 (address) and s6 (operand)
; are copied into v0/v1/v2 before a flat_atomic_sub without glc.
1182 define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
1183 ; GCN1-LABEL: flat_atomic_sub_i32_noret_scalar:
1185 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1187 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1188 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1189 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1190 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1191 ; GCN1-NEXT: buffer_wbinvl1_vol
1192 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1194 ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar:
1196 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1198 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1199 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1200 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1201 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1202 ; GCN2-NEXT: buffer_wbinvl1_vol
1203 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1205 ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar:
1207 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1209 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1210 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1211 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2
1212 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1213 ; GCN3-NEXT: buffer_wbinvl1_vol
1214 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1215 %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes), with `inreg`
; arguments under the amdgpu_gfx calling convention; the function returns void.
; Expected on bonaire and tonga: the 16 is added to s4/s5 with
; s_add_u32/s_addc_u32 into s34/s35 before the copy to v0/v1.
; Expected on gfx900: s4/s5 are copied unchanged and the 16 appears as
; offset:16 on the flat_atomic_sub.
1219 define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1220 ; GCN1-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1222 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1223 ; GCN1-NEXT: s_add_u32 s34, s4, 16
1224 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1225 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1226 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1227 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1228 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1229 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1230 ; GCN1-NEXT: buffer_wbinvl1_vol
1231 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1233 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1235 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236 ; GCN2-NEXT: s_add_u32 s34, s4, 16
1237 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1238 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1239 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1240 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1241 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1242 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1243 ; GCN2-NEXT: buffer_wbinvl1_vol
1244 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1246 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
1248 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1249 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1250 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1251 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1252 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
1253 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1254 ; GCN3-NEXT: buffer_wbinvl1_vol
1255 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1256 %gep = getelementptr i32, ptr %out, i32 4
1257 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` with `inreg` arguments under the amdgpu_gfx
; calling convention, in a function returning i32; the atomic's result is
; bound to %result.
; Expected on bonaire, tonga and gfx900 alike: s4/s5/s6 are copied into
; v0/v1/v2, then the glc form of flat_atomic_sub writes its result to v0.
1261 define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
1262 ; GCN1-LABEL: flat_atomic_sub_i32_ret_scalar:
1264 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1265 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1266 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1267 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1268 ; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1269 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1270 ; GCN1-NEXT: buffer_wbinvl1_vol
1271 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1273 ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar:
1275 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1276 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1277 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1278 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1279 ; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1280 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1281 ; GCN2-NEXT: buffer_wbinvl1_vol
1282 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1284 ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar:
1286 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1287 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1288 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1289 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1290 ; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1291 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1292 ; GCN3-NEXT: buffer_wbinvl1_vol
1293 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1294 %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes), with `inreg`
; arguments under the amdgpu_gfx calling convention, in a function returning
; i32; the atomic's result is bound to %result.
; Expected on bonaire and tonga: s_add_u32/s_addc_u32 form the address in
; s34/s35, followed by the glc form of flat_atomic_sub writing v0.
; Expected on gfx900: s4/s5 are used unchanged with offset:16 glc.
1298 define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1299 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1301 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302 ; GCN1-NEXT: s_add_u32 s34, s4, 16
1303 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1304 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1305 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1306 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1307 ; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1308 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1309 ; GCN1-NEXT: buffer_wbinvl1_vol
1310 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1312 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1314 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315 ; GCN2-NEXT: s_add_u32 s34, s4, 16
1316 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1317 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1318 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1319 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1320 ; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1321 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1322 ; GCN2-NEXT: buffer_wbinvl1_vol
1323 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1325 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
1327 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1328 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1329 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1330 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1331 ; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1332 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1333 ; GCN3-NEXT: buffer_wbinvl1_vol
1334 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1335 %gep = getelementptr i32, ptr %out, i32 4
1336 %result = atomicrmw sub ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata; the function returns void.
; (Metadata node !0 is defined outside this function.)
; The expected instructions are the same as for
; @flat_atomic_sub_i32_noret_offset: v_add/v_addc + flat_atomic_sub on bonaire
; and tonga, a single flat_atomic_sub with offset:16 on gfx900.
1340 define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1341 ; GCN1-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1343 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1344 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1345 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1346 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1347 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1348 ; GCN1-NEXT: buffer_wbinvl1_vol
1349 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1351 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1353 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1354 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1355 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1356 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1357 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1358 ; GCN2-NEXT: buffer_wbinvl1_vol
1359 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1361 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory:
1363 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1364 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
1365 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1366 ; GCN3-NEXT: buffer_wbinvl1_vol
1367 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1368 %gep = getelementptr i32, ptr %out, i64 4
1369 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: seq_cst `atomicrmw sub` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata, in a function returning i32;
; the atomic's result is bound to %result. (Metadata node !0 is defined
; outside this function.)
; The expected instructions are the same as for
; @flat_atomic_sub_i32_ret_offset: v_add/v_addc + glc flat_atomic_sub on
; bonaire and tonga, a single flat_atomic_sub with offset:16 glc on gfx900.
1373 define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1374 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1376 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1377 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1378 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1379 ; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1380 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1381 ; GCN1-NEXT: buffer_wbinvl1_vol
1382 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1384 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1386 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1387 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1388 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1389 ; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1390 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1391 ; GCN2-NEXT: buffer_wbinvl1_vol
1392 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1394 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory:
1396 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397 ; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
1398 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1399 ; GCN3-NEXT: buffer_wbinvl1_vol
1400 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1401 %gep = getelementptr i32, ptr %out, i64 4
1402 %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1406 ; ---------------------------------------------------------------------
1408 ; ---------------------------------------------------------------------
; Test: seq_cst `atomicrmw and` directly on the incoming flat pointer; the
; function returns void.
; Expected on bonaire, tonga and gfx900 alike: one flat_atomic_and on v[0:1]
; with v2 as operand and no glc.
1410 define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
1411 ; GCN1-LABEL: flat_atomic_and_i32_noret:
1413 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
1415 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1416 ; GCN1-NEXT: buffer_wbinvl1_vol
1417 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1419 ; GCN2-LABEL: flat_atomic_and_i32_noret:
1421 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1422 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
1423 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1424 ; GCN2-NEXT: buffer_wbinvl1_vol
1425 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1427 ; GCN3-LABEL: flat_atomic_and_i32_noret:
1429 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1430 ; GCN3-NEXT: flat_atomic_and v[0:1], v2
1431 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1432 ; GCN3-NEXT: buffer_wbinvl1_vol
1433 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1434 %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes); the function
; returns void.
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; a flat_atomic_and without glc. Expected on gfx900: a single flat_atomic_and
; with offset:16.
1438 define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
1439 ; GCN1-LABEL: flat_atomic_and_i32_noret_offset:
1441 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1443 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1444 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
1445 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1446 ; GCN1-NEXT: buffer_wbinvl1_vol
1447 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1449 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset:
1451 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1453 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1454 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
1455 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1456 ; GCN2-NEXT: buffer_wbinvl1_vol
1457 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1459 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset:
1461 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
1463 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1464 ; GCN3-NEXT: buffer_wbinvl1_vol
1465 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1466 %gep = getelementptr i32, ptr %out, i32 4
1467 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` directly on the incoming flat pointer, in a
; function returning i32; the atomic's result is bound to %result.
; Expected on bonaire, tonga and gfx900 alike: the glc form of
; flat_atomic_and, writing its result to v0.
1471 define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
1472 ; GCN1-LABEL: flat_atomic_and_i32_ret:
1474 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1475 ; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1476 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1477 ; GCN1-NEXT: buffer_wbinvl1_vol
1478 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1480 ; GCN2-LABEL: flat_atomic_and_i32_ret:
1482 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1483 ; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1484 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1485 ; GCN2-NEXT: buffer_wbinvl1_vol
1486 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1488 ; GCN3-LABEL: flat_atomic_and_i32_ret:
1490 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1491 ; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1492 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1493 ; GCN3-NEXT: buffer_wbinvl1_vol
1494 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1495 %result = atomicrmw and ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes), in a function
; returning i32; the atomic's result is bound to %result.
; Expected on bonaire and tonga: a v_add/v_addc pair adds 16 to v[0:1] before
; the glc form of flat_atomic_and writing v0. Expected on gfx900: a single
; flat_atomic_and with offset:16 glc.
1499 define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
1500 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset:
1502 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1504 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1505 ; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1506 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1507 ; GCN1-NEXT: buffer_wbinvl1_vol
1508 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1510 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset:
1512 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1513 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1514 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1515 ; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1516 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1517 ; GCN2-NEXT: buffer_wbinvl1_vol
1518 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1520 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset:
1522 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1523 ; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
1524 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1525 ; GCN3-NEXT: buffer_wbinvl1_vol
1526 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1527 %gep = getelementptr i32, ptr %out, i32 4
1528 %result = atomicrmw and ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` on a flat pointer, with both the pointer and
; the operand passed `inreg` under the amdgpu_gfx calling convention; the
; function returns void.
; Expected on bonaire, tonga and gfx900 alike: s4/s5 (address) and s6 (operand)
; are copied into v0/v1/v2 before a flat_atomic_and without glc.
1532 define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
1533 ; GCN1-LABEL: flat_atomic_and_i32_noret_scalar:
1535 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1537 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1538 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1539 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
1540 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1541 ; GCN1-NEXT: buffer_wbinvl1_vol
1542 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1544 ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar:
1546 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1547 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1548 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1549 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1550 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
1551 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1552 ; GCN2-NEXT: buffer_wbinvl1_vol
1553 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1555 ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar:
1557 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1558 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1559 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1560 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1561 ; GCN3-NEXT: flat_atomic_and v[0:1], v2
1562 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1563 ; GCN3-NEXT: buffer_wbinvl1_vol
1564 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1565 %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes), with `inreg`
; arguments under the amdgpu_gfx calling convention; the function returns void.
; Expected on bonaire and tonga: the 16 is added to s4/s5 with
; s_add_u32/s_addc_u32 into s34/s35 before the copy to v0/v1.
; Expected on gfx900: s4/s5 are copied unchanged and the 16 appears as
; offset:16 on the flat_atomic_and.
1569 define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1570 ; GCN1-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1572 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1573 ; GCN1-NEXT: s_add_u32 s34, s4, 16
1574 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1575 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1576 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1577 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1578 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
1579 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1580 ; GCN1-NEXT: buffer_wbinvl1_vol
1581 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1583 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1585 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1586 ; GCN2-NEXT: s_add_u32 s34, s4, 16
1587 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1588 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1589 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1590 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1591 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
1592 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1593 ; GCN2-NEXT: buffer_wbinvl1_vol
1594 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1596 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar:
1598 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1600 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1601 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1602 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
1603 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1604 ; GCN3-NEXT: buffer_wbinvl1_vol
1605 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1606 %gep = getelementptr i32, ptr %out, i32 4
1607 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` with `inreg` arguments under the amdgpu_gfx
; calling convention, in a function returning i32; the atomic's result is
; bound to %result.
; Expected on bonaire, tonga and gfx900 alike: s4/s5/s6 are copied into
; v0/v1/v2, then the glc form of flat_atomic_and writes its result to v0.
1611 define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
1612 ; GCN1-LABEL: flat_atomic_and_i32_ret_scalar:
1614 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1615 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1616 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1617 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1618 ; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1619 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1620 ; GCN1-NEXT: buffer_wbinvl1_vol
1621 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1623 ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar:
1625 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1626 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1627 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1628 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1629 ; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1630 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1631 ; GCN2-NEXT: buffer_wbinvl1_vol
1632 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1634 ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar:
1636 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1638 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1639 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1640 ; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1641 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1642 ; GCN3-NEXT: buffer_wbinvl1_vol
1643 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1644 %result = atomicrmw and ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes), with `inreg`
; arguments under the amdgpu_gfx calling convention, in a function returning
; i32; the atomic's result is bound to %result.
; Expected on bonaire and tonga: s_add_u32/s_addc_u32 form the address in
; s34/s35, followed by the glc form of flat_atomic_and writing v0.
; Expected on gfx900: s4/s5 are used unchanged with offset:16 glc.
1648 define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
1649 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1651 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1652 ; GCN1-NEXT: s_add_u32 s34, s4, 16
1653 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1654 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1655 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1656 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1657 ; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1658 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1659 ; GCN1-NEXT: buffer_wbinvl1_vol
1660 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1662 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1664 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665 ; GCN2-NEXT: s_add_u32 s34, s4, 16
1666 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1667 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1668 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1669 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1670 ; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1671 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1672 ; GCN2-NEXT: buffer_wbinvl1_vol
1673 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1675 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar:
1677 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1678 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1679 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1680 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1681 ; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
1682 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1683 ; GCN3-NEXT: buffer_wbinvl1_vol
1684 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1685 %gep = getelementptr i32, ptr %out, i32 4
1686 %result = atomicrmw and ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata; the function returns void.
; (Metadata node !0 is defined outside this function.)
; The expected instructions are the same as for
; @flat_atomic_and_i32_noret_offset: v_add/v_addc + flat_atomic_and on bonaire
; and tonga, a single flat_atomic_and with offset:16 on gfx900.
1690 define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1691 ; GCN1-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1693 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1694 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1695 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1696 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
1697 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1698 ; GCN1-NEXT: buffer_wbinvl1_vol
1699 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1701 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1703 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1704 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1705 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1706 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
1707 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1708 ; GCN2-NEXT: buffer_wbinvl1_vol
1709 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1711 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory:
1713 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1714 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
1715 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1716 ; GCN3-NEXT: buffer_wbinvl1_vol
1717 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1718 %gep = getelementptr i32, ptr %out, i64 4
1719 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: seq_cst `atomicrmw and` at %out + 4 x i32 (16 bytes, i64 gep index)
; carrying !amdgpu.no.remote.memory metadata, in a function returning i32;
; the atomic's result is bound to %result. (Metadata node !0 is defined
; outside this function.)
; The expected instructions are the same as for
; @flat_atomic_and_i32_ret_offset: v_add/v_addc + glc flat_atomic_and on
; bonaire and tonga, a single flat_atomic_and with offset:16 glc on gfx900.
1723 define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
1724 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1726 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1728 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1729 ; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1730 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1731 ; GCN1-NEXT: buffer_wbinvl1_vol
1732 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1734 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1736 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1737 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1738 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1739 ; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1740 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1741 ; GCN2-NEXT: buffer_wbinvl1_vol
1742 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1744 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory:
1746 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1747 ; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
1748 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1749 ; GCN3-NEXT: buffer_wbinvl1_vol
1750 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1751 %gep = getelementptr i32, ptr %out, i64 4
1752 %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
1756 ; ---------------------------------------------------------------------
1758 ; ---------------------------------------------------------------------
; No-return seq_cst `atomicrmw nand` directly on the flat pointer %ptr; the
; function is declared void and %tmp0 is not used by the IR shown here.
; The expected code is the same on all three targets and is a compare-and-swap
; loop rather than a single atomic instruction:
;   * flat_load_dword reads the initial value into v4,
;   * each iteration (%atomicrmw.start) computes v_and_b32 then v_not_b32 of
;     the current value with %in (v2) and submits it via flat_atomic_cmpswap glc,
;   * lanes whose returned value equals the expected one are accumulated in
;     s[4:5] and cleared from exec until no lane remains,
;   * %atomicrmw.end ORs s[4:5] back into exec before returning.
1760 define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
1761 ; GCN1-LABEL: flat_atomic_nand_i32_noret:
1763 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1764 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
1765 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1766 ; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start
1767 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1768 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1769 ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
1770 ; GCN1-NEXT: v_not_b32_e32 v3, v3
1771 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1772 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1773 ; GCN1-NEXT: buffer_wbinvl1_vol
1774 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1775 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1776 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
1777 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1778 ; GCN1-NEXT: s_cbranch_execnz .LBB50_1
1779 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1780 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1781 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1783 ; GCN2-LABEL: flat_atomic_nand_i32_noret:
1785 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1786 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
1787 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1788 ; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start
1789 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1790 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1791 ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
1792 ; GCN2-NEXT: v_not_b32_e32 v3, v3
1793 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1794 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1795 ; GCN2-NEXT: buffer_wbinvl1_vol
1796 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1797 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1798 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
1799 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1800 ; GCN2-NEXT: s_cbranch_execnz .LBB50_1
1801 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1802 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1803 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1805 ; GCN3-LABEL: flat_atomic_nand_i32_noret:
1807 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1808 ; GCN3-NEXT: flat_load_dword v4, v[0:1]
1809 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1810 ; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start
1811 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1812 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1813 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
1814 ; GCN3-NEXT: v_not_b32_e32 v3, v3
1815 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1816 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1817 ; GCN3-NEXT: buffer_wbinvl1_vol
1818 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1819 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1820 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
1821 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1822 ; GCN3-NEXT: s_cbranch_execnz .LBB50_1
1823 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1824 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1825 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the single nand atomic that the checks above show as a loop.
1826 %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst
; No-return seq_cst `atomicrmw nand` at %out + 4 i32 elements; the function is
; declared void and %tmp0 is not used by the IR shown here.
; Expected code is a flat_load_dword followed by a flat_atomic_cmpswap loop
; (v_and_b32 + v_not_b32 compute the new value, s[4:5] collects finished lanes).
; The bonaire (GCN1) and tonga (GCN2) checks add 16 to v[0:1] once, before the
; initial load; the gfx900 (GCN3) checks instead put offset:16 on both the
; flat_load_dword and the flat_atomic_cmpswap.
1830 define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
1831 ; GCN1-LABEL: flat_atomic_nand_i32_noret_offset:
1833 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1834 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
1835 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1836 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
1837 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1838 ; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start
1839 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1840 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1841 ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
1842 ; GCN1-NEXT: v_not_b32_e32 v3, v3
1843 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1844 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1845 ; GCN1-NEXT: buffer_wbinvl1_vol
1846 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1847 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1848 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
1849 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1850 ; GCN1-NEXT: s_cbranch_execnz .LBB51_1
1851 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1852 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1853 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1855 ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset:
1857 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1858 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
1859 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1860 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
1861 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1862 ; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start
1863 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1864 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1865 ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
1866 ; GCN2-NEXT: v_not_b32_e32 v3, v3
1867 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1868 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1869 ; GCN2-NEXT: buffer_wbinvl1_vol
1870 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1871 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1872 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
1873 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1874 ; GCN2-NEXT: s_cbranch_execnz .LBB51_1
1875 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1876 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1877 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1879 ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset:
1881 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1882 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
1883 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1884 ; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start
1885 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1886 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1887 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
1888 ; GCN3-NEXT: v_not_b32_e32 v3, v3
1889 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
1890 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1891 ; GCN3-NEXT: buffer_wbinvl1_vol
1892 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1893 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1894 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
1895 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1896 ; GCN3-NEXT: s_cbranch_execnz .LBB51_1
1897 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1898 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1899 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
1900 %gep = getelementptr i32, ptr %out, i32 4
1901 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw nand` directly on the flat pointer %ptr; the
; function is declared to return i32 and the atomic's previous value is
; captured in %result.
; The expected code is identical on all three targets: flat_load_dword into v3,
; then a flat_atomic_cmpswap loop in which v4 holds the expected value and v3
; the and-then-not result. After the loop (%atomicrmw.end) the last value
; returned by the cmpswap is copied from v3 to v0.
1905 define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
1906 ; GCN1-LABEL: flat_atomic_nand_i32_ret:
1908 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
1910 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1911 ; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start
1912 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1913 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1914 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
1915 ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
1916 ; GCN1-NEXT: v_not_b32_e32 v3, v3
1917 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1918 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1919 ; GCN1-NEXT: buffer_wbinvl1_vol
1920 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1921 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1922 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1923 ; GCN1-NEXT: s_cbranch_execnz .LBB52_1
1924 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
1925 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1926 ; GCN1-NEXT: v_mov_b32_e32 v0, v3
1927 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1929 ; GCN2-LABEL: flat_atomic_nand_i32_ret:
1931 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1932 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
1933 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
1934 ; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start
1935 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
1936 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1937 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
1938 ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
1939 ; GCN2-NEXT: v_not_b32_e32 v3, v3
1940 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1941 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1942 ; GCN2-NEXT: buffer_wbinvl1_vol
1943 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1944 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1945 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
1946 ; GCN2-NEXT: s_cbranch_execnz .LBB52_1
1947 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
1948 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1949 ; GCN2-NEXT: v_mov_b32_e32 v0, v3
1950 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1952 ; GCN3-LABEL: flat_atomic_nand_i32_ret:
1954 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1955 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
1956 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
1957 ; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start
1958 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
1959 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1960 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
1961 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
1962 ; GCN3-NEXT: v_not_b32_e32 v3, v3
1963 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
1964 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1965 ; GCN3-NEXT: buffer_wbinvl1_vol
1966 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
1967 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1968 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
1969 ; GCN3-NEXT: s_cbranch_execnz .LBB52_1
1970 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
1971 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1972 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
1973 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the single nand atomic that the checks above show as a loop.
1974 %result = atomicrmw nand ptr %ptr, i32 %in seq_cst
; Returning seq_cst `atomicrmw nand` at %out + 4 i32 elements; the function is
; declared to return i32 and the atomic's previous value is captured in
; %result.
; Expected code is a flat_load_dword followed by a flat_atomic_cmpswap loop.
; Register assignment differs by target:
;   * bonaire (GCN1) and tonga (GCN2) build the +16 address in v[3:4] and keep
;     the loaded / swapped value in v0, so no copy follows the loop;
;   * gfx900 (GCN3) keeps the pointer in v[0:1], uses offset:16 on the load and
;     the cmpswap, and copies v3 to v0 after %atomicrmw.end.
1978 define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
1979 ; GCN1-LABEL: flat_atomic_nand_i32_ret_offset:
1981 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1982 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
1983 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
1984 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
1985 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
1986 ; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start
1987 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
1988 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1989 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
1990 ; GCN1-NEXT: v_and_b32_e32 v0, v1, v2
1991 ; GCN1-NEXT: v_not_b32_e32 v0, v0
1992 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
1993 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1994 ; GCN1-NEXT: buffer_wbinvl1_vol
1995 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1996 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1997 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
1998 ; GCN1-NEXT: s_cbranch_execnz .LBB53_1
1999 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2000 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2001 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2003 ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset:
2005 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2006 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
2007 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2008 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
2009 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
2010 ; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start
2011 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2012 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2013 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
2014 ; GCN2-NEXT: v_and_b32_e32 v0, v1, v2
2015 ; GCN2-NEXT: v_not_b32_e32 v0, v0
2016 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2017 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2018 ; GCN2-NEXT: buffer_wbinvl1_vol
2019 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2020 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2021 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
2022 ; GCN2-NEXT: s_cbranch_execnz .LBB53_1
2023 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2024 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2025 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2027 ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset:
2029 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2030 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
2031 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
2032 ; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start
2033 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2034 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2035 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
2036 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
2037 ; GCN3-NEXT: v_not_b32_e32 v3, v3
2038 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2039 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2040 ; GCN3-NEXT: buffer_wbinvl1_vol
2041 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2042 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2043 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
2044 ; GCN3-NEXT: s_cbranch_execnz .LBB53_1
2045 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2046 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2047 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
2048 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
2049 %gep = getelementptr i32, ptr %out, i32 4
2050 %result = atomicrmw nand ptr %gep, i32 %in seq_cst
; No-return seq_cst `atomicrmw nand` where both arguments are `inreg` and the
; function uses the amdgpu_gfx calling convention; the function is declared
; void and %tmp0 is not used by the IR shown here.
; In the expected code (identical on all three targets) the pointer arrives in
; s4/s5 and is copied to v[0:1] with v_mov_b32, while the operand stays in s6
; and is consumed directly by v_and_b32. The rest is the flat_load_dword +
; flat_atomic_cmpswap loop, with s[34:35] as the finished-lane mask.
2054 define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
2055 ; GCN1-LABEL: flat_atomic_nand_i32_noret_scalar:
2057 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2059 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2060 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
2061 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
2062 ; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start
2063 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2064 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2065 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
2066 ; GCN1-NEXT: v_not_b32_e32 v2, v2
2067 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2068 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2069 ; GCN1-NEXT: buffer_wbinvl1_vol
2070 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2071 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2072 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
2073 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
2074 ; GCN1-NEXT: s_cbranch_execnz .LBB54_1
2075 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2076 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
2077 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2079 ; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar:
2081 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2082 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2083 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2084 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
2085 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
2086 ; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start
2087 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2088 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2089 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
2090 ; GCN2-NEXT: v_not_b32_e32 v2, v2
2091 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2092 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2093 ; GCN2-NEXT: buffer_wbinvl1_vol
2094 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2095 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2096 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
2097 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2098 ; GCN2-NEXT: s_cbranch_execnz .LBB54_1
2099 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2100 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2101 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2103 ; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar:
2105 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2106 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2107 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2108 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
2109 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2110 ; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start
2111 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2112 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2113 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
2114 ; GCN3-NEXT: v_not_b32_e32 v2, v2
2115 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2116 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2117 ; GCN3-NEXT: buffer_wbinvl1_vol
2118 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2119 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2120 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
2121 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2122 ; GCN3-NEXT: s_cbranch_execnz .LBB54_1
2123 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2124 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2125 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the single nand atomic that the checks above show as a loop.
2126 %tmp0 = atomicrmw nand ptr %ptr, i32 %in seq_cst
; No-return seq_cst `atomicrmw nand` at %out + 4 i32 elements, with `inreg`
; arguments and the amdgpu_gfx calling convention; the function is declared
; void and %tmp0 is not used by the IR shown here.
; Expected code is the flat_load_dword + flat_atomic_cmpswap loop with the
; operand read from s6. Address handling differs by target:
;   * bonaire (GCN1) and tonga (GCN2) compute the +16 address with
;     s_add_u32 / s_addc_u32 into s34/s35, copy it to v[0:1], and afterwards
;     reuse s[34:35] as the finished-lane mask;
;   * gfx900 (GCN3) copies s4/s5 to v[0:1] unchanged and uses offset:16 on the
;     load and the cmpswap.
2130 define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2131 ; GCN1-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2133 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2134 ; GCN1-NEXT: s_add_u32 s34, s4, 16
2135 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2136 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
2137 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
2138 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
2139 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
2140 ; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start
2141 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2142 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2143 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
2144 ; GCN1-NEXT: v_not_b32_e32 v2, v2
2145 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2146 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2147 ; GCN1-NEXT: buffer_wbinvl1_vol
2148 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2149 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2150 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
2151 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
2152 ; GCN1-NEXT: s_cbranch_execnz .LBB55_1
2153 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2154 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
2155 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2157 ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2159 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2160 ; GCN2-NEXT: s_add_u32 s34, s4, 16
2161 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2162 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
2163 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
2164 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
2165 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
2166 ; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start
2167 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2168 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2169 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
2170 ; GCN2-NEXT: v_not_b32_e32 v2, v2
2171 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2172 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2173 ; GCN2-NEXT: buffer_wbinvl1_vol
2174 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2175 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2176 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
2177 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2178 ; GCN2-NEXT: s_cbranch_execnz .LBB55_1
2179 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2180 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2181 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2183 ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
2185 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2186 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2187 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2188 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
2189 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2190 ; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start
2191 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2192 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2193 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
2194 ; GCN3-NEXT: v_not_b32_e32 v2, v2
2195 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
2196 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2197 ; GCN3-NEXT: buffer_wbinvl1_vol
2198 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
2199 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2200 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
2201 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2202 ; GCN3-NEXT: s_cbranch_execnz .LBB55_1
2203 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2204 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2205 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
2206 %gep = getelementptr i32, ptr %out, i32 4
2207 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
; Returning seq_cst `atomicrmw nand` with `inreg` arguments and the amdgpu_gfx
; calling convention; the function is declared to return i32 and the atomic's
; previous value is captured in %result.
; In the expected code (identical on all three targets) s4/s5 are first copied
; to v[0:1] for the initial flat_load_dword, whose destination v0 overwrites
; the low half of that pair, so the pointer is copied again into v[1:2] for the
; flat_atomic_cmpswap loop. The cmpswap writes v0, and no further copy is
; expected after %atomicrmw.end.
2211 define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
2212 ; GCN1-LABEL: flat_atomic_nand_i32_ret_scalar:
2214 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2215 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2216 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2217 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
2218 ; GCN1-NEXT: v_mov_b32_e32 v1, s4
2219 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
2220 ; GCN1-NEXT: v_mov_b32_e32 v2, s5
2221 ; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start
2222 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2223 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2224 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
2225 ; GCN1-NEXT: v_and_b32_e32 v0, s6, v4
2226 ; GCN1-NEXT: v_not_b32_e32 v3, v0
2227 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2228 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2229 ; GCN1-NEXT: buffer_wbinvl1_vol
2230 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2231 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2232 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
2233 ; GCN1-NEXT: s_cbranch_execnz .LBB56_1
2234 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2235 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
2236 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2238 ; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar:
2240 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2241 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2242 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2243 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
2244 ; GCN2-NEXT: v_mov_b32_e32 v1, s4
2245 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
2246 ; GCN2-NEXT: v_mov_b32_e32 v2, s5
2247 ; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start
2248 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2249 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2250 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
2251 ; GCN2-NEXT: v_and_b32_e32 v0, s6, v4
2252 ; GCN2-NEXT: v_not_b32_e32 v3, v0
2253 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2254 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2255 ; GCN2-NEXT: buffer_wbinvl1_vol
2256 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2257 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2258 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2259 ; GCN2-NEXT: s_cbranch_execnz .LBB56_1
2260 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2261 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2262 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2264 ; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar:
2266 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2267 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2268 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2269 ; GCN3-NEXT: flat_load_dword v0, v[0:1]
2270 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
2271 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2272 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
2273 ; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start
2274 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2275 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2276 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
2277 ; GCN3-NEXT: v_and_b32_e32 v0, s6, v4
2278 ; GCN3-NEXT: v_not_b32_e32 v3, v0
2279 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2280 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2281 ; GCN3-NEXT: buffer_wbinvl1_vol
2282 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2283 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2284 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2285 ; GCN3-NEXT: s_cbranch_execnz .LBB56_1
2286 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2287 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2288 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the single nand atomic that the checks above show as a loop.
2289 %result = atomicrmw nand ptr %ptr, i32 %in seq_cst
; Returning seq_cst `atomicrmw nand` at %out + 4 i32 elements, with `inreg`
; arguments and the amdgpu_gfx calling convention; the function is declared to
; return i32 and the atomic's previous value is captured in %result.
; Expected code is the flat_load_dword + flat_atomic_cmpswap loop, with the
; operand read from s6 and the cmpswap result written to v0:
;   * bonaire (GCN1) and tonga (GCN2) compute the +16 address with
;     s_add_u32 / s_addc_u32, copy it once into v[1:2], then reuse s[34:35] as
;     the finished-lane mask;
;   * gfx900 (GCN3) copies s4/s5 to v[0:1] for the initial load, copies them
;     again to v[1:2] for the loop, and uses offset:16 on both memory
;     instructions.
2293 define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2294 ; GCN1-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2296 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297 ; GCN1-NEXT: s_add_u32 s34, s4, 16
2298 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2299 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
2300 ; GCN1-NEXT: v_mov_b32_e32 v2, s35
2301 ; GCN1-NEXT: flat_load_dword v0, v[1:2]
2302 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
2303 ; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start
2304 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2305 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2306 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
2307 ; GCN1-NEXT: v_and_b32_e32 v0, s6, v4
2308 ; GCN1-NEXT: v_not_b32_e32 v3, v0
2309 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2310 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2311 ; GCN1-NEXT: buffer_wbinvl1_vol
2312 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2313 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2314 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
2315 ; GCN1-NEXT: s_cbranch_execnz .LBB57_1
2316 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2317 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
2318 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2320 ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2322 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2323 ; GCN2-NEXT: s_add_u32 s34, s4, 16
2324 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2325 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
2326 ; GCN2-NEXT: v_mov_b32_e32 v2, s35
2327 ; GCN2-NEXT: flat_load_dword v0, v[1:2]
2328 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
2329 ; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start
2330 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2331 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2332 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
2333 ; GCN2-NEXT: v_and_b32_e32 v0, s6, v4
2334 ; GCN2-NEXT: v_not_b32_e32 v3, v0
2335 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
2336 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2337 ; GCN2-NEXT: buffer_wbinvl1_vol
2338 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2339 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2340 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
2341 ; GCN2-NEXT: s_cbranch_execnz .LBB57_1
2342 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2343 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
2344 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2346 ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
2348 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2349 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2350 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2351 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
2352 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
2353 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
2354 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
2355 ; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start
2356 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2357 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2358 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
2359 ; GCN3-NEXT: v_and_b32_e32 v0, s6, v4
2360 ; GCN3-NEXT: v_not_b32_e32 v3, v0
2361 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
2362 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2363 ; GCN3-NEXT: buffer_wbinvl1_vol
2364 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
2365 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
2366 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
2367 ; GCN3-NEXT: s_cbranch_execnz .LBB57_1
2368 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2369 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
2370 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
2371 %gep = getelementptr i32, ptr %out, i32 4
2372 %result = atomicrmw nand ptr %gep, i32 %in seq_cst
; No-return seq_cst `atomicrmw nand` at %out + 4 i32 elements, tagged with
; !amdgpu.no.remote.memory; the function is declared void and %tmp0 is not
; used by the IR shown here.
; Even with the metadata present, the checks still expect the
; flat_load_dword + flat_atomic_cmpswap loop (v_and_b32 + v_not_b32 compute
; the new value, s[4:5] collects finished lanes). The bonaire (GCN1) and
; tonga (GCN2) checks add 16 to v[0:1] before the initial load; the gfx900
; (GCN3) checks use offset:16 on the load and the cmpswap.
2376 define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2377 ; GCN1-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2379 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2381 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2382 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
2383 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
2384 ; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start
2385 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2386 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2387 ; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
2388 ; GCN1-NEXT: v_not_b32_e32 v3, v3
2389 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2390 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2391 ; GCN1-NEXT: buffer_wbinvl1_vol
2392 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2393 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2394 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
2395 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
2396 ; GCN1-NEXT: s_cbranch_execnz .LBB58_1
2397 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2398 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2399 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2401 ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2403 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2404 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2405 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2406 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
2407 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
2408 ; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start
2409 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2410 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2411 ; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
2412 ; GCN2-NEXT: v_not_b32_e32 v3, v3
2413 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
2414 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2415 ; GCN2-NEXT: buffer_wbinvl1_vol
2416 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2417 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2418 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
2419 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
2420 ; GCN2-NEXT: s_cbranch_execnz .LBB58_1
2421 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2422 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2423 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2425 ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory:
2427 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2428 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
2429 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
2430 ; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start
2431 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2432 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2433 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
2434 ; GCN3-NEXT: v_not_b32_e32 v3, v3
2435 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2436 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2437 ; GCN3-NEXT: buffer_wbinvl1_vol
2438 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2439 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2440 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
2441 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
2442 ; GCN3-NEXT: s_cbranch_execnz .LBB58_1
2443 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2444 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2445 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
2446 %gep = getelementptr i32, ptr %out, i64 4
2447 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning seq_cst `atomicrmw nand` at %out + 4 i32 elements, tagged with
; !amdgpu.no.remote.memory; the function is declared to return i32 and the
; atomic's previous value is captured in %result.
; Even with the metadata present, the checks still expect the
; flat_load_dword + flat_atomic_cmpswap loop:
;   * bonaire (GCN1) and tonga (GCN2) build the +16 address in v[3:4] and keep
;     the loaded / swapped value in v0, so no copy follows the loop;
;   * gfx900 (GCN3) keeps the pointer in v[0:1], uses offset:16 on the load and
;     the cmpswap, and copies v3 to v0 after %atomicrmw.end.
2451 define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2452 ; GCN1-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2454 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2455 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
2456 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2457 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
2458 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
2459 ; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start
2460 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
2461 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2462 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
2463 ; GCN1-NEXT: v_and_b32_e32 v0, v1, v2
2464 ; GCN1-NEXT: v_not_b32_e32 v0, v0
2465 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2466 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2467 ; GCN1-NEXT: buffer_wbinvl1_vol
2468 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2469 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2470 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
2471 ; GCN1-NEXT: s_cbranch_execnz .LBB59_1
2472 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
2473 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2474 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2476 ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2478 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2479 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
2480 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
2481 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
2482 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
2483 ; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start
2484 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
2485 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2486 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
2487 ; GCN2-NEXT: v_and_b32_e32 v0, v1, v2
2488 ; GCN2-NEXT: v_not_b32_e32 v0, v0
2489 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
2490 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2491 ; GCN2-NEXT: buffer_wbinvl1_vol
2492 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2493 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2494 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
2495 ; GCN2-NEXT: s_cbranch_execnz .LBB59_1
2496 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
2497 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2498 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2500 ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory:
2502 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2503 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
2504 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
2505 ; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start
2506 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
2507 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2508 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
2509 ; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
2510 ; GCN3-NEXT: v_not_b32_e32 v3, v3
2511 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
2512 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2513 ; GCN3-NEXT: buffer_wbinvl1_vol
2514 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
2515 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2516 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
2517 ; GCN3-NEXT: s_cbranch_execnz .LBB59_1
2518 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
2519 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2520 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
2521 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i32 is the 16-byte displacement seen in the checks.
2522 %gep = getelementptr i32, ptr %out, i64 4
2523 %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2527 ; ---------------------------------------------------------------------
2529 ; ---------------------------------------------------------------------
; i32 `atomicrmw or` (seq_cst) directly on the flat pointer argument, result
; unused. Every target is expected to emit a single flat_atomic_or without glc,
; followed by a wait and buffer_wbinvl1_vol.
2531 define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
2532 ; GCN1-LABEL: flat_atomic_or_i32_noret:
2534 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2535 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
2536 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2537 ; GCN1-NEXT: buffer_wbinvl1_vol
2538 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2540 ; GCN2-LABEL: flat_atomic_or_i32_noret:
2542 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2543 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
2544 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2545 ; GCN2-NEXT: buffer_wbinvl1_vol
2546 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2548 ; GCN3-LABEL: flat_atomic_or_i32_noret:
2550 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2551 ; GCN3-NEXT: flat_atomic_or v[0:1], v2
2552 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2553 ; GCN3-NEXT: buffer_wbinvl1_vol
2554 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2555 %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
; i32 `atomicrmw or` (seq_cst) at %out + 4 elements (16 bytes), result unused.
; gfx900 is expected to encode the offset in the instruction (offset:16);
; bonaire and tonga add 16 to the 64-bit address with an add / add-with-carry
; pair before the flat_atomic_or.
2559 define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
2560 ; GCN1-LABEL: flat_atomic_or_i32_noret_offset:
2562 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2563 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2564 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2565 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
2566 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2567 ; GCN1-NEXT: buffer_wbinvl1_vol
2568 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2570 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset:
2572 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2573 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2574 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2575 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
2576 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2577 ; GCN2-NEXT: buffer_wbinvl1_vol
2578 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2580 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset:
2582 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2583 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
2584 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2585 ; GCN3-NEXT: buffer_wbinvl1_vol
2586 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2587 %gep = getelementptr i32, ptr %out, i32 4
2588 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
; Returning form of the i32 `atomicrmw or` (seq_cst) test: the function returns
; i32, and the checks expect the glc form of flat_atomic_or with v0 as its
; destination on every target.
2592 define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
2593 ; GCN1-LABEL: flat_atomic_or_i32_ret:
2595 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2596 ; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2597 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2598 ; GCN1-NEXT: buffer_wbinvl1_vol
2599 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2601 ; GCN2-LABEL: flat_atomic_or_i32_ret:
2603 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2604 ; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2605 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2606 ; GCN2-NEXT: buffer_wbinvl1_vol
2607 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2609 ; GCN3-LABEL: flat_atomic_or_i32_ret:
2611 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2612 ; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2613 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2614 ; GCN3-NEXT: buffer_wbinvl1_vol
2615 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2616 %result = atomicrmw or ptr %ptr, i32 %in seq_cst
; Returning i32 `atomicrmw or` (seq_cst) at %out + 4 elements (16 bytes).
; The checks expect the glc form with v0 as destination; gfx900 uses offset:16
; while bonaire and tonga add 16 to the address beforehand.
2620 define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
2621 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset:
2623 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2624 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2625 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2626 ; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2627 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2628 ; GCN1-NEXT: buffer_wbinvl1_vol
2629 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2631 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset:
2633 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2634 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2635 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2636 ; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2637 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2638 ; GCN2-NEXT: buffer_wbinvl1_vol
2639 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2641 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset:
2643 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2644 ; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
2645 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2646 ; GCN3-NEXT: buffer_wbinvl1_vol
2647 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2648 %gep = getelementptr i32, ptr %out, i32 4
2649 %result = atomicrmw or ptr %gep, i32 %in seq_cst
; i32 `atomicrmw or` (seq_cst), result unused, with the amdgpu_gfx calling
; convention and `inreg` arguments. The checks show the pointer arriving in
; s4/s5 and the operand in s6, each copied into v0-v2 before the flat_atomic_or.
2653 define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
2654 ; GCN1-LABEL: flat_atomic_or_i32_noret_scalar:
2656 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2657 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2658 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2659 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2660 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
2661 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2662 ; GCN1-NEXT: buffer_wbinvl1_vol
2663 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2665 ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar:
2667 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2668 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2669 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2670 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2671 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
2672 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2673 ; GCN2-NEXT: buffer_wbinvl1_vol
2674 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2676 ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar:
2678 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2679 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2680 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2681 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2682 ; GCN3-NEXT: flat_atomic_or v[0:1], v2
2683 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2684 ; GCN3-NEXT: buffer_wbinvl1_vol
2685 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2686 %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
; amdgpu_gfx / `inreg` variant of the i32 `atomicrmw or` test at %out + 16
; bytes, result unused. On bonaire and tonga the checks show the address being
; formed with s_add_u32 / s_addc_u32 into s34/s35 before the copy to VGPRs;
; gfx900 copies s4/s5 unchanged and uses offset:16 on the atomic.
2690 define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2691 ; GCN1-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2693 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2694 ; GCN1-NEXT: s_add_u32 s34, s4, 16
2695 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2696 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
2697 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
2698 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2699 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
2700 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2701 ; GCN1-NEXT: buffer_wbinvl1_vol
2702 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2704 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2706 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2707 ; GCN2-NEXT: s_add_u32 s34, s4, 16
2708 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2709 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
2710 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
2711 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2712 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
2713 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2714 ; GCN2-NEXT: buffer_wbinvl1_vol
2715 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2717 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar:
2719 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2720 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2721 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2722 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2723 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
2724 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2725 ; GCN3-NEXT: buffer_wbinvl1_vol
2726 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2727 %gep = getelementptr i32, ptr %out, i32 4
2728 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
; Returning i32 `atomicrmw or` (seq_cst) with the amdgpu_gfx calling convention
; and `inreg` arguments. The checks show s4/s5/s6 copied into v0-v2 and then
; the glc form of flat_atomic_or writing v0.
2732 define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
2733 ; GCN1-LABEL: flat_atomic_or_i32_ret_scalar:
2735 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2737 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2738 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2739 ; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2740 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2741 ; GCN1-NEXT: buffer_wbinvl1_vol
2742 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2744 ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar:
2746 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2747 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2748 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2749 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2750 ; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2751 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2752 ; GCN2-NEXT: buffer_wbinvl1_vol
2753 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2755 ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar:
2757 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2758 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2759 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2760 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2761 ; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2762 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2763 ; GCN3-NEXT: buffer_wbinvl1_vol
2764 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2765 %result = atomicrmw or ptr %ptr, i32 %in seq_cst
; Returning amdgpu_gfx / `inreg` variant of the i32 `atomicrmw or` test at
; %out + 16 bytes. bonaire and tonga form the address with s_add_u32 /
; s_addc_u32 into s34/s35; gfx900 uses offset:16. All targets are expected to
; use the glc form writing v0.
2769 define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
2770 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2772 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2773 ; GCN1-NEXT: s_add_u32 s34, s4, 16
2774 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2775 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
2776 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
2777 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2778 ; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2779 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2780 ; GCN1-NEXT: buffer_wbinvl1_vol
2781 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2783 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2785 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2786 ; GCN2-NEXT: s_add_u32 s34, s4, 16
2787 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2788 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
2789 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
2790 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2791 ; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2792 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2793 ; GCN2-NEXT: buffer_wbinvl1_vol
2794 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2796 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar:
2798 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2799 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2800 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2801 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2802 ; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
2803 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2804 ; GCN3-NEXT: buffer_wbinvl1_vol
2805 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2806 %gep = getelementptr i32, ptr %out, i32 4
2807 %result = atomicrmw or ptr %gep, i32 %in seq_cst
; i32 `atomicrmw or` (seq_cst) at %out + 16 bytes, result unused, with
; !amdgpu.no.remote.memory attached. The expected instruction sequence is the
; same shape as the unannotated noret_offset test for `or`: a single
; flat_atomic_or, with offset:16 on gfx900 and an explicit address add on
; bonaire and tonga.
2811 define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2812 ; GCN1-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2814 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2815 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2816 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2817 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
2818 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2819 ; GCN1-NEXT: buffer_wbinvl1_vol
2820 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2822 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2824 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2825 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2826 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2827 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
2828 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2829 ; GCN2-NEXT: buffer_wbinvl1_vol
2830 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2832 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory:
2834 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2835 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
2836 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2837 ; GCN3-NEXT: buffer_wbinvl1_vol
2838 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2839 %gep = getelementptr i32, ptr %out, i64 4
2840 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning i32 `atomicrmw or` (seq_cst) at %out + 16 bytes with
; !amdgpu.no.remote.memory attached. The checks expect a single glc
; flat_atomic_or writing v0; gfx900 uses offset:16, bonaire and tonga add 16
; to the address first.
2844 define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
2845 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2847 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2848 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2849 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2850 ; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2851 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2852 ; GCN1-NEXT: buffer_wbinvl1_vol
2853 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2855 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2857 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2858 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2859 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2860 ; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
2861 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2862 ; GCN2-NEXT: buffer_wbinvl1_vol
2863 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2865 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory:
2867 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2868 ; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
2869 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2870 ; GCN3-NEXT: buffer_wbinvl1_vol
2871 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2872 %gep = getelementptr i32, ptr %out, i64 4
2873 %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
2877 ; ---------------------------------------------------------------------
2879 ; ---------------------------------------------------------------------
; i32 `atomicrmw xor` (seq_cst) directly on the flat pointer argument, result
; unused. Every target is expected to emit a single flat_atomic_xor without
; glc, followed by a wait and buffer_wbinvl1_vol.
2881 define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) {
2882 ; GCN1-LABEL: flat_atomic_xor_i32_noret:
2884 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2885 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
2886 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2887 ; GCN1-NEXT: buffer_wbinvl1_vol
2888 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2890 ; GCN2-LABEL: flat_atomic_xor_i32_noret:
2892 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2893 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
2894 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2895 ; GCN2-NEXT: buffer_wbinvl1_vol
2896 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2898 ; GCN3-LABEL: flat_atomic_xor_i32_noret:
2900 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2901 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2
2902 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2903 ; GCN3-NEXT: buffer_wbinvl1_vol
2904 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2905 %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
; i32 `atomicrmw xor` (seq_cst) at %out + 4 elements (16 bytes), result unused.
; gfx900 is expected to encode the offset in the instruction (offset:16);
; bonaire and tonga add 16 to the 64-bit address with an add / add-with-carry
; pair before the flat_atomic_xor.
2909 define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
2910 ; GCN1-LABEL: flat_atomic_xor_i32_noret_offset:
2912 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2914 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2915 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
2916 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2917 ; GCN1-NEXT: buffer_wbinvl1_vol
2918 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2920 ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset:
2922 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2923 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2924 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2925 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
2926 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2927 ; GCN2-NEXT: buffer_wbinvl1_vol
2928 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2930 ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset:
2932 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2933 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
2934 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2935 ; GCN3-NEXT: buffer_wbinvl1_vol
2936 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2937 %gep = getelementptr i32, ptr %out, i32 4
2938 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
; Returning form of the i32 `atomicrmw xor` (seq_cst) test: the function
; returns i32, and the checks expect the glc form of flat_atomic_xor with v0 as
; its destination on every target.
2942 define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) {
2943 ; GCN1-LABEL: flat_atomic_xor_i32_ret:
2945 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2946 ; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
2947 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2948 ; GCN1-NEXT: buffer_wbinvl1_vol
2949 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2951 ; GCN2-LABEL: flat_atomic_xor_i32_ret:
2953 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2954 ; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
2955 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2956 ; GCN2-NEXT: buffer_wbinvl1_vol
2957 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2959 ; GCN3-LABEL: flat_atomic_xor_i32_ret:
2961 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2962 ; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
2963 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2964 ; GCN3-NEXT: buffer_wbinvl1_vol
2965 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2966 %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
; Returning i32 `atomicrmw xor` (seq_cst) at %out + 4 elements (16 bytes).
; The checks expect the glc form with v0 as destination; gfx900 uses offset:16
; while bonaire and tonga add 16 to the address beforehand.
2970 define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
2971 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset:
2973 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2974 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
2975 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2976 ; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
2977 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2978 ; GCN1-NEXT: buffer_wbinvl1_vol
2979 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2981 ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset:
2983 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2984 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
2985 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2986 ; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
2987 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2988 ; GCN2-NEXT: buffer_wbinvl1_vol
2989 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2991 ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset:
2993 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2994 ; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
2995 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2996 ; GCN3-NEXT: buffer_wbinvl1_vol
2997 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2998 %gep = getelementptr i32, ptr %out, i32 4
2999 %result = atomicrmw xor ptr %gep, i32 %in seq_cst
; i32 `atomicrmw xor` (seq_cst), result unused, with the amdgpu_gfx calling
; convention and `inreg` arguments. The checks show the pointer arriving in
; s4/s5 and the operand in s6, each copied into v0-v2 before the
; flat_atomic_xor.
3003 define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
3004 ; GCN1-LABEL: flat_atomic_xor_i32_noret_scalar:
3006 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3007 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3008 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3009 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3010 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
3011 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3012 ; GCN1-NEXT: buffer_wbinvl1_vol
3013 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3015 ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar:
3017 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3018 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3019 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3020 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3021 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
3022 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3023 ; GCN2-NEXT: buffer_wbinvl1_vol
3024 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3026 ; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar:
3028 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3029 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3030 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3031 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3032 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2
3033 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3034 ; GCN3-NEXT: buffer_wbinvl1_vol
3035 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3036 %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
; amdgpu_gfx / `inreg` variant of the i32 `atomicrmw xor` test at %out + 16
; bytes, result unused. On bonaire and tonga the checks show the address being
; formed with s_add_u32 / s_addc_u32 into s34/s35 before the copy to VGPRs;
; gfx900 copies s4/s5 unchanged and uses offset:16 on the atomic.
3040 define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3041 ; GCN1-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3043 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3044 ; GCN1-NEXT: s_add_u32 s34, s4, 16
3045 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3046 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
3047 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
3048 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3049 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
3050 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3051 ; GCN1-NEXT: buffer_wbinvl1_vol
3052 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3054 ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3056 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3057 ; GCN2-NEXT: s_add_u32 s34, s4, 16
3058 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3059 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3060 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
3061 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3062 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
3063 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3064 ; GCN2-NEXT: buffer_wbinvl1_vol
3065 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3067 ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
3069 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3070 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3071 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3072 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3073 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
3074 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3075 ; GCN3-NEXT: buffer_wbinvl1_vol
3076 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3077 %gep = getelementptr i32, ptr %out, i32 4
3078 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
; Returning i32 `atomicrmw xor` (seq_cst) with the amdgpu_gfx calling
; convention and `inreg` arguments. The checks show s4/s5/s6 copied into v0-v2
; and then the glc form of flat_atomic_xor writing v0.
3082 define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
3083 ; GCN1-LABEL: flat_atomic_xor_i32_ret_scalar:
3085 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3086 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3087 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3088 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3089 ; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3090 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3091 ; GCN1-NEXT: buffer_wbinvl1_vol
3092 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3094 ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar:
3096 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3097 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3098 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3099 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3100 ; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3101 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3102 ; GCN2-NEXT: buffer_wbinvl1_vol
3103 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3105 ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar:
3107 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3108 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3109 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3110 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3111 ; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3112 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3113 ; GCN3-NEXT: buffer_wbinvl1_vol
3114 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3115 %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
; Returning amdgpu_gfx / `inreg` variant of the i32 `atomicrmw xor` test at
; %out + 16 bytes. bonaire and tonga form the address with s_add_u32 /
; s_addc_u32 into s34/s35; gfx900 uses offset:16. All targets are expected to
; use the glc form writing v0.
3119 define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3120 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3122 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3123 ; GCN1-NEXT: s_add_u32 s34, s4, 16
3124 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3125 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
3126 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
3127 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3128 ; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3129 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3130 ; GCN1-NEXT: buffer_wbinvl1_vol
3131 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3133 ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3135 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3136 ; GCN2-NEXT: s_add_u32 s34, s4, 16
3137 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3138 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3139 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
3140 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3141 ; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3142 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3143 ; GCN2-NEXT: buffer_wbinvl1_vol
3144 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3146 ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
3148 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3149 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3150 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3151 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3152 ; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
3153 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3154 ; GCN3-NEXT: buffer_wbinvl1_vol
3155 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3156 %gep = getelementptr i32, ptr %out, i32 4
3157 %result = atomicrmw xor ptr %gep, i32 %in seq_cst
; i32 `atomicrmw xor` (seq_cst) at %out + 16 bytes, result unused, with
; !amdgpu.no.remote.memory attached. The checks expect a single
; flat_atomic_xor, with offset:16 on gfx900 and an explicit address add on
; bonaire and tonga.
; NOTE(review): this name omits the `atomic` infix that the neighbouring
; flat_atomic_xor_* tests use. It is left unchanged here because a rename
; would also have to update the three LABEL checks below.
3161 define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
3162 ; GCN1-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3164 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3165 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
3166 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3167 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
3168 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3169 ; GCN1-NEXT: buffer_wbinvl1_vol
3170 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3172 ; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3174 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3175 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3176 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3177 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
3178 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3179 ; GCN2-NEXT: buffer_wbinvl1_vol
3180 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3182 ; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory:
3184 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3185 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
3186 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3187 ; GCN3-NEXT: buffer_wbinvl1_vol
3188 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3189 %gep = getelementptr i32, ptr %out, i64 4
3190 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning i32 `atomicrmw xor` (seq_cst) at %out + 16 bytes with
; !amdgpu.no.remote.memory attached. The checks expect a single glc
; flat_atomic_xor writing v0; gfx900 uses offset:16, bonaire and tonga add 16
; to the address first.
3194 define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
3195 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3197 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3198 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
3199 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3200 ; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3201 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3202 ; GCN1-NEXT: buffer_wbinvl1_vol
3203 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3205 ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3207 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3208 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3209 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3210 ; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
3211 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3212 ; GCN2-NEXT: buffer_wbinvl1_vol
3213 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3215 ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory:
3217 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3218 ; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
3219 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3220 ; GCN3-NEXT: buffer_wbinvl1_vol
3221 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3222 %gep = getelementptr i32, ptr %out, i64 4
3223 %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
3227 ; ---------------------------------------------------------------------
3229 ; ---------------------------------------------------------------------
; i32 `atomicrmw max` (seq_cst) directly on the flat pointer argument, result
; unused. The checks for all three targets show a compare-and-swap loop rather
; than a single atomic instruction: an initial flat_load_dword, v_max_i32 to
; compute the new value, flat_atomic_cmpswap (glc) to publish it, and a branch
; back to the loop header while the returned value differs from the expected
; one.
3231 define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
3232 ; GCN1-LABEL: flat_atomic_max_i32_noret:
3234 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3235 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
3236 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3237 ; GCN1-NEXT: .LBB80_1: ; %atomicrmw.start
3238 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3239 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3240 ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2
3241 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3242 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3243 ; GCN1-NEXT: buffer_wbinvl1_vol
3244 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3245 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3246 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
3247 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3248 ; GCN1-NEXT: s_cbranch_execnz .LBB80_1
3249 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3250 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3251 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3253 ; GCN2-LABEL: flat_atomic_max_i32_noret:
3255 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3256 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
3257 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3258 ; GCN2-NEXT: .LBB80_1: ; %atomicrmw.start
3259 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3260 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3261 ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2
3262 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3263 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3264 ; GCN2-NEXT: buffer_wbinvl1_vol
3265 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3266 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3267 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
3268 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3269 ; GCN2-NEXT: s_cbranch_execnz .LBB80_1
3270 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3271 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3272 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3274 ; GCN3-LABEL: flat_atomic_max_i32_noret:
3276 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3277 ; GCN3-NEXT: flat_load_dword v4, v[0:1]
3278 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3279 ; GCN3-NEXT: .LBB80_1: ; %atomicrmw.start
3280 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3281 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3282 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
3283 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3284 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3285 ; GCN3-NEXT: buffer_wbinvl1_vol
3286 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3287 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3288 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
3289 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3290 ; GCN3-NEXT: s_cbranch_execnz .LBB80_1
3291 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3292 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3293 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3294 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
; i32 `atomicrmw max` (seq_cst) at %out + 4 elements (16 bytes), result unused.
; The checks show the same compare-and-swap loop shape as the zero-offset max
; test (flat_load_dword, v_max_i32, flat_atomic_cmpswap with glc, retry
; branch). gfx900 encodes offset:16 on both the load and the cmpswap; bonaire
; and tonga add 16 to the address once, ahead of the loop.
3298 define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
3299 ; GCN1-LABEL: flat_atomic_max_i32_noret_offset:
3301 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3302 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
3303 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3304 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
3305 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3306 ; GCN1-NEXT: .LBB81_1: ; %atomicrmw.start
3307 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3308 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3309 ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2
3310 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3311 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3312 ; GCN1-NEXT: buffer_wbinvl1_vol
3313 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3314 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3315 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
3316 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3317 ; GCN1-NEXT: s_cbranch_execnz .LBB81_1
3318 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3319 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3320 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3322 ; GCN2-LABEL: flat_atomic_max_i32_noret_offset:
3324 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3325 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
3326 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3327 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
3328 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3329 ; GCN2-NEXT: .LBB81_1: ; %atomicrmw.start
3330 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3331 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3332 ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2
3333 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3334 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3335 ; GCN2-NEXT: buffer_wbinvl1_vol
3336 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3337 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3338 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
3339 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3340 ; GCN2-NEXT: s_cbranch_execnz .LBB81_1
3341 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3342 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3343 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3345 ; GCN3-LABEL: flat_atomic_max_i32_noret_offset:
3347 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3348 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
3349 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3350 ; GCN3-NEXT: .LBB81_1: ; %atomicrmw.start
3351 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3352 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3353 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
3354 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
3355 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3356 ; GCN3-NEXT: buffer_wbinvl1_vol
3357 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3358 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3359 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
3360 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3361 ; GCN3-NEXT: s_cbranch_execnz .LBB81_1
3362 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3363 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3364 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3365 %gep = getelementptr i32, ptr %out, i32 4
3366 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) directly on %ptr, in a function declared to
; return i32.
; The autogenerated checks expect a compare-and-swap retry loop that copies the
; last observed value into v4 at the top of each iteration, computes
; v_max_i32 into v3, and issues flat_atomic_cmpswap until the returned value
; equals v4. After the loop all three targets move v3 into v0 before returning.
3370 define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
3371 ; GCN1-LABEL: flat_atomic_max_i32_ret:
3373 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3374 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
3375 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3376 ; GCN1-NEXT: .LBB82_1: ; %atomicrmw.start
3377 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3378 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3379 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
3380 ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2
3381 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3382 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3383 ; GCN1-NEXT: buffer_wbinvl1_vol
3384 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3385 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3386 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3387 ; GCN1-NEXT: s_cbranch_execnz .LBB82_1
3388 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3389 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3390 ; GCN1-NEXT: v_mov_b32_e32 v0, v3
3391 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3393 ; GCN2-LABEL: flat_atomic_max_i32_ret:
3395 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3396 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
3397 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3398 ; GCN2-NEXT: .LBB82_1: ; %atomicrmw.start
3399 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3400 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3401 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
3402 ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2
3403 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3404 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3405 ; GCN2-NEXT: buffer_wbinvl1_vol
3406 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3407 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3408 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3409 ; GCN2-NEXT: s_cbranch_execnz .LBB82_1
3410 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3411 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3412 ; GCN2-NEXT: v_mov_b32_e32 v0, v3
3413 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3415 ; GCN3-LABEL: flat_atomic_max_i32_ret:
3417 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3418 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
3419 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3420 ; GCN3-NEXT: .LBB82_1: ; %atomicrmw.start
3421 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3422 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3423 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
3424 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
3425 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
3426 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3427 ; GCN3-NEXT: buffer_wbinvl1_vol
3428 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3429 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3430 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3431 ; GCN3-NEXT: s_cbranch_execnz .LBB82_1
3432 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3433 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3434 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
3435 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3436 %result = atomicrmw max ptr %ptr, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) at i32 element index 4 (byte offset 16) from
; %out, in a function declared to return i32.
; The autogenerated checks expect a compare-and-swap retry loop on all targets.
; bonaire and tonga compute the address into v[3:4] with an explicit add of 16
; and keep the loop value in v0, so no move is needed before s_setpc_b64.
; gfx900 uses offset:16 on the flat instructions, keeps the loop value in v3,
; and moves it into v0 after the loop.
3440 define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
3441 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset:
3443 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
3445 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
3446 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
3447 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
3448 ; GCN1-NEXT: .LBB83_1: ; %atomicrmw.start
3449 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3450 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3451 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
3452 ; GCN1-NEXT: v_max_i32_e32 v0, v1, v2
3453 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3454 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3455 ; GCN1-NEXT: buffer_wbinvl1_vol
3456 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3457 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3458 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
3459 ; GCN1-NEXT: s_cbranch_execnz .LBB83_1
3460 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3461 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3462 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3464 ; GCN2-LABEL: flat_atomic_max_i32_ret_offset:
3466 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3467 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
3468 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
3469 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
3470 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
3471 ; GCN2-NEXT: .LBB83_1: ; %atomicrmw.start
3472 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3473 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3474 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
3475 ; GCN2-NEXT: v_max_i32_e32 v0, v1, v2
3476 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
3477 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3478 ; GCN2-NEXT: buffer_wbinvl1_vol
3479 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3480 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3481 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
3482 ; GCN2-NEXT: s_cbranch_execnz .LBB83_1
3483 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3484 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3485 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3487 ; GCN3-LABEL: flat_atomic_max_i32_ret_offset:
3489 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3490 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
3491 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
3492 ; GCN3-NEXT: .LBB83_1: ; %atomicrmw.start
3493 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3494 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3495 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
3496 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
3497 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
3498 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3499 ; GCN3-NEXT: buffer_wbinvl1_vol
3500 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
3501 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3502 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
3503 ; GCN3-NEXT: s_cbranch_execnz .LBB83_1
3504 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3505 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3506 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
3507 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Four i32 elements past %out, which the checks above show as a 16-byte offset.
3508 %gep = getelementptr i32, ptr %out, i32 4
3509 %result = atomicrmw max ptr %gep, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) with both arguments marked inreg under the
; amdgpu_gfx calling convention; the value produced has no visible use.
; In the autogenerated checks the pointer is read from s4/s5 and copied into
; v[0:1], and s6 is used directly as the scalar operand of v_max_i32.
; The compare-and-swap retry loop accumulates in s[34:35] the lanes whose
; cmpswap returned the expected value and clears them from exec each pass.
3513 define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
3514 ; GCN1-LABEL: flat_atomic_max_i32_noret_scalar:
3516 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3517 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3518 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3519 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
3520 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3521 ; GCN1-NEXT: .LBB84_1: ; %atomicrmw.start
3522 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3523 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3524 ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
3525 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3526 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3527 ; GCN1-NEXT: buffer_wbinvl1_vol
3528 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3529 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3530 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
3531 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3532 ; GCN1-NEXT: s_cbranch_execnz .LBB84_1
3533 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3534 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3535 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3537 ; GCN2-LABEL: flat_atomic_max_i32_noret_scalar:
3539 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3540 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3541 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3542 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
3543 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3544 ; GCN2-NEXT: .LBB84_1: ; %atomicrmw.start
3545 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3546 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3547 ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
3548 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3549 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3550 ; GCN2-NEXT: buffer_wbinvl1_vol
3551 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3552 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3553 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
3554 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3555 ; GCN2-NEXT: s_cbranch_execnz .LBB84_1
3556 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3557 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3558 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3560 ; GCN3-LABEL: flat_atomic_max_i32_noret_scalar:
3562 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3563 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3564 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3565 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
3566 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3567 ; GCN3-NEXT: .LBB84_1: ; %atomicrmw.start
3568 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3569 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3570 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
3571 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3572 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3573 ; GCN3-NEXT: buffer_wbinvl1_vol
3574 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3575 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3576 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
3577 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3578 ; GCN3-NEXT: s_cbranch_execnz .LBB84_1
3579 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3580 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3581 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3582 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) at i32 element index 4 (byte offset 16) from
; an inreg pointer under the amdgpu_gfx calling convention; the value produced
; has no visible use.
; In the autogenerated checks bonaire and tonga add 16 with s_add_u32 /
; s_addc_u32 into s34/s35, copy that address into v[0:1], and then reuse
; s[34:35] as the loop's accumulated-lanes mask. gfx900 copies s4/s5 unchanged
; and puts offset:16 on the flat instructions.
3586 define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3587 ; GCN1-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3589 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3590 ; GCN1-NEXT: s_add_u32 s34, s4, 16
3591 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3592 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
3593 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
3594 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
3595 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3596 ; GCN1-NEXT: .LBB85_1: ; %atomicrmw.start
3597 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3598 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3599 ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
3600 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3601 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3602 ; GCN1-NEXT: buffer_wbinvl1_vol
3603 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3604 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3605 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
3606 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3607 ; GCN1-NEXT: s_cbranch_execnz .LBB85_1
3608 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3609 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3610 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3612 ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3614 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3615 ; GCN2-NEXT: s_add_u32 s34, s4, 16
3616 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3617 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3618 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
3619 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
3620 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3621 ; GCN2-NEXT: .LBB85_1: ; %atomicrmw.start
3622 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3623 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3624 ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
3625 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3626 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3627 ; GCN2-NEXT: buffer_wbinvl1_vol
3628 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3629 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3630 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
3631 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3632 ; GCN2-NEXT: s_cbranch_execnz .LBB85_1
3633 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3634 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3635 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3637 ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar:
3639 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3640 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3641 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3642 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
3643 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3644 ; GCN3-NEXT: .LBB85_1: ; %atomicrmw.start
3645 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3646 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3647 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
3648 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3649 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3650 ; GCN3-NEXT: buffer_wbinvl1_vol
3651 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3652 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3653 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
3654 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3655 ; GCN3-NEXT: s_cbranch_execnz .LBB85_1
3656 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3657 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3658 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Four i32 elements past %out, which the checks above show as a 16-byte offset.
3659 %gep = getelementptr i32, ptr %out, i32 4
3660 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) on an inreg pointer under the amdgpu_gfx
; calling convention, in a function declared to return i32.
; In the autogenerated checks the initial flat_load_dword writes v0 while using
; v[0:1] as its address, so the address is materialised a second time from
; s4/s5 into v[1:2] for use inside the loop. The cmpswap result is left in v0,
; and no extra move precedes s_setpc_b64.
3664 define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
3665 ; GCN1-LABEL: flat_atomic_max_i32_ret_scalar:
3667 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3669 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3670 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
3671 ; GCN1-NEXT: v_mov_b32_e32 v1, s4
3672 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3673 ; GCN1-NEXT: v_mov_b32_e32 v2, s5
3674 ; GCN1-NEXT: .LBB86_1: ; %atomicrmw.start
3675 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3676 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3677 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
3678 ; GCN1-NEXT: v_max_i32_e32 v3, s6, v4
3679 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3680 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3681 ; GCN1-NEXT: buffer_wbinvl1_vol
3682 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3683 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3684 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3685 ; GCN1-NEXT: s_cbranch_execnz .LBB86_1
3686 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3687 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3688 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3690 ; GCN2-LABEL: flat_atomic_max_i32_ret_scalar:
3692 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3693 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3694 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3695 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
3696 ; GCN2-NEXT: v_mov_b32_e32 v1, s4
3697 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3698 ; GCN2-NEXT: v_mov_b32_e32 v2, s5
3699 ; GCN2-NEXT: .LBB86_1: ; %atomicrmw.start
3700 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3701 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3702 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
3703 ; GCN2-NEXT: v_max_i32_e32 v3, s6, v4
3704 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3705 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3706 ; GCN2-NEXT: buffer_wbinvl1_vol
3707 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3708 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3709 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3710 ; GCN2-NEXT: s_cbranch_execnz .LBB86_1
3711 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3712 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3713 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3715 ; GCN3-LABEL: flat_atomic_max_i32_ret_scalar:
3717 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3719 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3720 ; GCN3-NEXT: flat_load_dword v0, v[0:1]
3721 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
3722 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3723 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
3724 ; GCN3-NEXT: .LBB86_1: ; %atomicrmw.start
3725 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3726 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3727 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
3728 ; GCN3-NEXT: v_max_i32_e32 v3, s6, v4
3729 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3730 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3731 ; GCN3-NEXT: buffer_wbinvl1_vol
3732 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3733 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3734 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3735 ; GCN3-NEXT: s_cbranch_execnz .LBB86_1
3736 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3737 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3738 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3739 %result = atomicrmw max ptr %ptr, i32 %in seq_cst
; Signed-max atomicrmw (seq_cst) at i32 element index 4 (byte offset 16) from
; an inreg pointer under the amdgpu_gfx calling convention, in a function
; declared to return i32.
; In the autogenerated checks bonaire and tonga add 16 with s_add_u32 /
; s_addc_u32 and place the address in v[1:2]. gfx900 uses offset:16 instead and
; materialises the base address twice, because its initial load overwrites v0.
; On every target the cmpswap result is left in v0.
3743 define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
3744 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3746 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3747 ; GCN1-NEXT: s_add_u32 s34, s4, 16
3748 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3749 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
3750 ; GCN1-NEXT: v_mov_b32_e32 v2, s35
3751 ; GCN1-NEXT: flat_load_dword v0, v[1:2]
3752 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
3753 ; GCN1-NEXT: .LBB87_1: ; %atomicrmw.start
3754 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3755 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3756 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
3757 ; GCN1-NEXT: v_max_i32_e32 v3, s6, v4
3758 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3759 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3760 ; GCN1-NEXT: buffer_wbinvl1_vol
3761 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3762 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3763 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
3764 ; GCN1-NEXT: s_cbranch_execnz .LBB87_1
3765 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3766 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
3767 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3769 ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3771 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3772 ; GCN2-NEXT: s_add_u32 s34, s4, 16
3773 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3774 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
3775 ; GCN2-NEXT: v_mov_b32_e32 v2, s35
3776 ; GCN2-NEXT: flat_load_dword v0, v[1:2]
3777 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
3778 ; GCN2-NEXT: .LBB87_1: ; %atomicrmw.start
3779 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3780 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3781 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
3782 ; GCN2-NEXT: v_max_i32_e32 v3, s6, v4
3783 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
3784 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3785 ; GCN2-NEXT: buffer_wbinvl1_vol
3786 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3787 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3788 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
3789 ; GCN2-NEXT: s_cbranch_execnz .LBB87_1
3790 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3791 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
3792 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3794 ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar:
3796 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3797 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3798 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3799 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
3800 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
3801 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
3802 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
3803 ; GCN3-NEXT: .LBB87_1: ; %atomicrmw.start
3804 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3805 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3806 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
3807 ; GCN3-NEXT: v_max_i32_e32 v3, s6, v4
3808 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
3809 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3810 ; GCN3-NEXT: buffer_wbinvl1_vol
3811 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
3812 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
3813 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
3814 ; GCN3-NEXT: s_cbranch_execnz .LBB87_1
3815 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3816 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
3817 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Four i32 elements past %out, which the checks above show as a 16-byte offset.
3818 %gep = getelementptr i32, ptr %out, i32 4
3819 %result = atomicrmw max ptr %gep, i32 %in seq_cst
; Kernel variant: signed-max atomicrmw (seq_cst) at %out[%index + 4], with the
; value produced by the atomicrmw having no visible use.
; In the autogenerated checks the three arguments arrive in one
; s_load_dwordx4 through s[4:5] (offset 0x9 on bonaire, 0x24 on tonga and
; gfx900). The i32 index in s3 is sign-extended with s_ashr_i32 ... 31 and
; shifted left by 2 before being added to the base pointer in s[0:1].
; bonaire and tonga then add the constant 16 with scalar adds, while gfx900
; uses offset:16 on the flat instructions. The retry loop falls through to
; s_endpgm.
3823 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
3824 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
3825 ; GCN1: ; %bb.0: ; %entry
3826 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3827 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3828 ; GCN1-NEXT: s_ashr_i32 s5, s3, 31
3829 ; GCN1-NEXT: s_mov_b32 s4, s3
3830 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3831 ; GCN1-NEXT: s_add_u32 s0, s0, s4
3832 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
3833 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3834 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3835 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3836 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3837 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
3838 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
3839 ; GCN1-NEXT: .LBB88_1: ; %atomicrmw.start
3840 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3841 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3842 ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3
3843 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3844 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3845 ; GCN1-NEXT: buffer_wbinvl1_vol
3846 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3847 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3848 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
3849 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
3850 ; GCN1-NEXT: s_cbranch_execnz .LBB88_1
3851 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3852 ; GCN1-NEXT: s_endpgm
3854 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
3855 ; GCN2: ; %bb.0: ; %entry
3856 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3857 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3858 ; GCN2-NEXT: s_ashr_i32 s5, s3, 31
3859 ; GCN2-NEXT: s_mov_b32 s4, s3
3860 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3861 ; GCN2-NEXT: s_add_u32 s0, s0, s4
3862 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
3863 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3864 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3865 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3866 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3867 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
3868 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
3869 ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
3870 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3871 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3872 ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
3873 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3874 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3875 ; GCN2-NEXT: buffer_wbinvl1_vol
3876 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3877 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3878 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
3879 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
3880 ; GCN2-NEXT: s_cbranch_execnz .LBB88_1
3881 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3882 ; GCN2-NEXT: s_endpgm
3884 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
3885 ; GCN3: ; %bb.0: ; %entry
3886 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3887 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3888 ; GCN3-NEXT: s_ashr_i32 s5, s3, 31
3889 ; GCN3-NEXT: s_mov_b32 s4, s3
3890 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3891 ; GCN3-NEXT: s_add_u32 s0, s0, s4
3892 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
3893 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3894 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3895 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
3896 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
3897 ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start
3898 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
3899 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3900 ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
3901 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
3902 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3903 ; GCN3-NEXT: buffer_wbinvl1_vol
3904 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3905 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3906 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
3907 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
3908 ; GCN3-NEXT: s_cbranch_execnz .LBB88_1
3909 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
3910 ; GCN3-NEXT: s_endpgm
; Variable element index first, then a constant 4-element step.
3912 %ptr = getelementptr i32, ptr %out, i32 %index
3913 %gep = getelementptr i32, ptr %ptr, i32 4
3914 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
; Kernel variant: signed-max atomicrmw (seq_cst) at %out[%index + 4] whose
; result is stored to %out2.
; In the autogenerated checks the two pointers arrive in s[0:3] via
; s_load_dwordx4, and %in / %index arrive in s6 / s7 via a separate
; s_load_dwordx2 (offsets 0x9 and 0xd on bonaire, 0x24 and 0x34 on tonga and
; gfx900). The index is sign-extended and scaled by 4 before being added to the
; base. bonaire and tonga add the constant 16 with scalar adds, while gfx900
; uses offset:16. After the retry loop exec is restored and the final value in
; v2 is written through s[2:3] with flat_store_dword.
3918 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
3919 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
3920 ; GCN1: ; %bb.0: ; %entry
3921 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
3922 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
3923 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3924 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
3925 ; GCN1-NEXT: s_mov_b32 s4, s7
3926 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3927 ; GCN1-NEXT: s_add_u32 s0, s0, s4
3928 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
3929 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3930 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3931 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3932 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3933 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
3934 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
3935 ; GCN1-NEXT: .LBB89_1: ; %atomicrmw.start
3936 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
3937 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3938 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
3939 ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
3940 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3941 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3942 ; GCN1-NEXT: buffer_wbinvl1_vol
3943 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3944 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3945 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
3946 ; GCN1-NEXT: s_cbranch_execnz .LBB89_1
3947 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
3948 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
3949 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
3950 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
3951 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3952 ; GCN1-NEXT: s_endpgm
3954 ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
3955 ; GCN2: ; %bb.0: ; %entry
3956 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
3957 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3958 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3959 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
3960 ; GCN2-NEXT: s_mov_b32 s4, s7
3961 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3962 ; GCN2-NEXT: s_add_u32 s0, s0, s4
3963 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
3964 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3965 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3966 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3967 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3968 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
3969 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
3970 ; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
3971 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
3972 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3973 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
3974 ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
3975 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
3976 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3977 ; GCN2-NEXT: buffer_wbinvl1_vol
3978 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
3979 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3980 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
3981 ; GCN2-NEXT: s_cbranch_execnz .LBB89_1
3982 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
3983 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
3984 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
3985 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
3986 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3987 ; GCN2-NEXT: s_endpgm
3989 ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
3990 ; GCN3: ; %bb.0: ; %entry
3991 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
3992 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
3993 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3994 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
3995 ; GCN3-NEXT: s_mov_b32 s4, s7
3996 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3997 ; GCN3-NEXT: s_add_u32 s0, s0, s4
3998 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
3999 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4000 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4001 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
4002 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
4003 ; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
4004 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4005 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4006 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
4007 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
4008 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4009 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4010 ; GCN3-NEXT: buffer_wbinvl1_vol
4011 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4012 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4013 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
4014 ; GCN3-NEXT: s_cbranch_execnz .LBB89_1
4015 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4016 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
4017 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
4018 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
4019 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4020 ; GCN3-NEXT: s_endpgm
; Variable element index first, then a constant 4-element step.
4022 %ptr = getelementptr i32, ptr %out, i32 %index
4023 %gep = getelementptr i32, ptr %ptr, i32 4
4024 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
; Storing the atomicrmw result keeps it live, unlike the void-result variants.
4025 store i32 %tmp0, ptr %out2
; Kernel variant: signed-max atomicrmw (seq_cst) at %out[%index] with no
; additional constant offset; the value produced has no visible use.
; In the autogenerated checks the arguments arrive in one s_load_dwordx4
; through s[4:5] (offset 0x9 on bonaire, 0x24 on tonga and gfx900). The i32
; index in s3 is sign-extended, shifted left by 2, and added to the base in
; s[0:1]. Apart from that load offset the expected code is the same on all
; three targets, and the retry loop falls through to s_endpgm.
4029 define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) {
4030 ; GCN1-LABEL: atomic_max_i32_addr64:
4031 ; GCN1: ; %bb.0: ; %entry
4032 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4033 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4034 ; GCN1-NEXT: s_ashr_i32 s5, s3, 31
4035 ; GCN1-NEXT: s_mov_b32 s4, s3
4036 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4037 ; GCN1-NEXT: s_add_u32 s0, s0, s4
4038 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
4039 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4040 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4041 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4042 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
4043 ; GCN1-NEXT: .LBB90_1: ; %atomicrmw.start
4044 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4045 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4046 ; GCN1-NEXT: v_max_i32_e32 v2, s2, v3
4047 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4048 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4049 ; GCN1-NEXT: buffer_wbinvl1_vol
4050 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4051 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4052 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
4053 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
4054 ; GCN1-NEXT: s_cbranch_execnz .LBB90_1
4055 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4056 ; GCN1-NEXT: s_endpgm
4058 ; GCN2-LABEL: atomic_max_i32_addr64:
4059 ; GCN2: ; %bb.0: ; %entry
4060 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4061 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4062 ; GCN2-NEXT: s_ashr_i32 s5, s3, 31
4063 ; GCN2-NEXT: s_mov_b32 s4, s3
4064 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4065 ; GCN2-NEXT: s_add_u32 s0, s0, s4
4066 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
4067 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4068 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4069 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
4070 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
4071 ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
4072 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4073 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4074 ; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
4075 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4076 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4077 ; GCN2-NEXT: buffer_wbinvl1_vol
4078 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4079 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4080 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
4081 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
4082 ; GCN2-NEXT: s_cbranch_execnz .LBB90_1
4083 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4084 ; GCN2-NEXT: s_endpgm
4086 ; GCN3-LABEL: atomic_max_i32_addr64:
4087 ; GCN3: ; %bb.0: ; %entry
4088 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4089 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4090 ; GCN3-NEXT: s_ashr_i32 s5, s3, 31
4091 ; GCN3-NEXT: s_mov_b32 s4, s3
4092 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4093 ; GCN3-NEXT: s_add_u32 s0, s0, s4
4094 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
4095 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4096 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4097 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
4098 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
4099 ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start
4100 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4101 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4102 ; GCN3-NEXT: v_max_i32_e32 v2, s2, v3
4103 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4104 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4105 ; GCN3-NEXT: buffer_wbinvl1_vol
4106 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4107 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4108 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
4109 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
4110 ; GCN3-NEXT: s_cbranch_execnz .LBB90_1
4111 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4112 ; GCN3-NEXT: s_endpgm
4114 %ptr = getelementptr i32, ptr %out, i32 %index
4115 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
; Kernel test: seq_cst signed `atomicrmw max` on the flat pointer %out indexed
; by %index, with the value returned by the atomic stored through %out2.
; All three runs (bonaire, tonga, gfx900) check the same shape: the element
; address is formed with a scalar shift-by-2 and 64-bit add, the initial value
; is read with flat_load_dword, and a v_max_i32 + flat_atomic_cmpswap loop
; repeats until the value returned by the cmpswap equals the expected one. The
; final value is then written with flat_store_dword. Only the kernel-argument
; load offsets differ between runs (0xd and 0x9 on bonaire, 0x34 and 0x24 on
; tonga and gfx900).
4119 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
4120 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
4121 ; GCN1: ; %bb.0: ; %entry
4122 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
4123 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4124 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4125 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
4126 ; GCN1-NEXT: s_mov_b32 s4, s7
4127 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4128 ; GCN1-NEXT: s_add_u32 s0, s0, s4
4129 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
4130 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4131 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4132 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
4133 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
4134 ; GCN1-NEXT: .LBB91_1: ; %atomicrmw.start
4135 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4136 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4137 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
4138 ; GCN1-NEXT: v_max_i32_e32 v2, s6, v3
4139 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4140 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4141 ; GCN1-NEXT: buffer_wbinvl1_vol
4142 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4143 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4144 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
4145 ; GCN1-NEXT: s_cbranch_execnz .LBB91_1
4146 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4147 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
4148 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
4149 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
4150 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4151 ; GCN1-NEXT: s_endpgm
4153 ; GCN2-LABEL: atomic_max_i32_ret_addr64:
4154 ; GCN2: ; %bb.0: ; %entry
4155 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
4156 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4157 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4158 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
4159 ; GCN2-NEXT: s_mov_b32 s4, s7
4160 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4161 ; GCN2-NEXT: s_add_u32 s0, s0, s4
4162 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
4163 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4164 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4165 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
4166 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
4167 ; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
4168 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4169 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4170 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
4171 ; GCN2-NEXT: v_max_i32_e32 v2, s6, v3
4172 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4173 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4174 ; GCN2-NEXT: buffer_wbinvl1_vol
4175 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4176 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4177 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
4178 ; GCN2-NEXT: s_cbranch_execnz .LBB91_1
4179 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4180 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
4181 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
4182 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
4183 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4184 ; GCN2-NEXT: s_endpgm
4186 ; GCN3-LABEL: atomic_max_i32_ret_addr64:
4187 ; GCN3: ; %bb.0: ; %entry
4188 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
4189 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
4190 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4191 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
4192 ; GCN3-NEXT: s_mov_b32 s4, s7
4193 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4194 ; GCN3-NEXT: s_add_u32 s0, s0, s4
4195 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
4196 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4197 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4198 ; GCN3-NEXT: flat_load_dword v2, v[0:1]
4199 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
4200 ; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
4201 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4202 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4203 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
4204 ; GCN3-NEXT: v_max_i32_e32 v2, s6, v3
4205 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4206 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4207 ; GCN3-NEXT: buffer_wbinvl1_vol
4208 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4209 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4210 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
4211 ; GCN3-NEXT: s_cbranch_execnz .LBB91_1
4212 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4213 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
4214 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
4215 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
4216 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4217 ; GCN3-NEXT: s_endpgm
4219 %ptr = getelementptr i32, ptr %out, i32 %index
4220 %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
4221 store i32 %tmp0, ptr %out2
; seq_cst signed `atomicrmw max` at %out + 16 bytes (i32 element 4), result
; unused, with !amdgpu.no.remote.memory metadata attached to the atomic.
; The checks for all three runs expect the flat_load_dword / v_max_i32 /
; flat_atomic_cmpswap retry loop. bonaire and tonga materialize the address
; with a 64-bit vector add of 16, while gfx900 uses the offset:16 field on the
; load and the cmpswap.
; NOTE(review): this name lacks the `atomic_` infix used by the returning
; variant (@flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory) - confirm
; whether that is intentional before renaming, since the label checks below
; must match the function name.
4225 define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
4226 ; GCN1-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4228 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4229 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
4230 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4231 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
4232 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4233 ; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start
4234 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4235 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4236 ; GCN1-NEXT: v_max_i32_e32 v3, v4, v2
4237 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4238 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4239 ; GCN1-NEXT: buffer_wbinvl1_vol
4240 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4241 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4242 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
4243 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4244 ; GCN1-NEXT: s_cbranch_execnz .LBB92_1
4245 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4246 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4247 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4249 ; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4251 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4252 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
4253 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4254 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
4255 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4256 ; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start
4257 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4258 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4259 ; GCN2-NEXT: v_max_i32_e32 v3, v4, v2
4260 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4261 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4262 ; GCN2-NEXT: buffer_wbinvl1_vol
4263 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4264 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4265 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
4266 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4267 ; GCN2-NEXT: s_cbranch_execnz .LBB92_1
4268 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4269 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4270 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4272 ; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
4274 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4275 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
4276 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4277 ; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start
4278 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4279 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4280 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
4281 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4282 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4283 ; GCN3-NEXT: buffer_wbinvl1_vol
4284 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4285 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4286 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4287 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4288 ; GCN3-NEXT: s_cbranch_execnz .LBB92_1
4289 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4290 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4291 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4292 %gep = getelementptr i32, ptr %out, i64 4
4293 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning form of the signed max test with !amdgpu.no.remote.memory: seq_cst
; `atomicrmw max` at %out + 16 bytes (i32 element 4).
; All three runs expect the flat_load_dword / v_max_i32 / flat_atomic_cmpswap
; retry loop. bonaire and tonga compute the address into v[3:4] with a 64-bit
; add of 16 and leave the cmpswap result in v0. gfx900 keeps the pointer in
; v[0:1], uses offset:16 on the memory instructions, and copies the result
; from v3 into v0 after the loop.
4297 define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
4298 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4300 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4301 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
4302 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4303 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
4304 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4305 ; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start
4306 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4307 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4308 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
4309 ; GCN1-NEXT: v_max_i32_e32 v0, v1, v2
4310 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4311 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4312 ; GCN1-NEXT: buffer_wbinvl1_vol
4313 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4314 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4315 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4316 ; GCN1-NEXT: s_cbranch_execnz .LBB93_1
4317 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4318 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4319 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4321 ; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4323 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4324 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
4325 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4326 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
4327 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4328 ; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start
4329 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4330 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4331 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
4332 ; GCN2-NEXT: v_max_i32_e32 v0, v1, v2
4333 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4334 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4335 ; GCN2-NEXT: buffer_wbinvl1_vol
4336 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4337 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4338 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4339 ; GCN2-NEXT: s_cbranch_execnz .LBB93_1
4340 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4341 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4342 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4344 ; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
4346 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4347 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
4348 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4349 ; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start
4350 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4351 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4352 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4353 ; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
4354 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4355 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4356 ; GCN3-NEXT: buffer_wbinvl1_vol
4357 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4358 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4359 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4360 ; GCN3-NEXT: s_cbranch_execnz .LBB93_1
4361 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4362 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4363 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
4364 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4365 %gep = getelementptr i32, ptr %out, i64 4
4366 %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
4370 ; ---------------------------------------------------------------------
4372 ; ---------------------------------------------------------------------
; seq_cst unsigned `atomicrmw umax` directly on the flat pointer %ptr, result
; unused. The checked instruction sequence is the same on bonaire, tonga and
; gfx900: flat_load_dword of the current value, then a loop that computes
; v_max_u32 of the loaded value and %in (v2), attempts flat_atomic_cmpswap,
; and retries (exec masked by s[4:5]) until the returned value matches the
; expected one.
4374 define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
4375 ; GCN1-LABEL: flat_atomic_umax_i32_noret:
4377 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4378 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
4379 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4380 ; GCN1-NEXT: .LBB94_1: ; %atomicrmw.start
4381 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4382 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4383 ; GCN1-NEXT: v_max_u32_e32 v3, v4, v2
4384 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4385 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4386 ; GCN1-NEXT: buffer_wbinvl1_vol
4387 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4388 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4389 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
4390 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4391 ; GCN1-NEXT: s_cbranch_execnz .LBB94_1
4392 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4393 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4394 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4396 ; GCN2-LABEL: flat_atomic_umax_i32_noret:
4398 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4399 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
4400 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4401 ; GCN2-NEXT: .LBB94_1: ; %atomicrmw.start
4402 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4403 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4404 ; GCN2-NEXT: v_max_u32_e32 v3, v4, v2
4405 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4406 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4407 ; GCN2-NEXT: buffer_wbinvl1_vol
4408 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4409 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4410 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
4411 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4412 ; GCN2-NEXT: s_cbranch_execnz .LBB94_1
4413 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4414 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4415 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4417 ; GCN3-LABEL: flat_atomic_umax_i32_noret:
4419 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4420 ; GCN3-NEXT: flat_load_dword v4, v[0:1]
4421 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4422 ; GCN3-NEXT: .LBB94_1: ; %atomicrmw.start
4423 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4424 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4425 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
4426 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4427 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4428 ; GCN3-NEXT: buffer_wbinvl1_vol
4429 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4430 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4431 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4432 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4433 ; GCN3-NEXT: s_cbranch_execnz .LBB94_1
4434 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4435 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4436 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4437 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
; seq_cst unsigned `atomicrmw umax` at %out + 16 bytes (i32 element 4), result
; unused. All three runs expect the flat_load_dword / v_max_u32 /
; flat_atomic_cmpswap retry loop. bonaire and tonga first add 16 to the 64-bit
; pointer in v[0:1] (v_add_i32 on bonaire, v_add_u32 on tonga, followed by
; v_addc_u32), while gfx900 encodes the displacement as offset:16 on the load
; and the cmpswap.
4441 define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
4442 ; GCN1-LABEL: flat_atomic_umax_i32_noret_offset:
4444 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4445 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
4446 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4447 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
4448 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4449 ; GCN1-NEXT: .LBB95_1: ; %atomicrmw.start
4450 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4451 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4452 ; GCN1-NEXT: v_max_u32_e32 v3, v4, v2
4453 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4454 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4455 ; GCN1-NEXT: buffer_wbinvl1_vol
4456 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4457 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4458 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
4459 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4460 ; GCN1-NEXT: s_cbranch_execnz .LBB95_1
4461 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4462 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4463 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4465 ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset:
4467 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4468 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
4469 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4470 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
4471 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4472 ; GCN2-NEXT: .LBB95_1: ; %atomicrmw.start
4473 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4474 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4475 ; GCN2-NEXT: v_max_u32_e32 v3, v4, v2
4476 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4477 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4478 ; GCN2-NEXT: buffer_wbinvl1_vol
4479 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4480 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4481 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
4482 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4483 ; GCN2-NEXT: s_cbranch_execnz .LBB95_1
4484 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4485 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4486 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4488 ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset:
4490 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4491 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
4492 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4493 ; GCN3-NEXT: .LBB95_1: ; %atomicrmw.start
4494 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4495 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4496 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
4497 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4498 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4499 ; GCN3-NEXT: buffer_wbinvl1_vol
4500 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4501 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4502 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4503 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4504 ; GCN3-NEXT: s_cbranch_execnz .LBB95_1
4505 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4506 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4507 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4508 %gep = getelementptr i32, ptr %out, i32 4
4509 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
; Returning form: seq_cst unsigned `atomicrmw umax` directly on %ptr, with the
; atomic's result bound to %result. The checked sequence is the same on
; bonaire, tonga and gfx900: the loop copies the last observed value into v4,
; computes v_max_u32 with %in (v2) into v3, and retries flat_atomic_cmpswap
; until v3 == v4. After the loop the value is moved from v3 into v0.
4513 define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
4514 ; GCN1-LABEL: flat_atomic_umax_i32_ret:
4516 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4517 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4518 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4519 ; GCN1-NEXT: .LBB96_1: ; %atomicrmw.start
4520 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4521 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4522 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
4523 ; GCN1-NEXT: v_max_u32_e32 v3, v4, v2
4524 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4525 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4526 ; GCN1-NEXT: buffer_wbinvl1_vol
4527 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4528 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4529 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4530 ; GCN1-NEXT: s_cbranch_execnz .LBB96_1
4531 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4532 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4533 ; GCN1-NEXT: v_mov_b32_e32 v0, v3
4534 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4536 ; GCN2-LABEL: flat_atomic_umax_i32_ret:
4538 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4539 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
4540 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4541 ; GCN2-NEXT: .LBB96_1: ; %atomicrmw.start
4542 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4543 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4544 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
4545 ; GCN2-NEXT: v_max_u32_e32 v3, v4, v2
4546 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4547 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4548 ; GCN2-NEXT: buffer_wbinvl1_vol
4549 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4550 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4551 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4552 ; GCN2-NEXT: s_cbranch_execnz .LBB96_1
4553 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4554 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4555 ; GCN2-NEXT: v_mov_b32_e32 v0, v3
4556 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4558 ; GCN3-LABEL: flat_atomic_umax_i32_ret:
4560 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4561 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
4562 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4563 ; GCN3-NEXT: .LBB96_1: ; %atomicrmw.start
4564 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4565 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4566 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4567 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
4568 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
4569 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4570 ; GCN3-NEXT: buffer_wbinvl1_vol
4571 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4572 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4573 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4574 ; GCN3-NEXT: s_cbranch_execnz .LBB96_1
4575 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4576 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4577 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
4578 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4579 %result = atomicrmw umax ptr %ptr, i32 %in seq_cst
; Returning form with a constant displacement: seq_cst unsigned
; `atomicrmw umax` at %out + 16 bytes (i32 element 4).
; All three runs expect the flat_load_dword / v_max_u32 / flat_atomic_cmpswap
; retry loop. bonaire and tonga compute the address into v[3:4] with a 64-bit
; add of 16 and leave the cmpswap result in v0. gfx900 keeps the pointer in
; v[0:1], uses offset:16 on the memory instructions, and copies the result
; from v3 into v0 after the loop.
4583 define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
4584 ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset:
4586 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4587 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
4588 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4589 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
4590 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
4591 ; GCN1-NEXT: .LBB97_1: ; %atomicrmw.start
4592 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4593 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4594 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
4595 ; GCN1-NEXT: v_max_u32_e32 v0, v1, v2
4596 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4597 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4598 ; GCN1-NEXT: buffer_wbinvl1_vol
4599 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4600 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4601 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
4602 ; GCN1-NEXT: s_cbranch_execnz .LBB97_1
4603 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4604 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4605 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4607 ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset:
4609 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4610 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
4611 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
4612 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
4613 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
4614 ; GCN2-NEXT: .LBB97_1: ; %atomicrmw.start
4615 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4616 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4617 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
4618 ; GCN2-NEXT: v_max_u32_e32 v0, v1, v2
4619 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
4620 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4621 ; GCN2-NEXT: buffer_wbinvl1_vol
4622 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4623 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4624 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
4625 ; GCN2-NEXT: s_cbranch_execnz .LBB97_1
4626 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4627 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4628 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4630 ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset:
4632 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4633 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
4634 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
4635 ; GCN3-NEXT: .LBB97_1: ; %atomicrmw.start
4636 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4637 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4638 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
4639 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
4640 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
4641 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4642 ; GCN3-NEXT: buffer_wbinvl1_vol
4643 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
4644 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4645 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
4646 ; GCN3-NEXT: s_cbranch_execnz .LBB97_1
4647 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4648 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4649 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
4650 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4651 %gep = getelementptr i32, ptr %out, i32 4
4652 %result = atomicrmw umax ptr %gep, i32 %in seq_cst
; amdgpu_gfx variant with `inreg` arguments: seq_cst unsigned `atomicrmw umax`
; directly on %ptr, result unused. In the checked output the pointer arrives
; in s[4:5] and is copied to v[0:1], and %in is used from s6 as the scalar
; operand of v_max_u32. The sequence is the same on bonaire, tonga and gfx900:
; flat_load_dword, then the flat_atomic_cmpswap retry loop, with s[34:35]
; holding the loop's accumulated exit mask.
4656 define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
4657 ; GCN1-LABEL: flat_atomic_umax_i32_noret_scalar:
4659 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4660 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4661 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4662 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4663 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4664 ; GCN1-NEXT: .LBB98_1: ; %atomicrmw.start
4665 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4666 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4667 ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
4668 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4669 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4670 ; GCN1-NEXT: buffer_wbinvl1_vol
4671 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4672 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4673 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
4674 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4675 ; GCN1-NEXT: s_cbranch_execnz .LBB98_1
4676 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4677 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4678 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4680 ; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar:
4682 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4683 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4684 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4685 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
4686 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4687 ; GCN2-NEXT: .LBB98_1: ; %atomicrmw.start
4688 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4689 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4690 ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
4691 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4692 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4693 ; GCN2-NEXT: buffer_wbinvl1_vol
4694 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4695 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4696 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
4697 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4698 ; GCN2-NEXT: s_cbranch_execnz .LBB98_1
4699 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4700 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4701 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4703 ; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar:
4705 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4706 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4707 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4708 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
4709 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4710 ; GCN3-NEXT: .LBB98_1: ; %atomicrmw.start
4711 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4712 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4713 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
4714 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4715 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4716 ; GCN3-NEXT: buffer_wbinvl1_vol
4717 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4718 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4719 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
4720 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4721 ; GCN3-NEXT: s_cbranch_execnz .LBB98_1
4722 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4723 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4724 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4725 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
; amdgpu_gfx variant with `inreg` arguments and a constant displacement:
; seq_cst unsigned `atomicrmw umax` at %out + 16 bytes (i32 element 4), result
; unused. bonaire and tonga add 16 to the pointer with s_add_u32/s_addc_u32
; into s[34:35] before copying it to v[0:1]; s[34:35] is then reused as the
; loop's exit mask. gfx900 copies s[4:5] unchanged and uses offset:16 on the
; load and the cmpswap. The retry loop itself (v_max_u32 with s6, then
; flat_atomic_cmpswap) has the same shape on all three runs.
4729 define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
4730 ; GCN1-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4732 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4733 ; GCN1-NEXT: s_add_u32 s34, s4, 16
4734 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4735 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
4736 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
4737 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4738 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4739 ; GCN1-NEXT: .LBB99_1: ; %atomicrmw.start
4740 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4741 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4742 ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
4743 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4744 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4745 ; GCN1-NEXT: buffer_wbinvl1_vol
4746 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4747 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4748 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
4749 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4750 ; GCN1-NEXT: s_cbranch_execnz .LBB99_1
4751 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4752 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4753 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4755 ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4757 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4758 ; GCN2-NEXT: s_add_u32 s34, s4, 16
4759 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4760 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
4761 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
4762 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
4763 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4764 ; GCN2-NEXT: .LBB99_1: ; %atomicrmw.start
4765 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4766 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4767 ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
4768 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4769 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4770 ; GCN2-NEXT: buffer_wbinvl1_vol
4771 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4772 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4773 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
4774 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4775 ; GCN2-NEXT: s_cbranch_execnz .LBB99_1
4776 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4777 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4778 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4780 ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
4782 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4783 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4784 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4785 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
4786 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4787 ; GCN3-NEXT: .LBB99_1: ; %atomicrmw.start
4788 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4789 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4790 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
4791 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4792 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4793 ; GCN3-NEXT: buffer_wbinvl1_vol
4794 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4795 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4796 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
4797 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4798 ; GCN3-NEXT: s_cbranch_execnz .LBB99_1
4799 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4800 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4801 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4802 %gep = getelementptr i32, ptr %out, i32 4
4803 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
; amdgpu_gfx returning variant with `inreg` arguments: seq_cst unsigned
; `atomicrmw umax` directly on %ptr, with the atomic's result bound to
; %result. The checked sequence is the same on bonaire, tonga and gfx900: the
; pointer in s[4:5] is copied to v[0:1] for the initial flat_load_dword into
; v0, then copied again into v[1:2] for use inside the loop. Each iteration
; saves the last observed value in v4, computes v_max_u32 with s6 into v3, and
; retries flat_atomic_cmpswap until v0 == v4. The cmpswap writes v0, so no
; extra move is checked after the loop.
4807 define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
4808 ; GCN1-LABEL: flat_atomic_umax_i32_ret_scalar:
4810 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4811 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4812 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4813 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
4814 ; GCN1-NEXT: v_mov_b32_e32 v1, s4
4815 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4816 ; GCN1-NEXT: v_mov_b32_e32 v2, s5
4817 ; GCN1-NEXT: .LBB100_1: ; %atomicrmw.start
4818 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4819 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4820 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
4821 ; GCN1-NEXT: v_max_u32_e32 v3, s6, v4
4822 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4823 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4824 ; GCN1-NEXT: buffer_wbinvl1_vol
4825 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4826 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4827 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4828 ; GCN1-NEXT: s_cbranch_execnz .LBB100_1
4829 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4830 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4831 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4833 ; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar:
4835 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4836 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4837 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4838 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
4839 ; GCN2-NEXT: v_mov_b32_e32 v1, s4
4840 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4841 ; GCN2-NEXT: v_mov_b32_e32 v2, s5
4842 ; GCN2-NEXT: .LBB100_1: ; %atomicrmw.start
4843 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4844 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4845 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
4846 ; GCN2-NEXT: v_max_u32_e32 v3, s6, v4
4847 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4848 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4849 ; GCN2-NEXT: buffer_wbinvl1_vol
4850 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4851 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4852 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4853 ; GCN2-NEXT: s_cbranch_execnz .LBB100_1
4854 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4855 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4856 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4858 ; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar:
4860 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4861 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4862 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4863 ; GCN3-NEXT: flat_load_dword v0, v[0:1]
4864 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
4865 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4866 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
4867 ; GCN3-NEXT: .LBB100_1: ; %atomicrmw.start
4868 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4869 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4870 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
4871 ; GCN3-NEXT: v_max_u32_e32 v3, s6, v4
4872 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4873 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4874 ; GCN3-NEXT: buffer_wbinvl1_vol
4875 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4876 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4877 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4878 ; GCN3-NEXT: s_cbranch_execnz .LBB100_1
4879 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4880 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4881 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4882 %result = atomicrmw umax ptr %ptr, i32 %in seq_cst
; Test: same as the plain `inreg` umax test, but the atomicrmw address is the
; incoming pointer advanced by four i32 elements (16 bytes). The expectations
; differ by target in how that offset is applied: for GCN1 and GCN2 the
; address is formed up front with s_add_u32/s_addc_u32 (+16) and the memory
; instructions carry no offset, while for GCN3 the unmodified pointer is used
; and both flat_load_dword and flat_atomic_cmpswap carry `offset:16`.
; The retry loop itself (v_max_u32, cmpswap with glc, compare, exec masking)
; is the same on all three targets.
4886 define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
4887 ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4889 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4890 ; GCN1-NEXT: s_add_u32 s34, s4, 16
4891 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4892 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
4893 ; GCN1-NEXT: v_mov_b32_e32 v2, s35
4894 ; GCN1-NEXT: flat_load_dword v0, v[1:2]
4895 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
4896 ; GCN1-NEXT: .LBB101_1: ; %atomicrmw.start
4897 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4898 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4899 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
4900 ; GCN1-NEXT: v_max_u32_e32 v3, s6, v4
4901 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4902 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4903 ; GCN1-NEXT: buffer_wbinvl1_vol
4904 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4905 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4906 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
4907 ; GCN1-NEXT: s_cbranch_execnz .LBB101_1
4908 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4909 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
4910 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4912 ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4914 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4915 ; GCN2-NEXT: s_add_u32 s34, s4, 16
4916 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4917 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
4918 ; GCN2-NEXT: v_mov_b32_e32 v2, s35
4919 ; GCN2-NEXT: flat_load_dword v0, v[1:2]
4920 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
4921 ; GCN2-NEXT: .LBB101_1: ; %atomicrmw.start
4922 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
4923 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4924 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
4925 ; GCN2-NEXT: v_max_u32_e32 v3, s6, v4
4926 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
4927 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4928 ; GCN2-NEXT: buffer_wbinvl1_vol
4929 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4930 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4931 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
4932 ; GCN2-NEXT: s_cbranch_execnz .LBB101_1
4933 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
4934 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
4935 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4937 ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
4939 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4940 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4941 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4942 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
4943 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
4944 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
4945 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
4946 ; GCN3-NEXT: .LBB101_1: ; %atomicrmw.start
4947 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
4948 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4949 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
4950 ; GCN3-NEXT: v_max_u32_e32 v3, s6, v4
4951 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
4952 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4953 ; GCN3-NEXT: buffer_wbinvl1_vol
4954 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
4955 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
4956 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
4957 ; GCN3-NEXT: s_cbranch_execnz .LBB101_1
4958 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
4959 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
4960 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep is %out plus 4 i32 elements, i.e. the 16-byte offset seen in the checks.
4961 %gep = getelementptr i32, ptr %out, i32 4
4962 %result = atomicrmw umax ptr %gep, i32 %in seq_cst
; Test: amdgpu_kernel variant of the seq_cst i32 umax whose address is
; %out + %index i32 elements + 4 more i32 elements; the atomicrmw result
; (%tmp0) has no visible use. In the expected code the kernel arguments are
; fetched with s_load_dwordx4 (at 0x9 for GCN1, 0x24 for GCN2 and GCN3), the
; index is sign-extended (s_ashr_i32 by 31) and scaled by 4 (s_lshl_b64 by 2)
; before being added to the base. The constant 16-byte part is added with
; scalar adds for GCN1 and GCN2 and appears as `offset:16` on the load and the
; cmpswap for GCN3. The body is a v_max_u32 + flat_atomic_cmpswap retry loop
; that ends in s_endpgm.
4966 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
4967 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
4968 ; GCN1: ; %bb.0: ; %entry
4969 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
4970 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4971 ; GCN1-NEXT: s_ashr_i32 s5, s3, 31
4972 ; GCN1-NEXT: s_mov_b32 s4, s3
4973 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4974 ; GCN1-NEXT: s_add_u32 s0, s0, s4
4975 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
4976 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4977 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4978 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4979 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4980 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
4981 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
4982 ; GCN1-NEXT: .LBB102_1: ; %atomicrmw.start
4983 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
4984 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4985 ; GCN1-NEXT: v_max_u32_e32 v2, s2, v3
4986 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4987 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4988 ; GCN1-NEXT: buffer_wbinvl1_vol
4989 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
4990 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4991 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
4992 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
4993 ; GCN1-NEXT: s_cbranch_execnz .LBB102_1
4994 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
4995 ; GCN1-NEXT: s_endpgm
4997 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
4998 ; GCN2: ; %bb.0: ; %entry
4999 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5000 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5001 ; GCN2-NEXT: s_ashr_i32 s5, s3, 31
5002 ; GCN2-NEXT: s_mov_b32 s4, s3
5003 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5004 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5005 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5006 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5007 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5008 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5009 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5010 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
5011 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
5012 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
5013 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5014 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5015 ; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
5016 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5017 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5018 ; GCN2-NEXT: buffer_wbinvl1_vol
5019 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5020 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5021 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
5022 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
5023 ; GCN2-NEXT: s_cbranch_execnz .LBB102_1
5024 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5025 ; GCN2-NEXT: s_endpgm
5027 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
5028 ; GCN3: ; %bb.0: ; %entry
5029 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5030 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5031 ; GCN3-NEXT: s_ashr_i32 s5, s3, 31
5032 ; GCN3-NEXT: s_mov_b32 s4, s3
5033 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5034 ; GCN3-NEXT: s_add_u32 s0, s0, s4
5035 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
5036 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5037 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5038 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
5039 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
5040 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start
5041 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5042 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5043 ; GCN3-NEXT: v_max_u32_e32 v2, s2, v3
5044 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5045 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5046 ; GCN3-NEXT: buffer_wbinvl1_vol
5047 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5048 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5049 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
5050 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
5051 ; GCN3-NEXT: s_cbranch_execnz .LBB102_1
5052 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5053 ; GCN3-NEXT: s_endpgm
; Variable index first, then the constant 4-element (16-byte) step.
5055 %ptr = getelementptr i32, ptr %out, i32 %index
5056 %gep = getelementptr i32, ptr %ptr, i32 4
5057 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
; Test: amdgpu_kernel seq_cst i32 umax at %out + %index elements + 4 elements,
; with the atomicrmw result stored to %out2. The expected code loads the
; kernel arguments with two scalar loads, builds the address from the
; sign-extended index scaled by 4, and runs a v_max_u32 +
; flat_atomic_cmpswap retry loop. After the loop exec is restored and the
; last cmpswap result (v2) is written through s[2:3] with flat_store_dword.
; The constant 16 bytes are added with scalar adds for GCN1 and GCN2 and
; appear as `offset:16` on the load and cmpswap for GCN3.
5061 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
5062 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
5063 ; GCN1: ; %bb.0: ; %entry
5064 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
5065 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5066 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5067 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
5068 ; GCN1-NEXT: s_mov_b32 s4, s7
5069 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5070 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5071 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5072 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5073 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5074 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5075 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5076 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
5077 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
5078 ; GCN1-NEXT: .LBB103_1: ; %atomicrmw.start
5079 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5080 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5081 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
5082 ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
5083 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5084 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5085 ; GCN1-NEXT: buffer_wbinvl1_vol
5086 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5087 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5088 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
5089 ; GCN1-NEXT: s_cbranch_execnz .LBB103_1
5090 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5091 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
5092 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5093 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5094 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5095 ; GCN1-NEXT: s_endpgm
5097 ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
5098 ; GCN2: ; %bb.0: ; %entry
5099 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5100 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5101 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5102 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
5103 ; GCN2-NEXT: s_mov_b32 s4, s7
5104 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5105 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5106 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5107 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5108 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5109 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5110 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5111 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
5112 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
5113 ; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
5114 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5115 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5116 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
5117 ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
5118 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5119 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5120 ; GCN2-NEXT: buffer_wbinvl1_vol
5121 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5122 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5123 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
5124 ; GCN2-NEXT: s_cbranch_execnz .LBB103_1
5125 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5126 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
5127 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5128 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5129 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5130 ; GCN2-NEXT: s_endpgm
5132 ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
5133 ; GCN3: ; %bb.0: ; %entry
5134 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5135 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5136 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5137 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
5138 ; GCN3-NEXT: s_mov_b32 s4, s7
5139 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5140 ; GCN3-NEXT: s_add_u32 s0, s0, s4
5141 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
5142 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5143 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5144 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
5145 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
5146 ; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
5147 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5148 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5149 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
5150 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
5151 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5152 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5153 ; GCN3-NEXT: buffer_wbinvl1_vol
5154 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5155 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5156 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
5157 ; GCN3-NEXT: s_cbranch_execnz .LBB103_1
5158 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5159 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
5160 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5161 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5162 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5163 ; GCN3-NEXT: s_endpgm
; Variable index, then the constant 4-element (16-byte) step; the value the
; atomicrmw yields is stored to %out2.
5165 %ptr = getelementptr i32, ptr %out, i32 %index
5166 %gep = getelementptr i32, ptr %ptr, i32 4
5167 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
5168 store i32 %tmp0, ptr %out2
; Test: amdgpu_kernel seq_cst i32 umax at %out + %index elements (no constant
; offset), with the atomicrmw result stored to %out2. Apart from the
; kernel-argument load offsets (0xd/0x9 for GCN1, 0x34/0x24 for GCN2 and
; GCN3) the expected code is the same on all three targets: sign-extend and
; scale the index, add it to the base, load the current value, retry
; v_max_u32 + flat_atomic_cmpswap until the compare succeeds, then restore
; exec and store the last cmpswap result (v2) through s[2:3].
5172 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
5173 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
5174 ; GCN1: ; %bb.0: ; %entry
5175 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
5176 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
5177 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5178 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
5179 ; GCN1-NEXT: s_mov_b32 s4, s7
5180 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5181 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5182 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5183 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5184 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5185 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
5186 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
5187 ; GCN1-NEXT: .LBB104_1: ; %atomicrmw.start
5188 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5189 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5190 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
5191 ; GCN1-NEXT: v_max_u32_e32 v2, s6, v3
5192 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5193 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5194 ; GCN1-NEXT: buffer_wbinvl1_vol
5195 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5196 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5197 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
5198 ; GCN1-NEXT: s_cbranch_execnz .LBB104_1
5199 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5200 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
5201 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5202 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5203 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5204 ; GCN1-NEXT: s_endpgm
5206 ; GCN2-LABEL: atomic_umax_i32_ret_addr64:
5207 ; GCN2: ; %bb.0: ; %entry
5208 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5209 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5210 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5211 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
5212 ; GCN2-NEXT: s_mov_b32 s4, s7
5213 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5214 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5215 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5216 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5217 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5218 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
5219 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
5220 ; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
5221 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5222 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5223 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
5224 ; GCN2-NEXT: v_max_u32_e32 v2, s6, v3
5225 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5226 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5227 ; GCN2-NEXT: buffer_wbinvl1_vol
5228 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5229 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5230 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
5231 ; GCN2-NEXT: s_cbranch_execnz .LBB104_1
5232 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5233 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
5234 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5235 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5236 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5237 ; GCN2-NEXT: s_endpgm
5239 ; GCN3-LABEL: atomic_umax_i32_ret_addr64:
5240 ; GCN3: ; %bb.0: ; %entry
5241 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
5242 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
5243 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5244 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
5245 ; GCN3-NEXT: s_mov_b32 s4, s7
5246 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5247 ; GCN3-NEXT: s_add_u32 s0, s0, s4
5248 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
5249 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5250 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5251 ; GCN3-NEXT: flat_load_dword v2, v[0:1]
5252 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
5253 ; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
5254 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5255 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5256 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
5257 ; GCN3-NEXT: v_max_u32_e32 v2, s6, v3
5258 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5259 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5260 ; GCN3-NEXT: buffer_wbinvl1_vol
5261 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5262 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5263 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
5264 ; GCN3-NEXT: s_cbranch_execnz .LBB104_1
5265 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5266 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
5267 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5268 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5269 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5270 ; GCN3-NEXT: s_endpgm
; Only the variable index is applied here; the value the atomicrmw yields is
; stored to %out2.
5272 %ptr = getelementptr i32, ptr %out, i32 %index
5273 %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
5274 store i32 %tmp0, ptr %out2
; Test: seq_cst i32 umax at %out + 4 elements (16 bytes) where the atomicrmw
; carries `!amdgpu.no.remote.memory !0` and its result (%tmp0) has no visible
; use. Metadata node !0 is defined outside this section of the file.
; The expected code with the annotation is still a flat_load_dword followed by
; a v_max_u32 + flat_atomic_cmpswap retry loop. For GCN1 and GCN2 the 16-byte
; offset is added to v[0:1] with vector adds; for GCN3 it appears as
; `offset:16` on the load and the cmpswap.
5278 define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
5279 ; GCN1-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5281 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5282 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
5283 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5284 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
5285 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5286 ; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
5287 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5288 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5289 ; GCN1-NEXT: v_max_u32_e32 v3, v4, v2
5290 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5291 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5292 ; GCN1-NEXT: buffer_wbinvl1_vol
5293 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5294 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5295 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
5296 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5297 ; GCN1-NEXT: s_cbranch_execnz .LBB105_1
5298 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5299 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5300 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5302 ; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5304 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5305 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
5306 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5307 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
5308 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5309 ; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start
5310 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5312 ; GCN2-NEXT: v_max_u32_e32 v3, v4, v2
5313 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5314 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5315 ; GCN2-NEXT: buffer_wbinvl1_vol
5316 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5317 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5318 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
5319 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5320 ; GCN2-NEXT: s_cbranch_execnz .LBB105_1
5321 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5322 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5323 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5325 ; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
5327 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5328 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
5329 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5330 ; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start
5331 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5332 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5333 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
5334 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5335 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5336 ; GCN3-NEXT: buffer_wbinvl1_vol
5337 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5338 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5339 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5340 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5341 ; GCN3-NEXT: s_cbranch_execnz .LBB105_1
5342 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5343 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5344 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Note the i64 index type on this gep (the un-annotated offset tests in this
; section use i32); the step is still 4 elements = 16 bytes.
5345 %gep = getelementptr i32, ptr %out, i64 4
5346 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: i32-returning counterpart of the annotated umax offset test: seq_cst
; i32 umax at %out + 4 elements (16 bytes) with
; `!amdgpu.no.remote.memory !0` (node !0 is defined outside this section).
; The expected code is again a flat_load_dword plus a v_max_u32 +
; flat_atomic_cmpswap retry loop. For GCN1 and GCN2 the offset address is
; built in v[3:4] and the cmpswap result lands directly in v0; for GCN3 the
; memory instructions use `offset:16`, the loop works in v3, and v3 is copied
; to v0 after the loop.
5350 define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
5351 ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5353 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5354 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
5355 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5356 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
5357 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5358 ; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start
5359 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5360 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5361 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
5362 ; GCN1-NEXT: v_max_u32_e32 v0, v1, v2
5363 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5364 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5365 ; GCN1-NEXT: buffer_wbinvl1_vol
5366 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5367 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5368 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5369 ; GCN1-NEXT: s_cbranch_execnz .LBB106_1
5370 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5371 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5372 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5374 ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5376 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5377 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
5378 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5379 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
5380 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5381 ; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start
5382 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5383 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5384 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
5385 ; GCN2-NEXT: v_max_u32_e32 v0, v1, v2
5386 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5387 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5388 ; GCN2-NEXT: buffer_wbinvl1_vol
5389 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5390 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5391 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5392 ; GCN2-NEXT: s_cbranch_execnz .LBB106_1
5393 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5394 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5395 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5397 ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
5399 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5400 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
5401 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5402 ; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start
5403 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5404 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5405 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5406 ; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
5407 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5408 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5409 ; GCN3-NEXT: buffer_wbinvl1_vol
5410 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5411 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5412 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5413 ; GCN3-NEXT: s_cbranch_execnz .LBB106_1
5414 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5415 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5416 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
5417 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; 4 i32 elements = the 16-byte offset seen in the checks; %result is the value
; the annotated atomicrmw yields.
5418 %gep = getelementptr i32, ptr %out, i64 4
5419 %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
5423 ; ---------------------------------------------------------------------
5425 ; ---------------------------------------------------------------------
; Test: seq_cst `atomicrmw umin` on an i32 through a flat pointer passed in
; VGPRs (v[0:1] address, v2 operand); the result (%tmp0) has no visible use.
; The expected code is the same on all three targets: flat_load_dword reads
; the current value into v4, then a loop computes v_min_u32 into v3 and
; issues flat_atomic_cmpswap (glc) with v[3:4] as the new/compare pair. The
; loop repeats, feeding the cmpswap result back in as the next compare value,
; until the result equals the value compared against.
5427 define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
5428 ; GCN1-LABEL: flat_atomic_umin_i32_noret:
5430 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5431 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
5432 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5433 ; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start
5434 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5435 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5436 ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
5437 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5438 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5439 ; GCN1-NEXT: buffer_wbinvl1_vol
5440 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5441 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5442 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
5443 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5444 ; GCN1-NEXT: s_cbranch_execnz .LBB107_1
5445 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5446 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5447 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5449 ; GCN2-LABEL: flat_atomic_umin_i32_noret:
5451 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5452 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
5453 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5454 ; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start
5455 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5456 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5457 ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
5458 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5459 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5460 ; GCN2-NEXT: buffer_wbinvl1_vol
5461 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5462 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5463 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
5464 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5465 ; GCN2-NEXT: s_cbranch_execnz .LBB107_1
5466 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5467 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5468 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5470 ; GCN3-LABEL: flat_atomic_umin_i32_noret:
5472 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5473 ; GCN3-NEXT: flat_load_dword v4, v[0:1]
5474 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5475 ; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start
5476 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5477 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5478 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
5479 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5480 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5481 ; GCN3-NEXT: buffer_wbinvl1_vol
5482 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5483 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5484 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5485 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5486 ; GCN3-NEXT: s_cbranch_execnz .LBB107_1
5487 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5488 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5489 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operation under test, applied directly to the incoming pointer.
5490 %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst
; Test: seq_cst i32 umin at %out + 4 elements (16 bytes); the result (%tmp0)
; has no visible use. The expected retry loop (v_min_u32 +
; flat_atomic_cmpswap with glc, compare, exec masking) is the same on all
; three targets. Only the offset handling differs: for GCN1 and GCN2 16 is
; added to v[0:1] with a vector add/add-with-carry pair, while for GCN3 the
; load and the cmpswap carry `offset:16`.
5494 define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
5495 ; GCN1-LABEL: flat_atomic_umin_i32_noret_offset:
5497 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5498 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
5499 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5500 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
5501 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5502 ; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start
5503 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5504 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5505 ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
5506 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5507 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5508 ; GCN1-NEXT: buffer_wbinvl1_vol
5509 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5510 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5511 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
5512 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5513 ; GCN1-NEXT: s_cbranch_execnz .LBB108_1
5514 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5515 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5516 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5518 ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset:
5520 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5521 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
5522 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5523 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
5524 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5525 ; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start
5526 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5527 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5528 ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
5529 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5530 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5531 ; GCN2-NEXT: buffer_wbinvl1_vol
5532 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5533 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5534 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
5535 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5536 ; GCN2-NEXT: s_cbranch_execnz .LBB108_1
5537 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5538 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5539 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5541 ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset:
5543 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5544 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
5545 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5546 ; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start
5547 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5548 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5549 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
5550 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5551 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5552 ; GCN3-NEXT: buffer_wbinvl1_vol
5553 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5554 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5555 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5556 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5557 ; GCN3-NEXT: s_cbranch_execnz .LBB108_1
5558 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5559 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5560 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; 4 i32 elements = the 16-byte offset seen in the checks.
5561 %gep = getelementptr i32, ptr %out, i32 4
5562 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
; Test: i32-returning seq_cst `atomicrmw umin` applied directly to the
; incoming flat pointer. The expected code is the same on all three targets:
; flat_load_dword reads the current value into v3; each loop iteration copies
; it to v4 as the compare value, computes v_min_u32 into v3 and issues
; flat_atomic_cmpswap (glc), repeating until the returned value equals v4.
; After the loop exec is restored and v3 is copied to v0.
5566 define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
5567 ; GCN1-LABEL: flat_atomic_umin_i32_ret:
5569 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5570 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
5571 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5572 ; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start
5573 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5574 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5575 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
5576 ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
5577 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5578 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5579 ; GCN1-NEXT: buffer_wbinvl1_vol
5580 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5581 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5582 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5583 ; GCN1-NEXT: s_cbranch_execnz .LBB109_1
5584 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5585 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5586 ; GCN1-NEXT: v_mov_b32_e32 v0, v3
5587 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5589 ; GCN2-LABEL: flat_atomic_umin_i32_ret:
5591 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5592 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
5593 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5594 ; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start
5595 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5596 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5597 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
5598 ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
5599 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5600 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5601 ; GCN2-NEXT: buffer_wbinvl1_vol
5602 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5603 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5604 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5605 ; GCN2-NEXT: s_cbranch_execnz .LBB109_1
5606 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5607 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5608 ; GCN2-NEXT: v_mov_b32_e32 v0, v3
5609 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5611 ; GCN3-LABEL: flat_atomic_umin_i32_ret:
5613 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5614 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
5615 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5616 ; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start
5617 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5618 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5619 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5620 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
5621 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
5622 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5623 ; GCN3-NEXT: buffer_wbinvl1_vol
5624 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5625 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5626 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5627 ; GCN3-NEXT: s_cbranch_execnz .LBB109_1
5628 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5629 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5630 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
5631 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operation under test; %result is the value the atomicrmw yields.
5632 %result = atomicrmw umin ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` with the old value captured in %result, on a
; flat VGPR pointer advanced by a constant GEP.
; All targets expect the load + v_min_u32 + flat_atomic_cmpswap retry loop.
; They differ only in how the constant 16 is applied: bonaire and tonga form
; the address with a 64-bit add (v_add/v_addc) into v[3:4], whereas gfx900
; keeps the base in v[0:1] and uses `offset:16` on the flat instructions.
; On bonaire/tonga the loop result is already in v0, so no trailing copy is
; expected; gfx900 produces it in v3 and copies it to v0.
5636 define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
5637 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset:
5639 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5640 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
5641 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5642 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
5643 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
5644 ; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start
5645 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5646 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5647 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
5648 ; GCN1-NEXT: v_min_u32_e32 v0, v1, v2
5649 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5650 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5651 ; GCN1-NEXT: buffer_wbinvl1_vol
5652 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5653 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5654 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
5655 ; GCN1-NEXT: s_cbranch_execnz .LBB110_1
5656 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5657 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5658 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5660 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset:
5662 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5663 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
5664 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
5665 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
5666 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
5667 ; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start
5668 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5669 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5670 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
5671 ; GCN2-NEXT: v_min_u32_e32 v0, v1, v2
5672 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
5673 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5674 ; GCN2-NEXT: buffer_wbinvl1_vol
5675 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5676 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5677 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
5678 ; GCN2-NEXT: s_cbranch_execnz .LBB110_1
5679 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5680 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5681 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5683 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset:
5685 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5686 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
5687 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
5688 ; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start
5689 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5690 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5691 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
5692 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
5693 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
5694 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5695 ; GCN3-NEXT: buffer_wbinvl1_vol
5696 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
5697 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5698 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
5699 ; GCN3-NEXT: s_cbranch_execnz .LBB110_1
5700 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5701 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5702 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
5703 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 is a 16-byte displacement, i.e. the constant 16 that
; appears in the expected address computation above.
5704 %gep = getelementptr i32, ptr %out, i32 4
5705 %result = atomicrmw umin ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` whose result (%tmp0) is not used further in
; the visible body, under the amdgpu_gfx calling convention with both
; arguments marked `inreg`.
; In the expected code the pointer arrives in s[4:5] and %in in s6. The
; pointer is copied to v[0:1] for the flat accesses, while s6 is used directly
; as a v_min_u32 source. The operation is a load + v_min_u32 +
; flat_atomic_cmpswap retry loop; s[34:35] accumulates finished lanes and the
; value returned by the cmpswap (v2) is moved to v3 as the expected value for
; the next iteration. The output is the same on bonaire, tonga and gfx900.
5709 define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
5710 ; GCN1-LABEL: flat_atomic_umin_i32_noret_scalar:
5712 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5713 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5714 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5715 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
5716 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
5717 ; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start
5718 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5719 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5720 ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3
5721 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5722 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5723 ; GCN1-NEXT: buffer_wbinvl1_vol
5724 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5725 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5726 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
5727 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
5728 ; GCN1-NEXT: s_cbranch_execnz .LBB111_1
5729 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5730 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
5731 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5733 ; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar:
5735 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5736 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5737 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5738 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
5739 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
5740 ; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start
5741 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5742 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5743 ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3
5744 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5745 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5746 ; GCN2-NEXT: buffer_wbinvl1_vol
5747 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5748 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5749 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
5750 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
5751 ; GCN2-NEXT: s_cbranch_execnz .LBB111_1
5752 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5753 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
5754 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5756 ; GCN3-LABEL: flat_atomic_umin_i32_noret_scalar:
5758 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5759 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5760 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5761 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
5762 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5763 ; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start
5764 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5765 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5766 ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3
5767 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5768 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5769 ; GCN3-NEXT: buffer_wbinvl1_vol
5770 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5771 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5772 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
5773 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5774 ; GCN3-NEXT: s_cbranch_execnz .LBB111_1
5775 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5776 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
5777 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5778 %tmp0 = atomicrmw umin ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` (result %tmp0 not used further in the visible
; body) under amdgpu_gfx with `inreg` arguments, on a pointer advanced by a
; constant GEP.
; The expected code is the load + v_min_u32 + flat_atomic_cmpswap retry loop
; with %in read from s6. bonaire and tonga add 16 to the pointer with scalar
; s_add_u32/s_addc_u32 into s[34:35] before copying it to v[0:1]; the same
; s[34:35] pair is then re-initialised to 0 and reused as the finished-lanes
; mask. gfx900 copies s[4:5] unchanged and uses `offset:16` on the flat
; instructions instead.
5782 define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
5783 ; GCN1-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5785 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5786 ; GCN1-NEXT: s_add_u32 s34, s4, 16
5787 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5788 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
5789 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
5790 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
5791 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
5792 ; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start
5793 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5794 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5795 ; GCN1-NEXT: v_min_u32_e32 v2, s6, v3
5796 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5797 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5798 ; GCN1-NEXT: buffer_wbinvl1_vol
5799 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5800 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5801 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
5802 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
5803 ; GCN1-NEXT: s_cbranch_execnz .LBB112_1
5804 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5805 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
5806 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5808 ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5810 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5811 ; GCN2-NEXT: s_add_u32 s34, s4, 16
5812 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5813 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
5814 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
5815 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
5816 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
5817 ; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start
5818 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5819 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5820 ; GCN2-NEXT: v_min_u32_e32 v2, s6, v3
5821 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5822 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5823 ; GCN2-NEXT: buffer_wbinvl1_vol
5824 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5825 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5826 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
5827 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
5828 ; GCN2-NEXT: s_cbranch_execnz .LBB112_1
5829 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5830 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
5831 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5833 ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
5835 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5836 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5837 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5838 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
5839 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5840 ; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start
5841 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5842 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5843 ; GCN3-NEXT: v_min_u32_e32 v2, s6, v3
5844 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5845 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5846 ; GCN3-NEXT: buffer_wbinvl1_vol
5847 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
5848 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5849 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
5850 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5851 ; GCN3-NEXT: s_cbranch_execnz .LBB112_1
5852 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5853 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
5854 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 is a 16-byte displacement, i.e. the constant 16 that
; appears in the expected address computation above.
5855 %gep = getelementptr i32, ptr %out, i32 4
5856 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` with the old value captured in %result, under
; amdgpu_gfx with `inreg` arguments (pointer in s[4:5], %in in s6 in the
; expected code).
; The expected output is identical on bonaire, tonga and gfx900: the pointer
; is first copied to v[0:1] for the initial load, whose result overwrites v0,
; and then copied again to v[1:2] for use inside the loop. The loop is
; v_min_u32 + flat_atomic_cmpswap with the cmpswap writing straight into v0,
; so no copy is expected after the loop. s[34:35] is the finished-lanes mask.
5860 define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
5861 ; GCN1-LABEL: flat_atomic_umin_i32_ret_scalar:
5863 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5864 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5865 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5866 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
5867 ; GCN1-NEXT: v_mov_b32_e32 v1, s4
5868 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
5869 ; GCN1-NEXT: v_mov_b32_e32 v2, s5
5870 ; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start
5871 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5872 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5873 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
5874 ; GCN1-NEXT: v_min_u32_e32 v3, s6, v4
5875 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5876 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5877 ; GCN1-NEXT: buffer_wbinvl1_vol
5878 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5879 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5880 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
5881 ; GCN1-NEXT: s_cbranch_execnz .LBB113_1
5882 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5883 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
5884 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5886 ; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar:
5888 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5889 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5890 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5891 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
5892 ; GCN2-NEXT: v_mov_b32_e32 v1, s4
5893 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
5894 ; GCN2-NEXT: v_mov_b32_e32 v2, s5
5895 ; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start
5896 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5897 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5898 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
5899 ; GCN2-NEXT: v_min_u32_e32 v3, s6, v4
5900 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5901 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5902 ; GCN2-NEXT: buffer_wbinvl1_vol
5903 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5904 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5905 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
5906 ; GCN2-NEXT: s_cbranch_execnz .LBB113_1
5907 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5908 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
5909 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5911 ; GCN3-LABEL: flat_atomic_umin_i32_ret_scalar:
5913 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5914 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5915 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5916 ; GCN3-NEXT: flat_load_dword v0, v[0:1]
5917 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
5918 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5919 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
5920 ; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start
5921 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
5922 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5923 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
5924 ; GCN3-NEXT: v_min_u32_e32 v3, s6, v4
5925 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5926 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5927 ; GCN3-NEXT: buffer_wbinvl1_vol
5928 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5929 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5930 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
5931 ; GCN3-NEXT: s_cbranch_execnz .LBB113_1
5932 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
5933 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
5934 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5935 %result = atomicrmw umin ptr %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` with the old value captured in %result, under
; amdgpu_gfx with `inreg` arguments, on a pointer advanced by a constant GEP.
; The expected code is the load + v_min_u32 + flat_atomic_cmpswap retry loop
; with the cmpswap result landing directly in v0. bonaire and tonga add 16 to
; the pointer with s_add_u32/s_addc_u32 and keep the sum in v[1:2]; gfx900
; copies s[4:5] unchanged (once to v[0:1] for the initial load, once to v[1:2]
; for the loop) and applies `offset:16` on the flat instructions.
5939 define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
5940 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5942 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5943 ; GCN1-NEXT: s_add_u32 s34, s4, 16
5944 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5945 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
5946 ; GCN1-NEXT: v_mov_b32_e32 v2, s35
5947 ; GCN1-NEXT: flat_load_dword v0, v[1:2]
5948 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
5949 ; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start
5950 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
5951 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5952 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
5953 ; GCN1-NEXT: v_min_u32_e32 v3, s6, v4
5954 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5955 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5956 ; GCN1-NEXT: buffer_wbinvl1_vol
5957 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5958 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5959 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
5960 ; GCN1-NEXT: s_cbranch_execnz .LBB114_1
5961 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
5962 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
5963 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5965 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5967 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5968 ; GCN2-NEXT: s_add_u32 s34, s4, 16
5969 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5970 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
5971 ; GCN2-NEXT: v_mov_b32_e32 v2, s35
5972 ; GCN2-NEXT: flat_load_dword v0, v[1:2]
5973 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
5974 ; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start
5975 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
5976 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5977 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
5978 ; GCN2-NEXT: v_min_u32_e32 v3, s6, v4
5979 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
5980 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5981 ; GCN2-NEXT: buffer_wbinvl1_vol
5982 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
5983 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
5984 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
5985 ; GCN2-NEXT: s_cbranch_execnz .LBB114_1
5986 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
5987 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
5988 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5990 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
5992 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5993 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5994 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5995 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
5996 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
5997 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
5998 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
5999 ; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start
6000 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6001 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6002 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
6003 ; GCN3-NEXT: v_min_u32_e32 v3, s6, v4
6004 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
6005 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6006 ; GCN3-NEXT: buffer_wbinvl1_vol
6007 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6008 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6009 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6010 ; GCN3-NEXT: s_cbranch_execnz .LBB114_1
6011 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6012 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6013 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 is a 16-byte displacement, i.e. the constant 16 that
; appears in the expected address computation above.
6014 %gep = getelementptr i32, ptr %out, i32 4
6015 %result = atomicrmw umin ptr %gep, i32 %in seq_cst
; Test: seq_cst `atomicrmw umin` (result %tmp0 not used further in the visible
; body) on a flat VGPR pointer at a constant 16-byte offset, with
; `!amdgpu.no.remote.memory !0` attached to the atomic.
; Even with that metadata the expected code on all three targets is still the
; load + v_min_u32 + flat_atomic_cmpswap retry loop. bonaire and tonga add 16
; to the pointer with v_add/v_addc; gfx900 uses `offset:16`.
; NOTE(review): this name lacks the `atomic_` infix that the neighbouring
; tests (e.g. flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory) use.
; Renaming would also change the -LABEL checks, so it is left as is here.
; NOTE(review): the definition of !0 is outside this excerpt.
6019 define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
6020 ; GCN1-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6022 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6023 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
6024 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6025 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
6026 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6027 ; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start
6028 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6029 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6030 ; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
6031 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6032 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6033 ; GCN1-NEXT: buffer_wbinvl1_vol
6034 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6035 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6036 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
6037 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6038 ; GCN1-NEXT: s_cbranch_execnz .LBB115_1
6039 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6040 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6041 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6043 ; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6045 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6046 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6047 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6048 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
6049 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6050 ; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start
6051 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6052 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6053 ; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
6054 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6055 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6056 ; GCN2-NEXT: buffer_wbinvl1_vol
6057 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6058 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6059 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
6060 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6061 ; GCN2-NEXT: s_cbranch_execnz .LBB115_1
6062 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6063 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6064 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6066 ; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
6068 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6069 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
6070 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6071 ; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start
6072 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6073 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6074 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
6075 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6076 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6077 ; GCN3-NEXT: buffer_wbinvl1_vol
6078 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6079 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6080 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6081 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6082 ; GCN3-NEXT: s_cbranch_execnz .LBB115_1
6083 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6084 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6085 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 (written here as an i64 index) is a 16-byte
; displacement, i.e. the constant 16 in the expected code above.
6086 %gep = getelementptr i32, ptr %out, i64 4
6087 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: seq_cst `atomicrmw umin` with the old value captured in %result, on a
; flat VGPR pointer at a constant 16-byte offset, with
; `!amdgpu.no.remote.memory !0` attached to the atomic.
; The expected instruction sequence matches the one checked for
; flat_atomic_umin_i32_ret_offset (which has no metadata) apart from the
; .LBB label numbers: a load + v_min_u32 + flat_atomic_cmpswap retry loop,
; with v_add/v_addc address arithmetic on bonaire/tonga and `offset:16` on
; gfx900.
; NOTE(review): the definition of !0 is outside this excerpt.
6091 define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
6092 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6094 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6095 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
6096 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6097 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
6098 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6099 ; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start
6100 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6101 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6102 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
6103 ; GCN1-NEXT: v_min_u32_e32 v0, v1, v2
6104 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6105 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6106 ; GCN1-NEXT: buffer_wbinvl1_vol
6107 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6108 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6109 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6110 ; GCN1-NEXT: s_cbranch_execnz .LBB116_1
6111 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6112 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6113 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6115 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6117 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6118 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
6119 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6120 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
6121 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6122 ; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start
6123 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6124 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6125 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
6126 ; GCN2-NEXT: v_min_u32_e32 v0, v1, v2
6127 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6128 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6129 ; GCN2-NEXT: buffer_wbinvl1_vol
6130 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6131 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6132 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6133 ; GCN2-NEXT: s_cbranch_execnz .LBB116_1
6134 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6135 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6136 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6138 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
6140 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6141 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
6142 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6143 ; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start
6144 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6145 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6146 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6147 ; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
6148 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6149 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6150 ; GCN3-NEXT: buffer_wbinvl1_vol
6151 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6152 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6153 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6154 ; GCN3-NEXT: s_cbranch_execnz .LBB116_1
6155 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6156 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6157 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
6158 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 (written here as an i64 index) is a 16-byte
; displacement, i.e. the constant 16 in the expected code above.
6159 %gep = getelementptr i32, ptr %out, i64 4
6160 %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
6164 ; ---------------------------------------------------------------------
6166 ; ---------------------------------------------------------------------
; Test: seq_cst signed `atomicrmw min` (result %tmp0 not used further in the
; visible body) on a flat pointer passed in VGPRs.
; The expected code on bonaire, tonga and gfx900 is identical: load the
; current value, then loop v_min_i32 (signed minimum) + flat_atomic_cmpswap
; until the returned value matches the expected one. The success test is a
; plain 32-bit equality compare (v_cmp_eq_u32). The value returned by the
; cmpswap is moved into v4 to serve as the expected value of the next
; iteration, and s[4:5] accumulates the lanes that have finished.
6168 define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
6169 ; GCN1-LABEL: flat_atomic_min_i32_noret:
6171 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6172 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
6173 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6174 ; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start
6175 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6176 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6177 ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
6178 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6179 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6180 ; GCN1-NEXT: buffer_wbinvl1_vol
6181 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6182 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6183 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
6184 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6185 ; GCN1-NEXT: s_cbranch_execnz .LBB117_1
6186 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6187 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6188 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6190 ; GCN2-LABEL: flat_atomic_min_i32_noret:
6192 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6193 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
6194 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6195 ; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start
6196 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6197 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6198 ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
6199 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6200 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6201 ; GCN2-NEXT: buffer_wbinvl1_vol
6202 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6203 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6204 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
6205 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6206 ; GCN2-NEXT: s_cbranch_execnz .LBB117_1
6207 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6208 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6209 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6211 ; GCN3-LABEL: flat_atomic_min_i32_noret:
6213 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6214 ; GCN3-NEXT: flat_load_dword v4, v[0:1]
6215 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6216 ; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start
6217 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6218 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6219 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
6220 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6221 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6222 ; GCN3-NEXT: buffer_wbinvl1_vol
6223 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6224 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6225 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6226 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6227 ; GCN3-NEXT: s_cbranch_execnz .LBB117_1
6228 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6229 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6230 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6231 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
; Test: seq_cst signed `atomicrmw min` (result %tmp0 not used further in the
; visible body) on a flat VGPR pointer advanced by a constant GEP.
; All targets expect the load + v_min_i32 + flat_atomic_cmpswap retry loop.
; bonaire and tonga add 16 to the pointer in place with v_add/v_addc before
; the first load; gfx900 leaves v[0:1] untouched and applies `offset:16` on
; both the load and the cmpswap.
6235 define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
6236 ; GCN1-LABEL: flat_atomic_min_i32_noret_offset:
6238 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6239 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
6240 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6241 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
6242 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6243 ; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start
6244 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6245 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6246 ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
6247 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6248 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6249 ; GCN1-NEXT: buffer_wbinvl1_vol
6250 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6251 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6252 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
6253 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6254 ; GCN1-NEXT: s_cbranch_execnz .LBB118_1
6255 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6256 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6257 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6259 ; GCN2-LABEL: flat_atomic_min_i32_noret_offset:
6261 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6262 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
6263 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6264 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
6265 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6266 ; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start
6267 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6268 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6269 ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
6270 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6271 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6272 ; GCN2-NEXT: buffer_wbinvl1_vol
6273 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6274 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6275 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
6276 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6277 ; GCN2-NEXT: s_cbranch_execnz .LBB118_1
6278 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6279 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6280 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6282 ; GCN3-LABEL: flat_atomic_min_i32_noret_offset:
6284 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6285 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
6286 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6287 ; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start
6288 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6289 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6290 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
6291 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6292 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6293 ; GCN3-NEXT: buffer_wbinvl1_vol
6294 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6295 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6296 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6297 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6298 ; GCN3-NEXT: s_cbranch_execnz .LBB118_1
6299 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6300 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6301 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; i32 element index 4 is a 16-byte displacement, i.e. the constant 16 that
; appears in the expected address computation above.
6302 %gep = getelementptr i32, ptr %out, i32 4
6303 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
; Test: seq_cst signed `atomicrmw min` on a flat pointer passed in VGPRs, with
; the previous memory value captured in %result.
; The expected code is identical on bonaire, tonga and gfx900: load the
; current value into v3, then loop { copy v3 to v4 as the expected value,
; v_min_i32, flat_atomic_cmpswap (glc) } until the cmpswap returns the
; expected value. After the loop v3 is copied to v0 before the return.
6307 define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
6308 ; GCN1-LABEL: flat_atomic_min_i32_ret:
6310 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6311 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6312 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6313 ; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start
6314 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6315 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6316 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
6317 ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
6318 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6319 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6320 ; GCN1-NEXT: buffer_wbinvl1_vol
6321 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6322 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6323 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6324 ; GCN1-NEXT: s_cbranch_execnz .LBB119_1
6325 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6326 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6327 ; GCN1-NEXT: v_mov_b32_e32 v0, v3
6328 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6330 ; GCN2-LABEL: flat_atomic_min_i32_ret:
6332 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6333 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
6334 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6335 ; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start
6336 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6337 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6338 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
6339 ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
6340 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6341 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6342 ; GCN2-NEXT: buffer_wbinvl1_vol
6343 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6344 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6345 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6346 ; GCN2-NEXT: s_cbranch_execnz .LBB119_1
6347 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6348 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6349 ; GCN2-NEXT: v_mov_b32_e32 v0, v3
6350 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6352 ; GCN3-LABEL: flat_atomic_min_i32_ret:
6354 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6355 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
6356 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6357 ; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start
6358 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6359 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6360 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6361 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
6362 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
6363 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6364 ; GCN3-NEXT: buffer_wbinvl1_vol
6365 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6366 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6367 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6368 ; GCN3-NEXT: s_cbranch_execnz .LBB119_1
6369 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6370 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6371 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
6372 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6373 %result = atomicrmw min ptr %ptr, i32 %in seq_cst
; Same signed-min atomicrmw as the plain "ret" case, but applied to
; %out + 4 i32 elements (a 16-byte displacement), in a function returning i32.
; The GCN1 and GCN2 checks materialize the address with a 64-bit VALU add of
; 16 (v_add + v_addc into v[3:4]) before the compare-and-swap loop and keep the
; loop value in v0. The GCN3 checks instead leave the base in v[0:1], put
; offset:16 on both flat_load_dword and flat_atomic_cmpswap, and copy the
; result from v3 to v0 after the loop.
6377 define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
6378 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset:
6380 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6381 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
6382 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6383 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
6384 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
6385 ; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start
6386 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6387 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6388 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
6389 ; GCN1-NEXT: v_min_i32_e32 v0, v1, v2
6390 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6391 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6392 ; GCN1-NEXT: buffer_wbinvl1_vol
6393 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6394 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6395 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
6396 ; GCN1-NEXT: s_cbranch_execnz .LBB120_1
6397 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6398 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6399 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6401 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset:
6403 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6404 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
6405 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
6406 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
6407 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
6408 ; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start
6409 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6410 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6411 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
6412 ; GCN2-NEXT: v_min_i32_e32 v0, v1, v2
6413 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
6414 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6415 ; GCN2-NEXT: buffer_wbinvl1_vol
6416 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
6417 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6418 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
6419 ; GCN2-NEXT: s_cbranch_execnz .LBB120_1
6420 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6421 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6422 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6424 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset:
6426 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6427 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
6428 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
6429 ; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start
6430 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6431 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6432 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
6433 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
6434 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
6435 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6436 ; GCN3-NEXT: buffer_wbinvl1_vol
6437 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
6438 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6439 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
6440 ; GCN3-NEXT: s_cbranch_execnz .LBB120_1
6441 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6442 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6443 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
6444 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6445 %gep = getelementptr i32, ptr %out, i32 4
6446 %result = atomicrmw min ptr %gep, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) in an amdgpu_gfx function returning void,
; with both arguments marked inreg. Per the checks below, the pointer is copied
; from s[4:5] into v[0:1] and the operand is read directly from s6 as a source
; of v_min_i32. All three targets expect the same compare-and-swap loop:
; flat_load_dword once, then v_min_i32 + flat_atomic_cmpswap ... glc until the
; returned value matches the compared value. The value returned by a failed
; cmpswap is moved into v3 for the next iteration. s[34:35] accumulates the
; finished lanes and masks exec.
6450 define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
6451 ; GCN1-LABEL: flat_atomic_min_i32_noret_scalar:
6453 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6454 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
6455 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
6456 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6457 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6458 ; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start
6459 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6460 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6461 ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
6462 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6463 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6464 ; GCN1-NEXT: buffer_wbinvl1_vol
6465 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6466 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6467 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
6468 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6469 ; GCN1-NEXT: s_cbranch_execnz .LBB121_1
6470 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6471 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6472 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6474 ; GCN2-LABEL: flat_atomic_min_i32_noret_scalar:
6476 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6477 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
6478 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
6479 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
6480 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6481 ; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start
6482 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6483 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6484 ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
6485 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6486 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6487 ; GCN2-NEXT: buffer_wbinvl1_vol
6488 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6489 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6490 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
6491 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6492 ; GCN2-NEXT: s_cbranch_execnz .LBB121_1
6493 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6494 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6495 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6497 ; GCN3-LABEL: flat_atomic_min_i32_noret_scalar:
6499 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6500 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6501 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6502 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
6503 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6504 ; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start
6505 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6506 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6507 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
6508 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6509 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6510 ; GCN3-NEXT: buffer_wbinvl1_vol
6511 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6512 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6513 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
6514 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6515 ; GCN3-NEXT: s_cbranch_execnz .LBB121_1
6516 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6517 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6518 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6519 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) on %out + 4 i32 elements (16 bytes) in an
; amdgpu_gfx function returning void, with both arguments marked inreg.
; The GCN1 and GCN2 checks compute the address with a scalar 64-bit add
; (s_add_u32 / s_addc_u32 into s[34:35]) and copy it to v[0:1]; s[34:35] is
; then reused as the loop's finished-lane mask. The GCN3 checks copy the
; unmodified base from s[4:5] and put offset:16 on flat_load_dword and
; flat_atomic_cmpswap instead. The operand is read from s6 on all targets.
6523 define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
6524 ; GCN1-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6526 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6527 ; GCN1-NEXT: s_add_u32 s34, s4, 16
6528 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6529 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
6530 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
6531 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6532 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6533 ; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start
6534 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6535 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6536 ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
6537 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6538 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6539 ; GCN1-NEXT: buffer_wbinvl1_vol
6540 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6541 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6542 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
6543 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6544 ; GCN1-NEXT: s_cbranch_execnz .LBB122_1
6545 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6546 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6547 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6549 ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6551 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6552 ; GCN2-NEXT: s_add_u32 s34, s4, 16
6553 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6554 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
6555 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
6556 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
6557 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6558 ; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start
6559 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6560 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6561 ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
6562 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6563 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6564 ; GCN2-NEXT: buffer_wbinvl1_vol
6565 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6566 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6567 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
6568 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6569 ; GCN2-NEXT: s_cbranch_execnz .LBB122_1
6570 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6571 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6572 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6574 ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar:
6576 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6577 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6578 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6579 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
6580 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6581 ; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start
6582 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6583 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6584 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
6585 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6586 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6587 ; GCN3-NEXT: buffer_wbinvl1_vol
6588 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6589 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6590 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
6591 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6592 ; GCN3-NEXT: s_cbranch_execnz .LBB122_1
6593 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6594 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6595 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6596 %gep = getelementptr i32, ptr %out, i32 4
6597 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) in an amdgpu_gfx function returning i32, with
; both arguments marked inreg. Per the checks below, the initial
; flat_load_dword addresses through v[0:1] (copied from s[4:5]) and writes v0;
; the pointer is then copied a second time into v[1:2] for use inside the
; loop, so that v0 can carry the loaded / cmpswap-returned value. The operand
; is read from s6. The loop is the usual v_min_i32 + flat_atomic_cmpswap ... glc
; retry, with s[34:35] as the finished-lane mask; the result is left in v0.
6601 define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
6602 ; GCN1-LABEL: flat_atomic_min_i32_ret_scalar:
6604 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6605 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
6606 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
6607 ; GCN1-NEXT: flat_load_dword v0, v[0:1]
6608 ; GCN1-NEXT: v_mov_b32_e32 v1, s4
6609 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6610 ; GCN1-NEXT: v_mov_b32_e32 v2, s5
6611 ; GCN1-NEXT: .LBB123_1: ; %atomicrmw.start
6612 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6613 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6614 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
6615 ; GCN1-NEXT: v_min_i32_e32 v3, s6, v4
6616 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6617 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6618 ; GCN1-NEXT: buffer_wbinvl1_vol
6619 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6620 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6621 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6622 ; GCN1-NEXT: s_cbranch_execnz .LBB123_1
6623 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6624 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6625 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6627 ; GCN2-LABEL: flat_atomic_min_i32_ret_scalar:
6629 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6630 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
6631 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
6632 ; GCN2-NEXT: flat_load_dword v0, v[0:1]
6633 ; GCN2-NEXT: v_mov_b32_e32 v1, s4
6634 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6635 ; GCN2-NEXT: v_mov_b32_e32 v2, s5
6636 ; GCN2-NEXT: .LBB123_1: ; %atomicrmw.start
6637 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6638 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6639 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
6640 ; GCN2-NEXT: v_min_i32_e32 v3, s6, v4
6641 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6642 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6643 ; GCN2-NEXT: buffer_wbinvl1_vol
6644 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6645 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6646 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6647 ; GCN2-NEXT: s_cbranch_execnz .LBB123_1
6648 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6649 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6650 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6652 ; GCN3-LABEL: flat_atomic_min_i32_ret_scalar:
6654 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6655 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6656 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6657 ; GCN3-NEXT: flat_load_dword v0, v[0:1]
6658 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
6659 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6660 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
6661 ; GCN3-NEXT: .LBB123_1: ; %atomicrmw.start
6662 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6663 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6664 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
6665 ; GCN3-NEXT: v_min_i32_e32 v3, s6, v4
6666 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6667 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6668 ; GCN3-NEXT: buffer_wbinvl1_vol
6669 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6670 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6671 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6672 ; GCN3-NEXT: s_cbranch_execnz .LBB123_1
6673 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6674 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6675 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6676 %result = atomicrmw min ptr %ptr, i32 %in seq_cst
; Signed-min atomicrmw (seq_cst) on %out + 4 i32 elements (16 bytes) in an
; amdgpu_gfx function returning i32, with both arguments marked inreg.
; The GCN1 and GCN2 checks add 16 to the pointer with s_add_u32 / s_addc_u32
; and copy the sum into v[1:2], which addresses both the initial load and the
; cmpswap. The GCN3 checks keep the original base (first in v[0:1] for the
; load, then re-copied into v[1:2] for the loop) and use offset:16 on both
; memory instructions. The operand is read from s6 and the result is left in
; v0 on all targets.
6680 define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
6681 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6683 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6684 ; GCN1-NEXT: s_add_u32 s34, s4, 16
6685 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6686 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
6687 ; GCN1-NEXT: v_mov_b32_e32 v2, s35
6688 ; GCN1-NEXT: flat_load_dword v0, v[1:2]
6689 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6690 ; GCN1-NEXT: .LBB124_1: ; %atomicrmw.start
6691 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6692 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6693 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
6694 ; GCN1-NEXT: v_min_i32_e32 v3, s6, v4
6695 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6696 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6697 ; GCN1-NEXT: buffer_wbinvl1_vol
6698 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6699 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6700 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
6701 ; GCN1-NEXT: s_cbranch_execnz .LBB124_1
6702 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6703 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
6704 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6706 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6708 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6709 ; GCN2-NEXT: s_add_u32 s34, s4, 16
6710 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
6711 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
6712 ; GCN2-NEXT: v_mov_b32_e32 v2, s35
6713 ; GCN2-NEXT: flat_load_dword v0, v[1:2]
6714 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
6715 ; GCN2-NEXT: .LBB124_1: ; %atomicrmw.start
6716 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6717 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6718 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
6719 ; GCN2-NEXT: v_min_i32_e32 v3, s6, v4
6720 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
6721 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6722 ; GCN2-NEXT: buffer_wbinvl1_vol
6723 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6724 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6725 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
6726 ; GCN2-NEXT: s_cbranch_execnz .LBB124_1
6727 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6728 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
6729 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6731 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar:
6733 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6734 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6735 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6736 ; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
6737 ; GCN3-NEXT: v_mov_b32_e32 v1, s4
6738 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
6739 ; GCN3-NEXT: v_mov_b32_e32 v2, s5
6740 ; GCN3-NEXT: .LBB124_1: ; %atomicrmw.start
6741 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6742 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6743 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
6744 ; GCN3-NEXT: v_min_i32_e32 v3, s6, v4
6745 ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
6746 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6747 ; GCN3-NEXT: buffer_wbinvl1_vol
6748 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
6749 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
6750 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
6751 ; GCN3-NEXT: s_cbranch_execnz .LBB124_1
6752 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6753 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
6754 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6755 %gep = getelementptr i32, ptr %out, i32 4
6756 %result = atomicrmw min ptr %gep, i32 %in seq_cst
; Kernel variant: signed-min atomicrmw (seq_cst) on
; %out + %index i32 elements + 4 i32 elements (16 bytes).
; Per the checks below, the kernel arguments are fetched with one
; s_load_dwordx4 through s[4:5]. The i32 index is sign-extended (s_ashr_i32 by
; 31 for the high half), scaled by 4 with s_lshl_b64 ... 2, and added to the
; base with s_add_u32 / s_addc_u32. The GCN1 and GCN2 checks add the constant
; 16 with a second scalar add pair; the GCN3 checks use offset:16 on the flat
; instructions instead. The compare-and-swap loop takes the operand from s2,
; uses s[0:1] as its finished-lane mask, and is followed by s_endpgm.
6760 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %index) {
6761 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
6762 ; GCN1: ; %bb.0: ; %entry
6763 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6764 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6765 ; GCN1-NEXT: s_ashr_i32 s5, s3, 31
6766 ; GCN1-NEXT: s_mov_b32 s4, s3
6767 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6768 ; GCN1-NEXT: s_add_u32 s0, s0, s4
6769 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
6770 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6771 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6772 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6773 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6774 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6775 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
6776 ; GCN1-NEXT: .LBB125_1: ; %atomicrmw.start
6777 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6778 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6779 ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3
6780 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6781 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6782 ; GCN1-NEXT: buffer_wbinvl1_vol
6783 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6784 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6785 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
6786 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
6787 ; GCN1-NEXT: s_cbranch_execnz .LBB125_1
6788 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6789 ; GCN1-NEXT: s_endpgm
6791 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
6792 ; GCN2: ; %bb.0: ; %entry
6793 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6794 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6795 ; GCN2-NEXT: s_ashr_i32 s5, s3, 31
6796 ; GCN2-NEXT: s_mov_b32 s4, s3
6797 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6798 ; GCN2-NEXT: s_add_u32 s0, s0, s4
6799 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
6800 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6801 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6802 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6803 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6804 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
6805 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
6806 ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
6807 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6808 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6809 ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
6810 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6811 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6812 ; GCN2-NEXT: buffer_wbinvl1_vol
6813 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6814 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6815 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
6816 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
6817 ; GCN2-NEXT: s_cbranch_execnz .LBB125_1
6818 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6819 ; GCN2-NEXT: s_endpgm
6821 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
6822 ; GCN3: ; %bb.0: ; %entry
6823 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6824 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6825 ; GCN3-NEXT: s_ashr_i32 s5, s3, 31
6826 ; GCN3-NEXT: s_mov_b32 s4, s3
6827 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6828 ; GCN3-NEXT: s_add_u32 s0, s0, s4
6829 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
6830 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6831 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6832 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
6833 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
6834 ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start
6835 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6836 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6837 ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
6838 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6839 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6840 ; GCN3-NEXT: buffer_wbinvl1_vol
6841 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6842 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6843 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
6844 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
6845 ; GCN3-NEXT: s_cbranch_execnz .LBB125_1
6846 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6847 ; GCN3-NEXT: s_endpgm
6849 %ptr = getelementptr i32, ptr %out, i32 %index
6850 %gep = getelementptr i32, ptr %ptr, i32 4
6851 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
; Kernel variant with the result used: signed-min atomicrmw (seq_cst) on
; %out + %index i32 elements + 4 i32 elements (16 bytes), with the value
; returned by the atomicrmw stored to %out2.
; Per the checks below, the arguments are fetched with s_load_dwordx4 (into
; s[0:3]) and s_load_dwordx2 (into s[6:7]) through s[4:5]. The index in s7 is
; sign-extended, shifted left by 2 and added to the base in s[0:1]; the
; operand is s6. The GCN1 and GCN2 checks add the constant 16 with scalar adds,
; while the GCN3 checks use offset:16. After the compare-and-swap loop, exec is
; restored from s[0:1] and the result in v2 is written with flat_store_dword
; to the address held in s[2:3].
6855 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i32 %index) {
6856 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
6857 ; GCN1: ; %bb.0: ; %entry
6858 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
6859 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
6860 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6861 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
6862 ; GCN1-NEXT: s_mov_b32 s4, s7
6863 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6864 ; GCN1-NEXT: s_add_u32 s0, s0, s4
6865 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
6866 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6867 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6868 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6869 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6870 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
6871 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
6872 ; GCN1-NEXT: .LBB126_1: ; %atomicrmw.start
6873 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6874 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6875 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
6876 ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
6877 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6878 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6879 ; GCN1-NEXT: buffer_wbinvl1_vol
6880 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6881 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6882 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
6883 ; GCN1-NEXT: s_cbranch_execnz .LBB126_1
6884 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6885 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
6886 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6887 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6888 ; GCN1-NEXT: flat_store_dword v[0:1], v2
6889 ; GCN1-NEXT: s_endpgm
6891 ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
6892 ; GCN2: ; %bb.0: ; %entry
6893 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
6894 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6895 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6896 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
6897 ; GCN2-NEXT: s_mov_b32 s4, s7
6898 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6899 ; GCN2-NEXT: s_add_u32 s0, s0, s4
6900 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
6901 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6902 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6903 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6904 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6905 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
6906 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
6907 ; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
6908 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6909 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6910 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
6911 ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
6912 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6913 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6914 ; GCN2-NEXT: buffer_wbinvl1_vol
6915 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6916 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6917 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
6918 ; GCN2-NEXT: s_cbranch_execnz .LBB126_1
6919 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
6920 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
6921 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6922 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6923 ; GCN2-NEXT: flat_store_dword v[0:1], v2
6924 ; GCN2-NEXT: s_endpgm
6926 ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
6927 ; GCN3: ; %bb.0: ; %entry
6928 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
6929 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6930 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6931 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
6932 ; GCN3-NEXT: s_mov_b32 s4, s7
6933 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6934 ; GCN3-NEXT: s_add_u32 s0, s0, s4
6935 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
6936 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6937 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6938 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16
6939 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
6940 ; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
6941 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6942 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6943 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
6944 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
6945 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6946 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6947 ; GCN3-NEXT: buffer_wbinvl1_vol
6948 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6949 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6950 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
6951 ; GCN3-NEXT: s_cbranch_execnz .LBB126_1
6952 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
6953 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
6954 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6955 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6956 ; GCN3-NEXT: flat_store_dword v[0:1], v2
6957 ; GCN3-NEXT: s_endpgm
6959 %ptr = getelementptr i32, ptr %out, i32 %index
6960 %gep = getelementptr i32, ptr %ptr, i32 4
6961 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
6962 store i32 %tmp0, ptr %out2
; Simplest kernel variant: signed-min atomicrmw (seq_cst) directly on %out,
; with no index and no constant offset.
; Per the checks below, the pointer is fetched with s_load_dwordx2 into s[6:7]
; and the operand with s_load_dword into s2, both through s[4:5]. The pointer
; is copied to v[0:1] for the initial flat_load_dword and for the
; flat_atomic_cmpswap ... glc retry loop. s[0:1] is zeroed before the loop and
; used as its finished-lane mask. The expected sequence is identical on all
; three targets apart from the argument-load offsets.
6966 define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
6967 ; GCN1-LABEL: atomic_min_i32:
6968 ; GCN1: ; %bb.0: ; %entry
6969 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x9
6970 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0xb
6971 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
6972 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6973 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
6974 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
6975 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6976 ; GCN1-NEXT: .LBB127_1: ; %atomicrmw.start
6977 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6978 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6979 ; GCN1-NEXT: v_min_i32_e32 v2, s2, v3
6980 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6981 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6982 ; GCN1-NEXT: buffer_wbinvl1_vol
6983 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
6984 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
6985 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
6986 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
6987 ; GCN1-NEXT: s_cbranch_execnz .LBB127_1
6988 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
6989 ; GCN1-NEXT: s_endpgm
6991 ; GCN2-LABEL: atomic_min_i32:
6992 ; GCN2: ; %bb.0: ; %entry
6993 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
6994 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x2c
6995 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
6996 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6997 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
6998 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
6999 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
7000 ; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
7001 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7002 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7003 ; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
7004 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7005 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7006 ; GCN2-NEXT: buffer_wbinvl1_vol
7007 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7008 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7009 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
7010 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
7011 ; GCN2-NEXT: s_cbranch_execnz .LBB127_1
7012 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7013 ; GCN2-NEXT: s_endpgm
7015 ; GCN3-LABEL: atomic_min_i32:
7016 ; GCN3: ; %bb.0: ; %entry
7017 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
7018 ; GCN3-NEXT: s_load_dword s2, s[4:5], 0x2c
7019 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
7020 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7021 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7022 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7023 ; GCN3-NEXT: flat_load_dword v3, v[0:1]
7024 ; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
7025 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7026 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7027 ; GCN3-NEXT: v_min_i32_e32 v2, s2, v3
7028 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7029 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7030 ; GCN3-NEXT: buffer_wbinvl1_vol
7031 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7032 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7033 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
7034 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
7035 ; GCN3-NEXT: s_cbranch_execnz .LBB127_1
7036 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7037 ; GCN3-NEXT: s_endpgm
7039 %tmp0 = atomicrmw min ptr %out, i32 %in seq_cst
; Kernel variant with an index but no constant offset: signed-min atomicrmw
; (seq_cst) on %out + %index i32 elements, with the value returned by the
; atomicrmw stored to %out2.
; Per the checks below, the index in s7 is sign-extended (s_ashr_i32 by 31),
; scaled with s_lshl_b64 ... 2 and added to the base in s[0:1]; the operand is
; s6. Because there is no constant displacement, none of the three targets has
; an extra add of 16 or an offset: modifier, and the expected sequences differ
; only in the argument-load offsets. After the compare-and-swap loop the
; result in v2 is written with flat_store_dword to the address in s[2:3].
7043 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i32 %index) {
7044 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
7045 ; GCN1: ; %bb.0: ; %entry
7046 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
7047 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
7048 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7049 ; GCN1-NEXT: s_ashr_i32 s5, s7, 31
7050 ; GCN1-NEXT: s_mov_b32 s4, s7
7051 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7052 ; GCN1-NEXT: s_add_u32 s0, s0, s4
7053 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
7054 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7055 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7056 ; GCN1-NEXT: flat_load_dword v2, v[0:1]
7057 ; GCN1-NEXT: s_mov_b64 s[0:1], 0
7058 ; GCN1-NEXT: .LBB128_1: ; %atomicrmw.start
7059 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7060 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7061 ; GCN1-NEXT: v_mov_b32_e32 v3, v2
7062 ; GCN1-NEXT: v_min_i32_e32 v2, s6, v3
7063 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7064 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7065 ; GCN1-NEXT: buffer_wbinvl1_vol
7066 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7067 ; GCN1-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7068 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[0:1]
7069 ; GCN1-NEXT: s_cbranch_execnz .LBB128_1
7070 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7071 ; GCN1-NEXT: s_or_b64 exec, exec, s[0:1]
7072 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
7073 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
7074 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7075 ; GCN1-NEXT: s_endpgm
7077 ; GCN2-LABEL: atomic_min_i32_ret_addr64:
7078 ; GCN2: ; %bb.0: ; %entry
7079 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
7080 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7081 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7082 ; GCN2-NEXT: s_ashr_i32 s5, s7, 31
7083 ; GCN2-NEXT: s_mov_b32 s4, s7
7084 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7085 ; GCN2-NEXT: s_add_u32 s0, s0, s4
7086 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
7087 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7088 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7089 ; GCN2-NEXT: flat_load_dword v2, v[0:1]
7090 ; GCN2-NEXT: s_mov_b64 s[0:1], 0
7091 ; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
7092 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7093 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7094 ; GCN2-NEXT: v_mov_b32_e32 v3, v2
7095 ; GCN2-NEXT: v_min_i32_e32 v2, s6, v3
7096 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7097 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7098 ; GCN2-NEXT: buffer_wbinvl1_vol
7099 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7100 ; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7101 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
7102 ; GCN2-NEXT: s_cbranch_execnz .LBB128_1
7103 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7104 ; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
7105 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
7106 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
7107 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7108 ; GCN2-NEXT: s_endpgm
7110 ; GCN3-LABEL: atomic_min_i32_ret_addr64:
7111 ; GCN3: ; %bb.0: ; %entry
7112 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
7113 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
7114 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7115 ; GCN3-NEXT: s_ashr_i32 s5, s7, 31
7116 ; GCN3-NEXT: s_mov_b32 s4, s7
7117 ; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
7118 ; GCN3-NEXT: s_add_u32 s0, s0, s4
7119 ; GCN3-NEXT: s_addc_u32 s1, s1, s5
7120 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7121 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7122 ; GCN3-NEXT: flat_load_dword v2, v[0:1]
7123 ; GCN3-NEXT: s_mov_b64 s[0:1], 0
7124 ; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
7125 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7126 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7127 ; GCN3-NEXT: v_mov_b32_e32 v3, v2
7128 ; GCN3-NEXT: v_min_i32_e32 v2, s6, v3
7129 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7130 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7131 ; GCN3-NEXT: buffer_wbinvl1_vol
7132 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
7133 ; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
7134 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
7135 ; GCN3-NEXT: s_cbranch_execnz .LBB128_1
7136 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7137 ; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
7138 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7139 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7140 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7141 ; GCN3-NEXT: s_endpgm
7143 %ptr = getelementptr i32, ptr %out, i32 %index
7144 %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
7145 store i32 %tmp0, ptr %out2
; seq_cst signed-min atomicrmw on a flat pointer 4 x i32 (16 bytes) past %out,
; tagged with !amdgpu.no.remote.memory. The result %tmp0 has no use in the
; lines shown here.
; All three run lines are expected to produce a retry loop (flat_load_dword,
; v_min_i32, flat_atomic_cmpswap, compare, branch back) instead of one atomic
; instruction. The bonaire and tonga checks add 16 to the address with a
; 64-bit add first; the gfx900 checks use the offset field of the memory
; instructions.
7149 define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7150 ; GCN1-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7152 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7153 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7154 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7155 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
7156 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
7157 ; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start
7158 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7159 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7160 ; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
7161 ; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7162 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7163 ; GCN1-NEXT: buffer_wbinvl1_vol
7164 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7165 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7166 ; GCN1-NEXT: v_mov_b32_e32 v4, v3
7167 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
7168 ; GCN1-NEXT: s_cbranch_execnz .LBB129_1
7169 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7170 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7171 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7173 ; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7175 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7176 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7177 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7178 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
7179 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
7180 ; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start
7181 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7182 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7183 ; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
7184 ; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
7185 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7186 ; GCN2-NEXT: buffer_wbinvl1_vol
7187 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7188 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7189 ; GCN2-NEXT: v_mov_b32_e32 v4, v3
7190 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
7191 ; GCN2-NEXT: s_cbranch_execnz .LBB129_1
7192 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7193 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
7194 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7196 ; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
7198 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7199 ; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
7200 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
7201 ; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start
7202 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7203 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7204 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
7205 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
7206 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7207 ; GCN3-NEXT: buffer_wbinvl1_vol
7208 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7209 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7210 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
7211 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
7212 ; GCN3-NEXT: s_cbranch_execnz .LBB129_1
7213 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7214 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
7215 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7216 %gep = getelementptr i32, ptr %out, i64 4
7217 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-producing form of the signed-min test. The function is declared to
; return i32 and the seq_cst atomicrmw min (16 bytes past %out, tagged with
; !amdgpu.no.remote.memory) is bound to %result.
; Every run line is expected to produce a flat_atomic_cmpswap retry loop. In
; the bonaire and tonga checks the loop value already sits in v0 when the
; loop exits; in the gfx900 checks it sits in v3 and is copied to v0 just
; before s_setpc_b64.
7221 define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7222 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7224 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7225 ; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
7226 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
7227 ; GCN1-NEXT: flat_load_dword v0, v[3:4]
7228 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
7229 ; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start
7230 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7231 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7232 ; GCN1-NEXT: v_mov_b32_e32 v1, v0
7233 ; GCN1-NEXT: v_min_i32_e32 v0, v1, v2
7234 ; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7235 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7236 ; GCN1-NEXT: buffer_wbinvl1_vol
7237 ; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7238 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7239 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
7240 ; GCN1-NEXT: s_cbranch_execnz .LBB130_1
7241 ; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
7242 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7243 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7245 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7247 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7248 ; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
7249 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
7250 ; GCN2-NEXT: flat_load_dword v0, v[3:4]
7251 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
7252 ; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start
7253 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7254 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7255 ; GCN2-NEXT: v_mov_b32_e32 v1, v0
7256 ; GCN2-NEXT: v_min_i32_e32 v0, v1, v2
7257 ; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
7258 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7259 ; GCN2-NEXT: buffer_wbinvl1_vol
7260 ; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
7261 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7262 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
7263 ; GCN2-NEXT: s_cbranch_execnz .LBB130_1
7264 ; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
7265 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
7266 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7268 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
7270 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7271 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
7272 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
7273 ; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start
7274 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7275 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7276 ; GCN3-NEXT: v_mov_b32_e32 v4, v3
7277 ; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
7278 ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
7279 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7280 ; GCN3-NEXT: buffer_wbinvl1_vol
7281 ; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
7282 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7283 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
7284 ; GCN3-NEXT: s_cbranch_execnz .LBB130_1
7285 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
7286 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
7287 ; GCN3-NEXT: v_mov_b32_e32 v0, v3
7288 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7289 %gep = getelementptr i32, ptr %out, i64 4
7290 %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7294 ; ---------------------------------------------------------------------
7295 ; atomicrmw uinc_wrap
7296 ; ---------------------------------------------------------------------
; seq_cst atomicrmw uinc_wrap directly on the incoming flat pointer %ptr; the
; result %tmp0 has no use in the lines shown here.
; All three run lines expect a single flat_atomic_inc without glc, followed
; by s_waitcnt and buffer_wbinvl1_vol before the return.
7298 define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
7299 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
7301 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7302 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7303 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7304 ; GCN1-NEXT: buffer_wbinvl1_vol
7305 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7307 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
7309 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7310 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7311 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7312 ; GCN2-NEXT: buffer_wbinvl1_vol
7313 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7315 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
7317 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7318 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2
7319 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7320 ; GCN3-NEXT: buffer_wbinvl1_vol
7321 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7322 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
; seq_cst atomicrmw uinc_wrap at %out + 4 x i32 (16 bytes); the result %tmp0
; has no use in the lines shown here.
; The bonaire and tonga checks add 16 to the 64-bit address in v[0:1] before
; flat_atomic_inc; the gfx900 checks put the 16 in the instruction's offset
; field and need no add.
7326 define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
7327 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7329 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7330 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7331 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7332 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7333 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7334 ; GCN1-NEXT: buffer_wbinvl1_vol
7335 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7337 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7339 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7340 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7341 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7342 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7343 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7344 ; GCN2-NEXT: buffer_wbinvl1_vol
7345 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7347 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
7349 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7350 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
7351 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7352 ; GCN3-NEXT: buffer_wbinvl1_vol
7353 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7354 %gep = getelementptr i32, ptr %out, i32 4
7355 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
; Value-producing seq_cst atomicrmw uinc_wrap on %ptr; the function is
; declared to return i32 and the atomic's result is bound to %result.
; All three run lines expect one flat_atomic_inc with the glc bit set that
; writes its result into v0.
7359 define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
7360 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
7362 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7363 ; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7364 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7365 ; GCN1-NEXT: buffer_wbinvl1_vol
7366 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7368 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
7370 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7371 ; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7372 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7373 ; GCN2-NEXT: buffer_wbinvl1_vol
7374 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7376 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
7378 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7379 ; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7380 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7381 ; GCN3-NEXT: buffer_wbinvl1_vol
7382 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7383 %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
; Value-producing seq_cst atomicrmw uinc_wrap at %out + 4 x i32 (16 bytes);
; the atomic's result is bound to %result and the function is declared i32.
; The bonaire and tonga checks add 16 to v[0:1] and then issue
; flat_atomic_inc with glc; the gfx900 checks issue the same atomic with the
; 16 carried in the offset field.
7387 define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
7388 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7390 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7391 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7392 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7393 ; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7394 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7395 ; GCN1-NEXT: buffer_wbinvl1_vol
7396 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7398 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7400 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7401 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7402 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7403 ; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7404 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7405 ; GCN2-NEXT: buffer_wbinvl1_vol
7406 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7408 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
7410 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7411 ; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7412 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7413 ; GCN3-NEXT: buffer_wbinvl1_vol
7414 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7415 %gep = getelementptr i32, ptr %out, i32 4
7416 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
; amdgpu_gfx function whose pointer and value arguments are both marked
; inreg; it performs a seq_cst atomicrmw uinc_wrap on %ptr and the result
; %tmp0 has no use in the lines shown here.
; All three run lines expect s4, s5 and s6 to be copied into v0, v1 and v2,
; which then serve as the address pair and data operand of flat_atomic_inc.
7420 define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
7421 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7423 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7424 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7425 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7426 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7427 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7428 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7429 ; GCN1-NEXT: buffer_wbinvl1_vol
7430 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7432 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7434 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7435 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7436 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7437 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7438 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7439 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7440 ; GCN2-NEXT: buffer_wbinvl1_vol
7441 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7443 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
7445 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7446 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7447 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7448 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7449 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2
7450 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7451 ; GCN3-NEXT: buffer_wbinvl1_vol
7452 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7453 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments; it performs a seq_cst atomicrmw
; uinc_wrap at %out + 4 x i32 (16 bytes) and the result %tmp0 has no use in
; the lines shown here.
; The bonaire and tonga checks add 16 with scalar instructions (s_add_u32 and
; s_addc_u32 into s34 and s35) before copying the address to VGPRs. The
; gfx900 checks copy s4 and s5 unchanged and carry the 16 in the offset field
; of flat_atomic_inc.
7457 define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7458 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7460 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7461 ; GCN1-NEXT: s_add_u32 s34, s4, 16
7462 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7463 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7464 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
7465 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7466 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7467 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7468 ; GCN1-NEXT: buffer_wbinvl1_vol
7469 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7471 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7473 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7474 ; GCN2-NEXT: s_add_u32 s34, s4, 16
7475 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7476 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7477 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7478 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7479 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7480 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7481 ; GCN2-NEXT: buffer_wbinvl1_vol
7482 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7484 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
7486 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7487 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7488 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7489 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7490 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
7491 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7492 ; GCN3-NEXT: buffer_wbinvl1_vol
7493 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7494 %gep = getelementptr i32, ptr %out, i32 4
7495 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments, declared to return i32; the
; seq_cst atomicrmw uinc_wrap on %ptr is bound to %result.
; All three run lines expect s4, s5 and s6 to be copied into v0, v1 and v2,
; followed by flat_atomic_inc with glc writing its result into v0.
7499 define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
7500 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7502 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7503 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7504 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7505 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7506 ; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7507 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7508 ; GCN1-NEXT: buffer_wbinvl1_vol
7509 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7511 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7513 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7514 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7515 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7516 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7517 ; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7518 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7519 ; GCN2-NEXT: buffer_wbinvl1_vol
7520 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7522 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
7524 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7525 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7526 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7527 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7528 ; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7529 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7530 ; GCN3-NEXT: buffer_wbinvl1_vol
7531 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7532 %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments, declared to return i32; the
; seq_cst atomicrmw uinc_wrap at %out + 4 x i32 (16 bytes) is bound to
; %result.
; The bonaire and tonga checks form the address with s_add_u32 and
; s_addc_u32 into s34 and s35 before the VGPR copies. The gfx900 checks copy
; s4 and s5 directly and use the offset field. Every target then issues
; flat_atomic_inc with glc into v0.
7536 define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7537 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7539 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7540 ; GCN1-NEXT: s_add_u32 s34, s4, 16
7541 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7542 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7543 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
7544 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7545 ; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7546 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7547 ; GCN1-NEXT: buffer_wbinvl1_vol
7548 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7550 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7552 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7553 ; GCN2-NEXT: s_add_u32 s34, s4, 16
7554 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7555 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7556 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7557 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7558 ; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7559 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7560 ; GCN2-NEXT: buffer_wbinvl1_vol
7561 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7563 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
7565 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7566 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7567 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7568 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7569 ; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7570 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7571 ; GCN3-NEXT: buffer_wbinvl1_vol
7572 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7573 %gep = getelementptr i32, ptr %out, i32 4
7574 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
; seq_cst atomicrmw uinc_wrap at %out + 4 x i32 (16 bytes), tagged with
; !amdgpu.no.remote.memory; the result %tmp0 has no use in the lines shown
; here.
; Every run line still expects a single flat_atomic_inc; no cmpswap loop
; appears in the checks. The bonaire and tonga checks add 16 to v[0:1]
; first; the gfx900 checks use the offset field.
7578 define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7579 ; GCN1-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7581 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7582 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7583 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7584 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7585 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7586 ; GCN1-NEXT: buffer_wbinvl1_vol
7587 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7589 ; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7591 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7592 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7593 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7594 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7595 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7596 ; GCN2-NEXT: buffer_wbinvl1_vol
7597 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7599 ; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7601 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7602 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
7603 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7604 ; GCN3-NEXT: buffer_wbinvl1_vol
7605 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7606 %gep = getelementptr i32, ptr %out, i64 4
7607 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-producing seq_cst atomicrmw uinc_wrap at %out + 4 x i32 (16 bytes),
; tagged with !amdgpu.no.remote.memory; the function is declared i32 and the
; atomic's result is bound to %result.
; Every run line expects a single flat_atomic_inc with glc into v0. The
; bonaire and tonga checks add 16 to the address first; the gfx900 checks
; use the offset field.
7611 define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7612 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7614 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7615 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7616 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7617 ; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7618 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7619 ; GCN1-NEXT: buffer_wbinvl1_vol
7620 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7622 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7624 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7625 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7626 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7627 ; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
7628 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7629 ; GCN2-NEXT: buffer_wbinvl1_vol
7630 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7632 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7634 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7635 ; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
7636 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7637 ; GCN3-NEXT: buffer_wbinvl1_vol
7638 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7639 %gep = getelementptr i32, ptr %out, i64 4
7640 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
7644 ; ---------------------------------------------------------------------
7645 ; atomicrmw udec_wrap
7646 ; ---------------------------------------------------------------------
; seq_cst atomicrmw udec_wrap directly on the incoming flat pointer %ptr; the
; result %tmp0 has no use in the lines shown here.
; All three run lines expect a single flat_atomic_dec without glc, followed
; by s_waitcnt and buffer_wbinvl1_vol before the return.
7648 define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
7649 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
7651 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7652 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7653 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7654 ; GCN1-NEXT: buffer_wbinvl1_vol
7655 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7657 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
7659 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7660 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7661 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7662 ; GCN2-NEXT: buffer_wbinvl1_vol
7663 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7665 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
7667 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7668 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2
7669 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7670 ; GCN3-NEXT: buffer_wbinvl1_vol
7671 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7672 %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
; seq_cst atomicrmw udec_wrap at %out + 4 x i32 (16 bytes); the result %tmp0
; has no use in the lines shown here.
; The bonaire and tonga checks add 16 to the 64-bit address in v[0:1] before
; flat_atomic_dec; the gfx900 checks put the 16 in the instruction's offset
; field and need no add.
7676 define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
7677 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7679 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7680 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7681 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7682 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7683 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7684 ; GCN1-NEXT: buffer_wbinvl1_vol
7685 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7687 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7689 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7690 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7691 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7692 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7693 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7694 ; GCN2-NEXT: buffer_wbinvl1_vol
7695 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7697 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
7699 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7700 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
7701 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7702 ; GCN3-NEXT: buffer_wbinvl1_vol
7703 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7704 %gep = getelementptr i32, ptr %out, i32 4
7705 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
; Value-producing seq_cst atomicrmw udec_wrap on %ptr; the function is
; declared to return i32 and the atomic's result is bound to %result.
; All three run lines expect one flat_atomic_dec with the glc bit set that
; writes its result into v0.
7709 define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
7710 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
7712 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7713 ; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7714 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7715 ; GCN1-NEXT: buffer_wbinvl1_vol
7716 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7718 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
7720 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7721 ; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7722 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7723 ; GCN2-NEXT: buffer_wbinvl1_vol
7724 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7726 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
7728 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7729 ; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7730 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7731 ; GCN3-NEXT: buffer_wbinvl1_vol
7732 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7733 %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
; Value-producing seq_cst atomicrmw udec_wrap at %out + 4 x i32 (16 bytes);
; the atomic's result is bound to %result and the function is declared i32.
; The bonaire and tonga checks add 16 to v[0:1] and then issue
; flat_atomic_dec with glc; the gfx900 checks issue the same atomic with the
; 16 carried in the offset field.
7737 define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
7738 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7740 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7741 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7742 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7743 ; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7744 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7745 ; GCN1-NEXT: buffer_wbinvl1_vol
7746 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7748 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7750 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7751 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7752 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7753 ; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7754 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7755 ; GCN2-NEXT: buffer_wbinvl1_vol
7756 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7758 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
7760 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7761 ; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7762 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7763 ; GCN3-NEXT: buffer_wbinvl1_vol
7764 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7765 %gep = getelementptr i32, ptr %out, i32 4
7766 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
; amdgpu_gfx function whose pointer and value arguments are both marked
; inreg; it performs a seq_cst atomicrmw udec_wrap on %ptr and the result
; %tmp0 has no use in the lines shown here.
; All three run lines expect s4, s5 and s6 to be copied into v0, v1 and v2,
; which then serve as the address pair and data operand of flat_atomic_dec.
7770 define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
7771 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7773 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7774 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7775 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7776 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7777 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7778 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7779 ; GCN1-NEXT: buffer_wbinvl1_vol
7780 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7782 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7784 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7785 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7786 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7787 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7788 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7789 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7790 ; GCN2-NEXT: buffer_wbinvl1_vol
7791 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7793 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
7795 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7796 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7797 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7798 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7799 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2
7800 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7801 ; GCN3-NEXT: buffer_wbinvl1_vol
7802 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7803 %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments; it performs a seq_cst atomicrmw
; udec_wrap at %out + 4 x i32 (16 bytes) and the result %tmp0 has no use in
; the lines shown here.
; The bonaire and tonga checks add 16 with scalar instructions (s_add_u32 and
; s_addc_u32 into s34 and s35) before copying the address to VGPRs. The
; gfx900 checks copy s4 and s5 unchanged and carry the 16 in the offset field
; of flat_atomic_dec.
7807 define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7808 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7810 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7811 ; GCN1-NEXT: s_add_u32 s34, s4, 16
7812 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7813 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7814 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
7815 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7816 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7817 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7818 ; GCN1-NEXT: buffer_wbinvl1_vol
7819 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7821 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7823 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7824 ; GCN2-NEXT: s_add_u32 s34, s4, 16
7825 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7826 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7827 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7828 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7829 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7830 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7831 ; GCN2-NEXT: buffer_wbinvl1_vol
7832 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7834 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
7836 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7837 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7838 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7839 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7840 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
7841 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7842 ; GCN3-NEXT: buffer_wbinvl1_vol
7843 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7844 %gep = getelementptr i32, ptr %out, i32 4
7845 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments, declared to return i32; the
; seq_cst atomicrmw udec_wrap on %ptr is bound to %result.
; All three run lines expect s4, s5 and s6 to be copied into v0, v1 and v2,
; followed by flat_atomic_dec with glc writing its result into v0.
7849 define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
7850 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7852 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7853 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7854 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7855 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7856 ; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7857 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7858 ; GCN1-NEXT: buffer_wbinvl1_vol
7859 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7861 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7863 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7864 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7865 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7866 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7867 ; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7868 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7869 ; GCN2-NEXT: buffer_wbinvl1_vol
7870 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7872 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
7874 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7875 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7876 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7877 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7878 ; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7879 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7880 ; GCN3-NEXT: buffer_wbinvl1_vol
7881 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7882 %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
; amdgpu_gfx function with inreg arguments, declared to return i32; the
; seq_cst atomicrmw udec_wrap at %out + 4 x i32 (16 bytes) is bound to
; %result.
; The bonaire and tonga checks form the address with s_add_u32 and
; s_addc_u32 into s34 and s35 before the VGPR copies. The gfx900 checks copy
; s4 and s5 directly and use the offset field. Every target then issues
; flat_atomic_dec with glc into v0.
7886 define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
7887 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7889 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7890 ; GCN1-NEXT: s_add_u32 s34, s4, 16
7891 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7892 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7893 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
7894 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7895 ; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7896 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7897 ; GCN1-NEXT: buffer_wbinvl1_vol
7898 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7900 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7902 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7903 ; GCN2-NEXT: s_add_u32 s34, s4, 16
7904 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7905 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7906 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7907 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7908 ; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7909 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7910 ; GCN2-NEXT: buffer_wbinvl1_vol
7911 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7913 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
7915 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7916 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7917 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7918 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7919 ; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7920 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7921 ; GCN3-NEXT: buffer_wbinvl1_vol
7922 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7923 %gep = getelementptr i32, ptr %out, i32 4
7924 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
; seq_cst atomicrmw udec_wrap at %out + 4 x i32 (16 bytes), tagged with
; !amdgpu.no.remote.memory; the result %tmp0 has no use in the lines shown
; here.
; Every run line still expects a single flat_atomic_dec; no cmpswap loop
; appears in the checks. The bonaire and tonga checks add 16 to v[0:1]
; first; the gfx900 checks use the offset field.
7928 define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7929 ; GCN1-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7931 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7932 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7933 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7934 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7935 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7936 ; GCN1-NEXT: buffer_wbinvl1_vol
7937 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7939 ; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7941 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7942 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7943 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7944 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7945 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7946 ; GCN2-NEXT: buffer_wbinvl1_vol
7947 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7949 ; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory:
7951 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7952 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
7953 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7954 ; GCN3-NEXT: buffer_wbinvl1_vol
7955 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7956 %gep = getelementptr i32, ptr %out, i64 4
7957 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning counterpart of the no-remote-memory udec_wrap test: same
; 16-byte offset and the same !amdgpu.no.remote.memory annotation, but the
; function is declared i32.
; Every expectation is a single flat_atomic_dec that writes v0 and carries glc,
; with no compare-and-swap loop.
; The GCN1 and GCN2 expectations add the offset with a 64-bit VALU add before
; the atomic; the GCN3 expectation uses offset:16 on the instruction.
7961 define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 %in) {
7962 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7964 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7965 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
7966 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7967 ; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7968 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7969 ; GCN1-NEXT: buffer_wbinvl1_vol
7970 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7972 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7974 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7975 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
7976 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7977 ; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
7978 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7979 ; GCN2-NEXT: buffer_wbinvl1_vol
7980 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7982 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory:
7984 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7985 ; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
7986 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7987 ; GCN3-NEXT: buffer_wbinvl1_vol
7988 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 over i32 elements is a 16-byte displacement.
7989 %gep = getelementptr i32, ptr %out, i64 4
; NOTE(review): !0 is defined outside this part of the file; presumably an
; empty metadata node -- confirm against the definition.
7990 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0