1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
6 ; ---------------------------------------------------------------------
8 ; ---------------------------------------------------------------------
; Flat i64 seq_cst exchange through a VGPR pointer (v[0:1]) whose result is unused.
; The expected code splits on the high dword of the pointer (v1)
;   %atomicrmw.global  - flat_atomic_swap_x2 in its no-return form (no glc),
;                        followed by buffer_wbinvl1_vol
;   %atomicrmw.private - the exchange degenerates to two buffer_store_dword,
;                        because the old value is never read
; The GCN1 and GCN2 checks compare v1 against a dword loaded from 0xe4, while the
; GCN3 checks compare against the high half of src_private_base. On the private
; path a null pointer is replaced by offset -1 (v_cndmask), and GCN3 folds the +4
; for the high dword into the store's immediate offset instead of a separate add.
10 define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
11 ; GCN1-LABEL: flat_atomic_xchg_i64_noret:
13 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
15 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
16 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
18 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
19 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20 ; GCN1-NEXT: s_cbranch_execnz .LBB0_3
21 ; GCN1-NEXT: ; %bb.1: ; %Flow
22 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
23 ; GCN1-NEXT: s_cbranch_execnz .LBB0_4
24 ; GCN1-NEXT: .LBB0_2: ; %atomicrmw.phi
25 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
26 ; GCN1-NEXT: s_setpc_b64 s[30:31]
27 ; GCN1-NEXT: .LBB0_3: ; %atomicrmw.global
28 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
29 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
30 ; GCN1-NEXT: buffer_wbinvl1_vol
31 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
32 ; GCN1-NEXT: ; implicit-def: $vgpr2
33 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
34 ; GCN1-NEXT: s_cbranch_execz .LBB0_2
35 ; GCN1-NEXT: .LBB0_4: ; %atomicrmw.private
36 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
37 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
38 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
39 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
40 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
41 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
42 ; GCN1-NEXT: s_waitcnt vmcnt(0)
43 ; GCN1-NEXT: s_setpc_b64 s[30:31]
45 ; GCN2-LABEL: flat_atomic_xchg_i64_noret:
47 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
49 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
50 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
51 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
52 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
53 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
54 ; GCN2-NEXT: s_cbranch_execnz .LBB0_3
55 ; GCN2-NEXT: ; %bb.1: ; %Flow
56 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
57 ; GCN2-NEXT: s_cbranch_execnz .LBB0_4
58 ; GCN2-NEXT: .LBB0_2: ; %atomicrmw.phi
59 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
60 ; GCN2-NEXT: s_setpc_b64 s[30:31]
61 ; GCN2-NEXT: .LBB0_3: ; %atomicrmw.global
62 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
63 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
64 ; GCN2-NEXT: buffer_wbinvl1_vol
65 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
66 ; GCN2-NEXT: ; implicit-def: $vgpr2
67 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
68 ; GCN2-NEXT: s_cbranch_execz .LBB0_2
69 ; GCN2-NEXT: .LBB0_4: ; %atomicrmw.private
70 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
71 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
72 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
73 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
74 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
75 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
76 ; GCN2-NEXT: s_waitcnt vmcnt(0)
77 ; GCN2-NEXT: s_setpc_b64 s[30:31]
79 ; GCN3-LABEL: flat_atomic_xchg_i64_noret:
81 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
83 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
84 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
85 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
86 ; GCN3-NEXT: s_cbranch_execnz .LBB0_3
87 ; GCN3-NEXT: ; %bb.1: ; %Flow
88 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
89 ; GCN3-NEXT: s_cbranch_execnz .LBB0_4
90 ; GCN3-NEXT: .LBB0_2: ; %atomicrmw.phi
91 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
92 ; GCN3-NEXT: s_setpc_b64 s[30:31]
93 ; GCN3-NEXT: .LBB0_3: ; %atomicrmw.global
94 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
95 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
96 ; GCN3-NEXT: buffer_wbinvl1_vol
97 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
98 ; GCN3-NEXT: ; implicit-def: $vgpr3
99 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
100 ; GCN3-NEXT: s_cbranch_execz .LBB0_2
101 ; GCN3-NEXT: .LBB0_4: ; %atomicrmw.private
102 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
103 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
104 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
105 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
106 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
107 ; GCN3-NEXT: s_waitcnt vmcnt(0)
108 ; GCN3-NEXT: s_setpc_b64 s[30:31]
109 %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; Same no-return i64 seq_cst exchange, but on the address %out + 4 i64 elements.
; The checks expect the 32-byte offset to be added to the 64-bit pointer with an
; add/add-with-carry pair before the high-dword comparison that selects between
; %atomicrmw.global (flat_atomic_swap_x2 without glc) and %atomicrmw.private
; (two buffer_store_dword). No offset is expected on the flat atomic itself.
113 define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
114 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset:
116 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
118 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
119 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
120 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
121 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
122 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
123 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
124 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
125 ; GCN1-NEXT: s_cbranch_execnz .LBB1_3
126 ; GCN1-NEXT: ; %bb.1: ; %Flow
127 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
128 ; GCN1-NEXT: s_cbranch_execnz .LBB1_4
129 ; GCN1-NEXT: .LBB1_2: ; %atomicrmw.phi
130 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
131 ; GCN1-NEXT: s_setpc_b64 s[30:31]
132 ; GCN1-NEXT: .LBB1_3: ; %atomicrmw.global
133 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
134 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
135 ; GCN1-NEXT: buffer_wbinvl1_vol
136 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
137 ; GCN1-NEXT: ; implicit-def: $vgpr2
138 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
139 ; GCN1-NEXT: s_cbranch_execz .LBB1_2
140 ; GCN1-NEXT: .LBB1_4: ; %atomicrmw.private
141 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
142 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
143 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
144 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
145 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
146 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
147 ; GCN1-NEXT: s_waitcnt vmcnt(0)
148 ; GCN1-NEXT: s_setpc_b64 s[30:31]
150 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset:
152 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
154 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
155 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
156 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
157 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
158 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
159 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
160 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
161 ; GCN2-NEXT: s_cbranch_execnz .LBB1_3
162 ; GCN2-NEXT: ; %bb.1: ; %Flow
163 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
164 ; GCN2-NEXT: s_cbranch_execnz .LBB1_4
165 ; GCN2-NEXT: .LBB1_2: ; %atomicrmw.phi
166 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
167 ; GCN2-NEXT: s_setpc_b64 s[30:31]
168 ; GCN2-NEXT: .LBB1_3: ; %atomicrmw.global
169 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
170 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
171 ; GCN2-NEXT: buffer_wbinvl1_vol
172 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
173 ; GCN2-NEXT: ; implicit-def: $vgpr2
174 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
175 ; GCN2-NEXT: s_cbranch_execz .LBB1_2
176 ; GCN2-NEXT: .LBB1_4: ; %atomicrmw.private
177 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
178 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
179 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
180 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
181 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
182 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
183 ; GCN2-NEXT: s_waitcnt vmcnt(0)
184 ; GCN2-NEXT: s_setpc_b64 s[30:31]
186 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset:
188 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
190 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
191 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
192 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
193 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
194 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
195 ; GCN3-NEXT: s_cbranch_execnz .LBB1_3
196 ; GCN3-NEXT: ; %bb.1: ; %Flow
197 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
198 ; GCN3-NEXT: s_cbranch_execnz .LBB1_4
199 ; GCN3-NEXT: .LBB1_2: ; %atomicrmw.phi
200 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
201 ; GCN3-NEXT: s_setpc_b64 s[30:31]
202 ; GCN3-NEXT: .LBB1_3: ; %atomicrmw.global
203 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
204 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
205 ; GCN3-NEXT: buffer_wbinvl1_vol
206 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
207 ; GCN3-NEXT: ; implicit-def: $vgpr3
208 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
209 ; GCN3-NEXT: s_cbranch_execz .LBB1_2
210 ; GCN3-NEXT: .LBB1_4: ; %atomicrmw.private
211 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
212 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
213 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
214 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
215 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
216 ; GCN3-NEXT: s_waitcnt vmcnt(0)
217 ; GCN3-NEXT: s_setpc_b64 s[30:31]
218 %gep = getelementptr i64, ptr %out, i64 4
219 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
; Flat i64 seq_cst exchange through a VGPR pointer where the old value is
; produced as %result (the function is declared to return i64).
; Because v[0:1] is the destination of the old value in the expected code, the
; incoming pointer is first copied to v[4:5]. The high dword (v5) then selects
;   %atomicrmw.global  - flat_atomic_swap_x2 with a destination and glc
;   %atomicrmw.private - two buffer_load_dword fetch the old value into v0/v1
;                        before two buffer_store_dword write the new one
; GCN1 and GCN2 materialize the address of the high dword with an explicit add;
; GCN3 uses the immediate offset:4 on the load and the store.
223 define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
224 ; GCN1-LABEL: flat_atomic_xchg_i64_ret:
226 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
228 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
229 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
230 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
231 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
232 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
233 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
234 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
235 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
236 ; GCN1-NEXT: s_cbranch_execnz .LBB2_3
237 ; GCN1-NEXT: ; %bb.1: ; %Flow
238 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
239 ; GCN1-NEXT: s_cbranch_execnz .LBB2_4
240 ; GCN1-NEXT: .LBB2_2: ; %atomicrmw.phi
241 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
242 ; GCN1-NEXT: s_setpc_b64 s[30:31]
243 ; GCN1-NEXT: .LBB2_3: ; %atomicrmw.global
244 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
245 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
246 ; GCN1-NEXT: buffer_wbinvl1_vol
247 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
248 ; GCN1-NEXT: ; implicit-def: $vgpr2
249 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
250 ; GCN1-NEXT: s_cbranch_execz .LBB2_2
251 ; GCN1-NEXT: .LBB2_4: ; %atomicrmw.private
252 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
253 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
254 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
255 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
256 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
257 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
258 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
259 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
260 ; GCN1-NEXT: s_waitcnt vmcnt(0)
261 ; GCN1-NEXT: s_setpc_b64 s[30:31]
263 ; GCN2-LABEL: flat_atomic_xchg_i64_ret:
265 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
267 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
268 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
269 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
270 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
271 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
272 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
273 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
274 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
275 ; GCN2-NEXT: s_cbranch_execnz .LBB2_3
276 ; GCN2-NEXT: ; %bb.1: ; %Flow
277 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
278 ; GCN2-NEXT: s_cbranch_execnz .LBB2_4
279 ; GCN2-NEXT: .LBB2_2: ; %atomicrmw.phi
280 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
281 ; GCN2-NEXT: s_setpc_b64 s[30:31]
282 ; GCN2-NEXT: .LBB2_3: ; %atomicrmw.global
283 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
284 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
285 ; GCN2-NEXT: buffer_wbinvl1_vol
286 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
287 ; GCN2-NEXT: ; implicit-def: $vgpr2
288 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
289 ; GCN2-NEXT: s_cbranch_execz .LBB2_2
290 ; GCN2-NEXT: .LBB2_4: ; %atomicrmw.private
291 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
292 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
293 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
294 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
295 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
296 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
297 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
298 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
299 ; GCN2-NEXT: s_waitcnt vmcnt(0)
300 ; GCN2-NEXT: s_setpc_b64 s[30:31]
302 ; GCN3-LABEL: flat_atomic_xchg_i64_ret:
304 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
306 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
307 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
308 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
309 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
310 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
311 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
312 ; GCN3-NEXT: s_cbranch_execnz .LBB2_3
313 ; GCN3-NEXT: ; %bb.1: ; %Flow
314 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
315 ; GCN3-NEXT: s_cbranch_execnz .LBB2_4
316 ; GCN3-NEXT: .LBB2_2: ; %atomicrmw.phi
317 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
318 ; GCN3-NEXT: s_setpc_b64 s[30:31]
319 ; GCN3-NEXT: .LBB2_3: ; %atomicrmw.global
320 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
321 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
322 ; GCN3-NEXT: buffer_wbinvl1_vol
323 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
324 ; GCN3-NEXT: ; implicit-def: $vgpr2
325 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
326 ; GCN3-NEXT: s_cbranch_execz .LBB2_2
327 ; GCN3-NEXT: .LBB2_4: ; %atomicrmw.private
328 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
329 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
330 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
331 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
333 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
334 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
335 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
336 ; GCN3-NEXT: s_waitcnt vmcnt(0)
337 ; GCN3-NEXT: s_setpc_b64 s[30:31]
338 %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; Returning i64 seq_cst exchange on the address %out + 4 i64 elements.
; The checks expect the 32-byte offset to be added while the pointer is moved
; from v[0:1] into v[4:5] (add/add-with-carry writing v4 and v5), leaving
; v[0:1] free for the old value. The high dword v5 then selects between
; %atomicrmw.global (flat_atomic_swap_x2 ... glc) and %atomicrmw.private
; (two buffer_load_dword followed by two buffer_store_dword).
342 define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
343 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset:
345 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
346 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
347 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
348 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
349 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
350 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
351 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
352 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
353 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
354 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
355 ; GCN1-NEXT: s_cbranch_execnz .LBB3_3
356 ; GCN1-NEXT: ; %bb.1: ; %Flow
357 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
358 ; GCN1-NEXT: s_cbranch_execnz .LBB3_4
359 ; GCN1-NEXT: .LBB3_2: ; %atomicrmw.phi
360 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
361 ; GCN1-NEXT: s_setpc_b64 s[30:31]
362 ; GCN1-NEXT: .LBB3_3: ; %atomicrmw.global
363 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
364 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
365 ; GCN1-NEXT: buffer_wbinvl1_vol
366 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
367 ; GCN1-NEXT: ; implicit-def: $vgpr2
368 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
369 ; GCN1-NEXT: s_cbranch_execz .LBB3_2
370 ; GCN1-NEXT: .LBB3_4: ; %atomicrmw.private
371 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
372 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
373 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
374 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
375 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
376 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
377 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
378 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
379 ; GCN1-NEXT: s_waitcnt vmcnt(0)
380 ; GCN1-NEXT: s_setpc_b64 s[30:31]
382 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset:
384 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
386 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
387 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
388 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
389 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
390 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
391 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
392 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
393 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
394 ; GCN2-NEXT: s_cbranch_execnz .LBB3_3
395 ; GCN2-NEXT: ; %bb.1: ; %Flow
396 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
397 ; GCN2-NEXT: s_cbranch_execnz .LBB3_4
398 ; GCN2-NEXT: .LBB3_2: ; %atomicrmw.phi
399 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
400 ; GCN2-NEXT: s_setpc_b64 s[30:31]
401 ; GCN2-NEXT: .LBB3_3: ; %atomicrmw.global
402 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
403 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
404 ; GCN2-NEXT: buffer_wbinvl1_vol
405 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
406 ; GCN2-NEXT: ; implicit-def: $vgpr2
407 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
408 ; GCN2-NEXT: s_cbranch_execz .LBB3_2
409 ; GCN2-NEXT: .LBB3_4: ; %atomicrmw.private
410 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
411 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
412 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
413 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
414 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
415 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
416 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
417 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
418 ; GCN2-NEXT: s_waitcnt vmcnt(0)
419 ; GCN2-NEXT: s_setpc_b64 s[30:31]
421 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset:
423 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
425 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
426 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
427 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
428 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
429 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
430 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
431 ; GCN3-NEXT: s_cbranch_execnz .LBB3_3
432 ; GCN3-NEXT: ; %bb.1: ; %Flow
433 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
434 ; GCN3-NEXT: s_cbranch_execnz .LBB3_4
435 ; GCN3-NEXT: .LBB3_2: ; %atomicrmw.phi
436 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
437 ; GCN3-NEXT: s_setpc_b64 s[30:31]
438 ; GCN3-NEXT: .LBB3_3: ; %atomicrmw.global
439 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
440 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
441 ; GCN3-NEXT: buffer_wbinvl1_vol
442 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
443 ; GCN3-NEXT: ; implicit-def: $vgpr2
444 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
445 ; GCN3-NEXT: s_cbranch_execz .LBB3_2
446 ; GCN3-NEXT: .LBB3_4: ; %atomicrmw.private
447 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
448 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
449 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
450 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
452 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
453 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
454 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
455 ; GCN3-NEXT: s_waitcnt vmcnt(0)
456 ; GCN3-NEXT: s_setpc_b64 s[30:31]
457 %gep = getelementptr i64, ptr %out, i64 4
458 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
; No-return i64 seq_cst exchange with the amdgpu_gfx calling convention and
; both arguments marked inreg. In the expected code the pointer is read from
; s[4:5] and the value from s[6:7], and the global/private decision is made with
; scalar instructions (s_cmp_eq_u32 on s5, then s_cbranch_vccnz / s_cbranch_vccz)
; rather than by masking exec.
;   %atomicrmw.global  - operands are copied into v[0:1] / v[2:3] and swapped
;                        with flat_atomic_swap_x2 (no glc)
;   %atomicrmw.private - a scalar select substitutes -1 for a null pointer, then
;                        two buffer_store_dword write the value
; GCN1 performs the 64-bit null test with v_cmp_ne_u64_e64 plus s_and_b64 with
; exec, whereas GCN2 and GCN3 use s_cmp_lg_u64.
462 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
463 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar:
465 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
467 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
468 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
469 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
470 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
471 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
472 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
473 ; GCN1-NEXT: s_cbranch_vccnz .LBB4_3
474 ; GCN1-NEXT: ; %bb.1: ; %Flow
475 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
476 ; GCN1-NEXT: s_cbranch_vccz .LBB4_4
477 ; GCN1-NEXT: .LBB4_2: ; %atomicrmw.phi
478 ; GCN1-NEXT: s_setpc_b64 s[30:31]
479 ; GCN1-NEXT: .LBB4_3: ; %atomicrmw.global
480 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
481 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
482 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
483 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
484 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
485 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
486 ; GCN1-NEXT: buffer_wbinvl1_vol
487 ; GCN1-NEXT: s_cbranch_execnz .LBB4_2
488 ; GCN1-NEXT: .LBB4_4: ; %atomicrmw.private
489 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
490 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
491 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
492 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
493 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
494 ; GCN1-NEXT: s_add_i32 s34, s34, 4
495 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
496 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
497 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
498 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
499 ; GCN1-NEXT: s_waitcnt vmcnt(0)
500 ; GCN1-NEXT: s_setpc_b64 s[30:31]
502 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar:
504 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
506 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
507 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
508 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
509 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
510 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
511 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
512 ; GCN2-NEXT: s_cbranch_vccnz .LBB4_3
513 ; GCN2-NEXT: ; %bb.1: ; %Flow
514 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
515 ; GCN2-NEXT: s_cbranch_vccz .LBB4_4
516 ; GCN2-NEXT: .LBB4_2: ; %atomicrmw.phi
517 ; GCN2-NEXT: s_setpc_b64 s[30:31]
518 ; GCN2-NEXT: .LBB4_3: ; %atomicrmw.global
519 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
520 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
521 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
522 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
523 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
524 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
525 ; GCN2-NEXT: buffer_wbinvl1_vol
526 ; GCN2-NEXT: s_cbranch_execnz .LBB4_2
527 ; GCN2-NEXT: .LBB4_4: ; %atomicrmw.private
528 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
529 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
530 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
531 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
532 ; GCN2-NEXT: s_add_i32 s34, s34, 4
533 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
534 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
535 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
536 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
537 ; GCN2-NEXT: s_waitcnt vmcnt(0)
538 ; GCN2-NEXT: s_setpc_b64 s[30:31]
540 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar:
542 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
544 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
545 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
546 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
547 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
548 ; GCN3-NEXT: s_cbranch_vccnz .LBB4_3
549 ; GCN3-NEXT: ; %bb.1: ; %Flow
550 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
551 ; GCN3-NEXT: s_cbranch_vccz .LBB4_4
552 ; GCN3-NEXT: .LBB4_2: ; %atomicrmw.phi
553 ; GCN3-NEXT: s_setpc_b64 s[30:31]
554 ; GCN3-NEXT: .LBB4_3: ; %atomicrmw.global
555 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
556 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
557 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
558 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
559 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
560 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
561 ; GCN3-NEXT: buffer_wbinvl1_vol
562 ; GCN3-NEXT: s_cbranch_execnz .LBB4_2
563 ; GCN3-NEXT: .LBB4_4: ; %atomicrmw.private
564 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
565 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
566 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
567 ; GCN3-NEXT: v_mov_b32_e32 v1, s34
568 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
569 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
570 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
571 ; GCN3-NEXT: s_waitcnt vmcnt(0)
572 ; GCN3-NEXT: s_setpc_b64 s[30:31]
573 %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; No-return i64 seq_cst exchange on %out + 4 i64 elements, with the amdgpu_gfx
; calling convention and inreg arguments. The checks expect the 32-byte offset to
; be applied with scalar arithmetic (s_add_u32 / s_addc_u32 into s[34:35]), and
; the adjusted pointer's high dword s35 to drive the uniform branch between
; %atomicrmw.global (copy to VGPRs, flat_atomic_swap_x2 without glc) and
; %atomicrmw.private (null pointer replaced by -1, two buffer_store_dword).
; The comparison result lives in s[36:37] here because s[34:35] holds the
; adjusted pointer.
577 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
578 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
580 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
582 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
583 ; GCN1-NEXT: s_add_u32 s34, s4, 32
584 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
585 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
586 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
587 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
588 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
589 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
590 ; GCN1-NEXT: s_cbranch_vccnz .LBB5_3
591 ; GCN1-NEXT: ; %bb.1: ; %Flow
592 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
593 ; GCN1-NEXT: s_cbranch_vccz .LBB5_4
594 ; GCN1-NEXT: .LBB5_2: ; %atomicrmw.phi
595 ; GCN1-NEXT: s_setpc_b64 s[30:31]
596 ; GCN1-NEXT: .LBB5_3: ; %atomicrmw.global
597 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
598 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
599 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
600 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
601 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
602 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
603 ; GCN1-NEXT: buffer_wbinvl1_vol
604 ; GCN1-NEXT: s_cbranch_execnz .LBB5_2
605 ; GCN1-NEXT: .LBB5_4: ; %atomicrmw.private
606 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
607 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
608 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
609 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
610 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
611 ; GCN1-NEXT: s_add_i32 s34, s34, 4
612 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
613 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
614 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
615 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
616 ; GCN1-NEXT: s_waitcnt vmcnt(0)
617 ; GCN1-NEXT: s_setpc_b64 s[30:31]
619 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
621 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
623 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
624 ; GCN2-NEXT: s_add_u32 s34, s4, 32
625 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
626 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
627 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
628 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
629 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
630 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
631 ; GCN2-NEXT: s_cbranch_vccnz .LBB5_3
632 ; GCN2-NEXT: ; %bb.1: ; %Flow
633 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
634 ; GCN2-NEXT: s_cbranch_vccz .LBB5_4
635 ; GCN2-NEXT: .LBB5_2: ; %atomicrmw.phi
636 ; GCN2-NEXT: s_setpc_b64 s[30:31]
637 ; GCN2-NEXT: .LBB5_3: ; %atomicrmw.global
638 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
639 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
640 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
641 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
642 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
643 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
644 ; GCN2-NEXT: buffer_wbinvl1_vol
645 ; GCN2-NEXT: s_cbranch_execnz .LBB5_2
646 ; GCN2-NEXT: .LBB5_4: ; %atomicrmw.private
647 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
648 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
649 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
650 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
651 ; GCN2-NEXT: s_add_i32 s34, s34, 4
652 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
653 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
654 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
655 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
656 ; GCN2-NEXT: s_waitcnt vmcnt(0)
657 ; GCN2-NEXT: s_setpc_b64 s[30:31]
659 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
661 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662 ; GCN3-NEXT: s_add_u32 s34, s4, 32
663 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
664 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
665 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
666 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
667 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
668 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
669 ; GCN3-NEXT: s_cbranch_vccnz .LBB5_3
670 ; GCN3-NEXT: ; %bb.1: ; %Flow
671 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
672 ; GCN3-NEXT: s_cbranch_vccz .LBB5_4
673 ; GCN3-NEXT: .LBB5_2: ; %atomicrmw.phi
674 ; GCN3-NEXT: s_setpc_b64 s[30:31]
675 ; GCN3-NEXT: .LBB5_3: ; %atomicrmw.global
676 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
677 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
678 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
679 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
680 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
681 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
682 ; GCN3-NEXT: buffer_wbinvl1_vol
683 ; GCN3-NEXT: s_cbranch_execnz .LBB5_2
684 ; GCN3-NEXT: .LBB5_4: ; %atomicrmw.private
685 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
686 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
687 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
688 ; GCN3-NEXT: v_mov_b32_e32 v1, s34
689 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
690 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
691 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
692 ; GCN3-NEXT: s_waitcnt vmcnt(0)
693 ; GCN3-NEXT: s_setpc_b64 s[30:31]
694 %gep = getelementptr i64, ptr %out, i64 4
695 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
699 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
700 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar:
702 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
704 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
705 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
706 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
707 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
708 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
709 ; GCN1-NEXT: s_cbranch_vccz .LBB6_2
710 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
711 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
712 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
713 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
714 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
715 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
716 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
717 ; GCN1-NEXT: buffer_wbinvl1_vol
718 ; GCN1-NEXT: s_cbranch_execz .LBB6_3
719 ; GCN1-NEXT: s_branch .LBB6_4
720 ; GCN1-NEXT: .LBB6_2:
721 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
722 ; GCN1-NEXT: .LBB6_3: ; %atomicrmw.private
723 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
724 ; GCN1-NEXT: v_mov_b32_e32 v4, s6
725 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
726 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
727 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
728 ; GCN1-NEXT: s_add_i32 s34, s34, 4
729 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
730 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
731 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
732 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
733 ; GCN1-NEXT: v_mov_b32_e32 v2, s7
734 ; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
735 ; GCN1-NEXT: .LBB6_4: ; %atomicrmw.end
736 ; GCN1-NEXT: s_waitcnt vmcnt(0)
737 ; GCN1-NEXT: s_setpc_b64 s[30:31]
739 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar:
741 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
743 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
744 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
745 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
746 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
747 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
748 ; GCN2-NEXT: s_cbranch_vccz .LBB6_2
749 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
750 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
751 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
752 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
753 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
754 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
755 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
756 ; GCN2-NEXT: buffer_wbinvl1_vol
757 ; GCN2-NEXT: s_cbranch_execz .LBB6_3
758 ; GCN2-NEXT: s_branch .LBB6_4
759 ; GCN2-NEXT: .LBB6_2:
760 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
761 ; GCN2-NEXT: .LBB6_3: ; %atomicrmw.private
762 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
763 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
764 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
765 ; GCN2-NEXT: s_add_i32 s34, s34, 4
766 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
767 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
768 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
769 ; GCN2-NEXT: v_mov_b32_e32 v4, s6
770 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
771 ; GCN2-NEXT: v_mov_b32_e32 v2, s7
772 ; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
773 ; GCN2-NEXT: .LBB6_4: ; %atomicrmw.end
774 ; GCN2-NEXT: s_waitcnt vmcnt(0)
775 ; GCN2-NEXT: s_setpc_b64 s[30:31]
777 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar:
779 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
780 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
781 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
782 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
783 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
784 ; GCN3-NEXT: s_cbranch_vccz .LBB6_2
785 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
786 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
787 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
788 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
789 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
790 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
791 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
792 ; GCN3-NEXT: buffer_wbinvl1_vol
793 ; GCN3-NEXT: s_cbranch_execz .LBB6_3
794 ; GCN3-NEXT: s_branch .LBB6_4
795 ; GCN3-NEXT: .LBB6_2:
796 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
797 ; GCN3-NEXT: .LBB6_3: ; %atomicrmw.private
798 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
799 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
800 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
801 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
802 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
803 ; GCN3-NEXT: v_mov_b32_e32 v3, s6
804 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
805 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
806 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
807 ; GCN3-NEXT: .LBB6_4: ; %atomicrmw.end
808 ; GCN3-NEXT: s_waitcnt vmcnt(0)
809 ; GCN3-NEXT: s_setpc_b64 s[30:31]
810 %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
; seq_cst i64 xchg whose result is captured in %result, with the pointer and
; value passed inreg (amdgpu_gfx) and the access placed 4 x i64 past %out.
; The checks below expect a uniform branch between the %atomicrmw.global block
; (flat_atomic_swap_x2 ... glc) and the %atomicrmw.private block (dword
; buffer loads followed by dword buffer stores).
; The assertions are autogenerated; regenerate them instead of editing by hand.
814 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
815 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
817 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
818 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
819 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
820 ; GCN1-NEXT: s_add_u32 s34, s4, 32
821 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
822 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
823 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
824 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
825 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
826 ; GCN1-NEXT: s_cbranch_vccz .LBB7_2
827 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
828 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
829 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
830 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
831 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
832 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
833 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
834 ; GCN1-NEXT: buffer_wbinvl1_vol
835 ; GCN1-NEXT: s_cbranch_execz .LBB7_3
836 ; GCN1-NEXT: s_branch .LBB7_4
837 ; GCN1-NEXT: .LBB7_2:
838 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
839 ; GCN1-NEXT: .LBB7_3: ; %atomicrmw.private
840 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
841 ; GCN1-NEXT: v_mov_b32_e32 v4, s6
842 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
843 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
844 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
845 ; GCN1-NEXT: s_add_i32 s34, s34, 4
846 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
847 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
848 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
849 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
850 ; GCN1-NEXT: v_mov_b32_e32 v2, s7
851 ; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
852 ; GCN1-NEXT: .LBB7_4: ; %atomicrmw.end
853 ; GCN1-NEXT: s_waitcnt vmcnt(0)
854 ; GCN1-NEXT: s_setpc_b64 s[30:31]
856 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
858 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
860 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
861 ; GCN2-NEXT: s_add_u32 s34, s4, 32
862 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
863 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
864 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
865 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
866 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
867 ; GCN2-NEXT: s_cbranch_vccz .LBB7_2
868 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
869 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
870 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
871 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
872 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
873 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
874 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
875 ; GCN2-NEXT: buffer_wbinvl1_vol
876 ; GCN2-NEXT: s_cbranch_execz .LBB7_3
877 ; GCN2-NEXT: s_branch .LBB7_4
878 ; GCN2-NEXT: .LBB7_2:
879 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
880 ; GCN2-NEXT: .LBB7_3: ; %atomicrmw.private
881 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
882 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
883 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
884 ; GCN2-NEXT: s_add_i32 s34, s34, 4
885 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
886 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
887 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
888 ; GCN2-NEXT: v_mov_b32_e32 v4, s6
889 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
890 ; GCN2-NEXT: v_mov_b32_e32 v2, s7
891 ; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
892 ; GCN2-NEXT: .LBB7_4: ; %atomicrmw.end
893 ; GCN2-NEXT: s_waitcnt vmcnt(0)
894 ; GCN2-NEXT: s_setpc_b64 s[30:31]
896 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
898 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899 ; GCN3-NEXT: s_add_u32 s34, s4, 32
900 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
901 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
902 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
903 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
904 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
905 ; GCN3-NEXT: s_cbranch_vccz .LBB7_2
906 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
907 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
908 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
909 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
910 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
911 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
912 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
913 ; GCN3-NEXT: buffer_wbinvl1_vol
914 ; GCN3-NEXT: s_cbranch_execz .LBB7_3
915 ; GCN3-NEXT: s_branch .LBB7_4
916 ; GCN3-NEXT: .LBB7_2:
917 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
918 ; GCN3-NEXT: .LBB7_3: ; %atomicrmw.private
919 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
920 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
921 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
922 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
923 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
924 ; GCN3-NEXT: v_mov_b32_e32 v3, s6
925 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
926 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
927 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
928 ; GCN3-NEXT: .LBB7_4: ; %atomicrmw.end
929 ; GCN3-NEXT: s_waitcnt vmcnt(0)
930 ; GCN3-NEXT: s_setpc_b64 s[30:31]
931 %gep = getelementptr i64, ptr %out, i64 4
932 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
; seq_cst i64 xchg in a void function, on the element 4 x i64 past %out, with
; !amdgpu.no.remote.memory metadata attached to the atomicrmw.
; The checks below expect an exec-masked split between the %atomicrmw.global
; block (flat_atomic_swap_x2 without glc) and the %atomicrmw.private block
; (two dword buffer stores, no loads).
; The assertions are autogenerated; regenerate them instead of editing by hand.
936 define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
937 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
939 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
940 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
941 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
942 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
943 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
944 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
945 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
946 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
947 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
948 ; GCN1-NEXT: s_cbranch_execnz .LBB8_3
949 ; GCN1-NEXT: ; %bb.1: ; %Flow
950 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
951 ; GCN1-NEXT: s_cbranch_execnz .LBB8_4
952 ; GCN1-NEXT: .LBB8_2: ; %atomicrmw.phi
953 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
954 ; GCN1-NEXT: s_setpc_b64 s[30:31]
955 ; GCN1-NEXT: .LBB8_3: ; %atomicrmw.global
956 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
957 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
958 ; GCN1-NEXT: buffer_wbinvl1_vol
959 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
960 ; GCN1-NEXT: ; implicit-def: $vgpr2
961 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
962 ; GCN1-NEXT: s_cbranch_execz .LBB8_2
963 ; GCN1-NEXT: .LBB8_4: ; %atomicrmw.private
964 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
965 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
966 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
967 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
968 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
969 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
970 ; GCN1-NEXT: s_waitcnt vmcnt(0)
971 ; GCN1-NEXT: s_setpc_b64 s[30:31]
973 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
975 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
976 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
977 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
978 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
979 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
980 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
981 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
982 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
983 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
984 ; GCN2-NEXT: s_cbranch_execnz .LBB8_3
985 ; GCN2-NEXT: ; %bb.1: ; %Flow
986 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
987 ; GCN2-NEXT: s_cbranch_execnz .LBB8_4
988 ; GCN2-NEXT: .LBB8_2: ; %atomicrmw.phi
989 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
990 ; GCN2-NEXT: s_setpc_b64 s[30:31]
991 ; GCN2-NEXT: .LBB8_3: ; %atomicrmw.global
992 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
993 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
994 ; GCN2-NEXT: buffer_wbinvl1_vol
995 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
996 ; GCN2-NEXT: ; implicit-def: $vgpr2
997 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
998 ; GCN2-NEXT: s_cbranch_execz .LBB8_2
999 ; GCN2-NEXT: .LBB8_4: ; %atomicrmw.private
1000 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1001 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1002 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1003 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
1004 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
1005 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1006 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1007 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1009 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
1011 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
1013 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1014 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1015 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
1016 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1017 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1018 ; GCN3-NEXT: s_cbranch_execnz .LBB8_3
1019 ; GCN3-NEXT: ; %bb.1: ; %Flow
1020 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1021 ; GCN3-NEXT: s_cbranch_execnz .LBB8_4
1022 ; GCN3-NEXT: .LBB8_2: ; %atomicrmw.phi
1023 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1024 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1025 ; GCN3-NEXT: .LBB8_3: ; %atomicrmw.global
1026 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1027 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1028 ; GCN3-NEXT: buffer_wbinvl1_vol
1029 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1030 ; GCN3-NEXT: ; implicit-def: $vgpr3
1031 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1032 ; GCN3-NEXT: s_cbranch_execz .LBB8_2
1033 ; GCN3-NEXT: .LBB8_4: ; %atomicrmw.private
1034 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1035 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1036 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
1037 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1038 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1039 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1040 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1041 %gep = getelementptr i64, ptr %out, i64 4
1042 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; seq_cst i64 xchg whose result is captured in %result, on the element 4 x i64
; past %out, with !amdgpu.no.remote.memory metadata attached to the atomicrmw.
; The checks below expect an exec-masked split between the %atomicrmw.global
; block (flat_atomic_swap_x2 ... glc) and the %atomicrmw.private block (dword
; buffer loads of the old value followed by dword buffer stores of %in).
; The assertions are autogenerated; regenerate them instead of editing by hand.
1046 define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
1047 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
1049 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1050 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
1051 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
1052 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
1053 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1054 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1055 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1056 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1057 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
1058 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1059 ; GCN1-NEXT: s_cbranch_execnz .LBB9_3
1060 ; GCN1-NEXT: ; %bb.1: ; %Flow
1061 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1062 ; GCN1-NEXT: s_cbranch_execnz .LBB9_4
1063 ; GCN1-NEXT: .LBB9_2: ; %atomicrmw.phi
1064 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1065 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1066 ; GCN1-NEXT: .LBB9_3: ; %atomicrmw.global
1067 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1068 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1069 ; GCN1-NEXT: buffer_wbinvl1_vol
1070 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
1071 ; GCN1-NEXT: ; implicit-def: $vgpr2
1072 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1073 ; GCN1-NEXT: s_cbranch_execz .LBB9_2
1074 ; GCN1-NEXT: .LBB9_4: ; %atomicrmw.private
1075 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1076 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1077 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
1078 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1079 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1080 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1081 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1082 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1083 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1084 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1086 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
1088 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1089 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
1090 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
1091 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
1092 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1093 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1094 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1095 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1096 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
1097 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1098 ; GCN2-NEXT: s_cbranch_execnz .LBB9_3
1099 ; GCN2-NEXT: ; %bb.1: ; %Flow
1100 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1101 ; GCN2-NEXT: s_cbranch_execnz .LBB9_4
1102 ; GCN2-NEXT: .LBB9_2: ; %atomicrmw.phi
1103 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1104 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1105 ; GCN2-NEXT: .LBB9_3: ; %atomicrmw.global
1106 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1107 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1108 ; GCN2-NEXT: buffer_wbinvl1_vol
1109 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
1110 ; GCN2-NEXT: ; implicit-def: $vgpr2
1111 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1112 ; GCN2-NEXT: s_cbranch_execz .LBB9_2
1113 ; GCN2-NEXT: .LBB9_4: ; %atomicrmw.private
1114 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1115 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1116 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
1117 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1118 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1119 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1120 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1121 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1122 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1123 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1125 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
1127 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1128 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
1129 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
1130 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1131 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
1132 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1133 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1134 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1135 ; GCN3-NEXT: s_cbranch_execnz .LBB9_3
1136 ; GCN3-NEXT: ; %bb.1: ; %Flow
1137 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1138 ; GCN3-NEXT: s_cbranch_execnz .LBB9_4
1139 ; GCN3-NEXT: .LBB9_2: ; %atomicrmw.phi
1140 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1141 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1142 ; GCN3-NEXT: .LBB9_3: ; %atomicrmw.global
1143 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1144 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1145 ; GCN3-NEXT: buffer_wbinvl1_vol
1146 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
1147 ; GCN3-NEXT: ; implicit-def: $vgpr2
1148 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1149 ; GCN3-NEXT: s_cbranch_execz .LBB9_2
1150 ; GCN3-NEXT: .LBB9_4: ; %atomicrmw.private
1151 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1152 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1153 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1154 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
1155 ; GCN3-NEXT: s_nop 0
1156 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1157 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
1158 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1159 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1160 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1161 %gep = getelementptr i64, ptr %out, i64 4
1162 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
1166 ; ---------------------------------------------------------------------
1167 ; atomicrmw xchg f64
1168 ; ---------------------------------------------------------------------
; seq_cst double xchg directly on %ptr in a void function.
; The checks below expect an exec-masked split between the %atomicrmw.global
; block (flat_atomic_swap_x2 without glc) and the %atomicrmw.private block
; (two dword buffer stores, no loads).
; The assertions are autogenerated; regenerate them instead of editing by hand.
1170 define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
1171 ; GCN1-LABEL: flat_atomic_xchg_f64_noret:
1173 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1174 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
1175 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
1176 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1177 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1178 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
1179 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1180 ; GCN1-NEXT: s_cbranch_execnz .LBB10_3
1181 ; GCN1-NEXT: ; %bb.1: ; %Flow
1182 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1183 ; GCN1-NEXT: s_cbranch_execnz .LBB10_4
1184 ; GCN1-NEXT: .LBB10_2: ; %atomicrmw.phi
1185 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1186 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1187 ; GCN1-NEXT: .LBB10_3: ; %atomicrmw.global
1188 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1189 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1190 ; GCN1-NEXT: buffer_wbinvl1_vol
1191 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1192 ; GCN1-NEXT: ; implicit-def: $vgpr2
1193 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1194 ; GCN1-NEXT: s_cbranch_execz .LBB10_2
1195 ; GCN1-NEXT: .LBB10_4: ; %atomicrmw.private
1196 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1197 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1198 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1199 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
1200 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
1201 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1202 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1203 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1205 ; GCN2-LABEL: flat_atomic_xchg_f64_noret:
1207 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1208 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
1209 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
1210 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1212 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
1213 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1214 ; GCN2-NEXT: s_cbranch_execnz .LBB10_3
1215 ; GCN2-NEXT: ; %bb.1: ; %Flow
1216 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1217 ; GCN2-NEXT: s_cbranch_execnz .LBB10_4
1218 ; GCN2-NEXT: .LBB10_2: ; %atomicrmw.phi
1219 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1220 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1221 ; GCN2-NEXT: .LBB10_3: ; %atomicrmw.global
1222 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1223 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1224 ; GCN2-NEXT: buffer_wbinvl1_vol
1225 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1226 ; GCN2-NEXT: ; implicit-def: $vgpr2
1227 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1228 ; GCN2-NEXT: s_cbranch_execz .LBB10_2
1229 ; GCN2-NEXT: .LBB10_4: ; %atomicrmw.private
1230 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1231 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1232 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1233 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
1234 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
1235 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1236 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1237 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1239 ; GCN3-LABEL: flat_atomic_xchg_f64_noret:
1241 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1242 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1243 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
1244 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1245 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1246 ; GCN3-NEXT: s_cbranch_execnz .LBB10_3
1247 ; GCN3-NEXT: ; %bb.1: ; %Flow
1248 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1249 ; GCN3-NEXT: s_cbranch_execnz .LBB10_4
1250 ; GCN3-NEXT: .LBB10_2: ; %atomicrmw.phi
1251 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1252 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1253 ; GCN3-NEXT: .LBB10_3: ; %atomicrmw.global
1254 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1255 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1256 ; GCN3-NEXT: buffer_wbinvl1_vol
1257 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1258 ; GCN3-NEXT: ; implicit-def: $vgpr3
1259 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1260 ; GCN3-NEXT: s_cbranch_execz .LBB10_2
1261 ; GCN3-NEXT: .LBB10_4: ; %atomicrmw.private
1262 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1263 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1264 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
1265 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1266 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1267 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1268 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1269 %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
; seq_cst double xchg in a void function, on the element 4 x double past %out
; (the checks add 32 to the address).
; The checks below expect an exec-masked split between the %atomicrmw.global
; block (flat_atomic_swap_x2 without glc) and the %atomicrmw.private block
; (two dword buffer stores, no loads).
; The assertions are autogenerated; regenerate them instead of editing by hand.
1273 define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
1274 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset:
1276 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1277 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
1278 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
1279 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
1280 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1281 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1282 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1283 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
1284 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1285 ; GCN1-NEXT: s_cbranch_execnz .LBB11_3
1286 ; GCN1-NEXT: ; %bb.1: ; %Flow
1287 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1288 ; GCN1-NEXT: s_cbranch_execnz .LBB11_4
1289 ; GCN1-NEXT: .LBB11_2: ; %atomicrmw.phi
1290 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1291 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1292 ; GCN1-NEXT: .LBB11_3: ; %atomicrmw.global
1293 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1294 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1295 ; GCN1-NEXT: buffer_wbinvl1_vol
1296 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1297 ; GCN1-NEXT: ; implicit-def: $vgpr2
1298 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1299 ; GCN1-NEXT: s_cbranch_execz .LBB11_2
1300 ; GCN1-NEXT: .LBB11_4: ; %atomicrmw.private
1301 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1302 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1303 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1304 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
1305 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
1306 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1307 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1308 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1310 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset:
1312 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1313 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
1314 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
1315 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1316 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1317 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1319 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
1320 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1321 ; GCN2-NEXT: s_cbranch_execnz .LBB11_3
1322 ; GCN2-NEXT: ; %bb.1: ; %Flow
1323 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1324 ; GCN2-NEXT: s_cbranch_execnz .LBB11_4
1325 ; GCN2-NEXT: .LBB11_2: ; %atomicrmw.phi
1326 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1327 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1328 ; GCN2-NEXT: .LBB11_3: ; %atomicrmw.global
1329 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1330 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1331 ; GCN2-NEXT: buffer_wbinvl1_vol
1332 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1333 ; GCN2-NEXT: ; implicit-def: $vgpr2
1334 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1335 ; GCN2-NEXT: s_cbranch_execz .LBB11_2
1336 ; GCN2-NEXT: .LBB11_4: ; %atomicrmw.private
1337 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1338 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1339 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1340 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
1341 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
1342 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1343 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1344 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1346 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset:
1348 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
1350 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1351 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1352 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
1353 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1354 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1355 ; GCN3-NEXT: s_cbranch_execnz .LBB11_3
1356 ; GCN3-NEXT: ; %bb.1: ; %Flow
1357 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1358 ; GCN3-NEXT: s_cbranch_execnz .LBB11_4
1359 ; GCN3-NEXT: .LBB11_2: ; %atomicrmw.phi
1360 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1361 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1362 ; GCN3-NEXT: .LBB11_3: ; %atomicrmw.global
1363 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1364 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1365 ; GCN3-NEXT: buffer_wbinvl1_vol
1366 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1367 ; GCN3-NEXT: ; implicit-def: $vgpr3
1368 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1369 ; GCN3-NEXT: s_cbranch_execz .LBB11_2
1370 ; GCN3-NEXT: .LBB11_4: ; %atomicrmw.private
1371 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
1372 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
1373 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
1374 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
1375 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1376 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1377 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1378 %gep = getelementptr double, ptr %out, i32 4
1379 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
; seq_cst double xchg directly on %ptr, with the result captured in %result.
; The checks below expect an exec-masked split between the %atomicrmw.global
; block (flat_atomic_swap_x2 ... glc) and the %atomicrmw.private block (dword
; buffer loads of the old value followed by dword buffer stores of %in).
; The assertions are autogenerated; regenerate them instead of editing by hand.
1383 define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
1384 ; GCN1-LABEL: flat_atomic_xchg_f64_ret:
1386 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1387 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
1388 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
1389 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
1390 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
1391 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1392 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1393 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1394 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
1395 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1396 ; GCN1-NEXT: s_cbranch_execnz .LBB12_3
1397 ; GCN1-NEXT: ; %bb.1: ; %Flow
1398 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1399 ; GCN1-NEXT: s_cbranch_execnz .LBB12_4
1400 ; GCN1-NEXT: .LBB12_2: ; %atomicrmw.phi
1401 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1402 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1403 ; GCN1-NEXT: .LBB12_3: ; %atomicrmw.global
1404 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1405 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1406 ; GCN1-NEXT: buffer_wbinvl1_vol
1407 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
1408 ; GCN1-NEXT: ; implicit-def: $vgpr2
1409 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1410 ; GCN1-NEXT: s_cbranch_execz .LBB12_2
1411 ; GCN1-NEXT: .LBB12_4: ; %atomicrmw.private
1412 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1413 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1414 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
1415 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1416 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1417 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1418 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1419 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1420 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1421 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1423 ; GCN2-LABEL: flat_atomic_xchg_f64_ret:
1425 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1426 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
1427 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
1428 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
1429 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
1430 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1431 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1432 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1433 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
1434 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1435 ; GCN2-NEXT: s_cbranch_execnz .LBB12_3
1436 ; GCN2-NEXT: ; %bb.1: ; %Flow
1437 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1438 ; GCN2-NEXT: s_cbranch_execnz .LBB12_4
1439 ; GCN2-NEXT: .LBB12_2: ; %atomicrmw.phi
1440 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1441 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1442 ; GCN2-NEXT: .LBB12_3: ; %atomicrmw.global
1443 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1444 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1445 ; GCN2-NEXT: buffer_wbinvl1_vol
1446 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
1447 ; GCN2-NEXT: ; implicit-def: $vgpr2
1448 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1449 ; GCN2-NEXT: s_cbranch_execz .LBB12_2
1450 ; GCN2-NEXT: .LBB12_4: ; %atomicrmw.private
1451 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1452 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1453 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
1454 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1455 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1456 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1457 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1458 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1459 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1460 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1462 ; GCN3-LABEL: flat_atomic_xchg_f64_ret:
1464 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1465 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
1466 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1467 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
1468 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
1469 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1470 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1471 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1472 ; GCN3-NEXT: s_cbranch_execnz .LBB12_3
1473 ; GCN3-NEXT: ; %bb.1: ; %Flow
1474 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1475 ; GCN3-NEXT: s_cbranch_execnz .LBB12_4
1476 ; GCN3-NEXT: .LBB12_2: ; %atomicrmw.phi
1477 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1478 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1479 ; GCN3-NEXT: .LBB12_3: ; %atomicrmw.global
1480 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1481 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1482 ; GCN3-NEXT: buffer_wbinvl1_vol
1483 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
1484 ; GCN3-NEXT: ; implicit-def: $vgpr2
1485 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1486 ; GCN3-NEXT: s_cbranch_execz .LBB12_2
1487 ; GCN3-NEXT: .LBB12_4: ; %atomicrmw.private
1488 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1489 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1490 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1491 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
1492 ; GCN3-NEXT: s_nop 0
1493 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1494 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
1495 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1496 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1497 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1498 %result = atomicrmw xchg ptr %ptr, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer at
; element offset 4 from %out (the expected code adds 32 to the address), in the
; variant whose function type is double.
; The expected code compares the high address dword with src_private_base
; (gfx900 run) or with a dword loaded from 0xe4 (bonaire/tonga runs), then takes
; either the %atomicrmw.global block (flat_atomic_swap_x2 with glc) or the
; %atomicrmw.private block (buffer loads into v0/v1, then buffer stores of v2/v3).
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1502 define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
1503 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset:
1505 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1506 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
1507 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
1508 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
1509 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1510 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1511 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1512 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1513 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
1514 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1515 ; GCN1-NEXT: s_cbranch_execnz .LBB13_3
1516 ; GCN1-NEXT: ; %bb.1: ; %Flow
1517 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1518 ; GCN1-NEXT: s_cbranch_execnz .LBB13_4
1519 ; GCN1-NEXT: .LBB13_2: ; %atomicrmw.phi
1520 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1521 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1522 ; GCN1-NEXT: .LBB13_3: ; %atomicrmw.global
1523 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1524 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1525 ; GCN1-NEXT: buffer_wbinvl1_vol
1526 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
1527 ; GCN1-NEXT: ; implicit-def: $vgpr2
1528 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1529 ; GCN1-NEXT: s_cbranch_execz .LBB13_2
1530 ; GCN1-NEXT: .LBB13_4: ; %atomicrmw.private
1531 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1532 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1533 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
1534 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1535 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1536 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1537 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1538 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
1539 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1540 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1542 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset:
1544 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1545 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
1546 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
1547 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
1548 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1549 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1550 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
1551 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1552 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
1553 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1554 ; GCN2-NEXT: s_cbranch_execnz .LBB13_3
1555 ; GCN2-NEXT: ; %bb.1: ; %Flow
1556 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1557 ; GCN2-NEXT: s_cbranch_execnz .LBB13_4
1558 ; GCN2-NEXT: .LBB13_2: ; %atomicrmw.phi
1559 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1560 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1561 ; GCN2-NEXT: .LBB13_3: ; %atomicrmw.global
1562 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1563 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1564 ; GCN2-NEXT: buffer_wbinvl1_vol
1565 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
1566 ; GCN2-NEXT: ; implicit-def: $vgpr2
1567 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1568 ; GCN2-NEXT: s_cbranch_execz .LBB13_2
1569 ; GCN2-NEXT: .LBB13_4: ; %atomicrmw.private
1570 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1571 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1572 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
1573 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1574 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
1575 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1576 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
1577 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
1578 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1579 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1581 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset:
1583 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1584 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
1585 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
1586 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
1587 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
1588 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1589 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
1590 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
1591 ; GCN3-NEXT: s_cbranch_execnz .LBB13_3
1592 ; GCN3-NEXT: ; %bb.1: ; %Flow
1593 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1594 ; GCN3-NEXT: s_cbranch_execnz .LBB13_4
1595 ; GCN3-NEXT: .LBB13_2: ; %atomicrmw.phi
1596 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1597 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1598 ; GCN3-NEXT: .LBB13_3: ; %atomicrmw.global
1599 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
1600 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1601 ; GCN3-NEXT: buffer_wbinvl1_vol
1602 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
1603 ; GCN3-NEXT: ; implicit-def: $vgpr2
1604 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
1605 ; GCN3-NEXT: s_cbranch_execz .LBB13_2
1606 ; GCN3-NEXT: .LBB13_4: ; %atomicrmw.private
1607 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
1608 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
1609 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
1610 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
1611 ; GCN3-NEXT: s_nop 0
1612 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
1613 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
1614 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
1615 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1616 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1617 %gep = getelementptr double, ptr %out, i32 4
1618 %result = atomicrmw xchg ptr %gep, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer,
; void amdgpu_gfx variant with both arguments marked inreg. The expected code
; reads the pointer from s[4:5] and the value from s[6:7].
; The address-space test is done with scalar compares and s_cbranch_vcc*
; branches: s5 is compared with src_private_base (gfx900 run) or with a dword
; loaded from 0xe4 (bonaire/tonga runs). The %atomicrmw.global block issues
; flat_atomic_swap_x2 without glc; the %atomicrmw.private block only stores the
; two dwords of %in with buffer_store_dword (no loads are expected).
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1622 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) {
1623 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar:
1625 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1626 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
1627 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
1628 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1629 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
1630 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
1631 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1632 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
1633 ; GCN1-NEXT: s_cbranch_vccnz .LBB14_3
1634 ; GCN1-NEXT: ; %bb.1: ; %Flow
1635 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1636 ; GCN1-NEXT: s_cbranch_vccz .LBB14_4
1637 ; GCN1-NEXT: .LBB14_2: ; %atomicrmw.phi
1638 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1639 ; GCN1-NEXT: .LBB14_3: ; %atomicrmw.global
1640 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1641 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1642 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1643 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
1644 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1645 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1646 ; GCN1-NEXT: buffer_wbinvl1_vol
1647 ; GCN1-NEXT: s_cbranch_execnz .LBB14_2
1648 ; GCN1-NEXT: .LBB14_4: ; %atomicrmw.private
1649 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
1650 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1651 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
1652 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
1653 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
1654 ; GCN1-NEXT: s_add_i32 s34, s34, 4
1655 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1656 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
1657 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
1658 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1659 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1660 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1662 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar:
1664 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
1666 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
1667 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1668 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
1669 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
1670 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1671 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
1672 ; GCN2-NEXT: s_cbranch_vccnz .LBB14_3
1673 ; GCN2-NEXT: ; %bb.1: ; %Flow
1674 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1675 ; GCN2-NEXT: s_cbranch_vccz .LBB14_4
1676 ; GCN2-NEXT: .LBB14_2: ; %atomicrmw.phi
1677 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1678 ; GCN2-NEXT: .LBB14_3: ; %atomicrmw.global
1679 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1680 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1681 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1682 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
1683 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1684 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1685 ; GCN2-NEXT: buffer_wbinvl1_vol
1686 ; GCN2-NEXT: s_cbranch_execnz .LBB14_2
1687 ; GCN2-NEXT: .LBB14_4: ; %atomicrmw.private
1688 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
1689 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
1690 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1691 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
1692 ; GCN2-NEXT: s_add_i32 s34, s34, 4
1693 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1694 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
1695 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
1696 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1697 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1698 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1700 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar:
1702 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1703 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
1704 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
1705 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
1706 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1707 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
1708 ; GCN3-NEXT: s_cbranch_vccnz .LBB14_3
1709 ; GCN3-NEXT: ; %bb.1: ; %Flow
1710 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1711 ; GCN3-NEXT: s_cbranch_vccz .LBB14_4
1712 ; GCN3-NEXT: .LBB14_2: ; %atomicrmw.phi
1713 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1714 ; GCN3-NEXT: .LBB14_3: ; %atomicrmw.global
1715 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1716 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1717 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1718 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
1719 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1720 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1721 ; GCN3-NEXT: buffer_wbinvl1_vol
1722 ; GCN3-NEXT: s_cbranch_execnz .LBB14_2
1723 ; GCN3-NEXT: .LBB14_4: ; %atomicrmw.private
1724 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
1725 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
1726 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
1727 ; GCN3-NEXT: v_mov_b32_e32 v1, s34
1728 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
1729 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1730 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1731 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1732 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1733 %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer at
; element offset 4 from %out, void amdgpu_gfx variant with inreg arguments.
; The expected code forms the address in s[34:35] as s[4:5] + 32 with
; s_add_u32/s_addc_u32, then compares s35 with src_private_base (gfx900 run) or
; with a dword loaded from 0xe4 (bonaire/tonga runs).
; The %atomicrmw.global block issues flat_atomic_swap_x2 without glc; the
; %atomicrmw.private block only stores s6/s7 with buffer_store_dword.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1737 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) {
1738 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
1740 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1741 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
1742 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
1743 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1744 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1745 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
1747 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
1748 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1749 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
1750 ; GCN1-NEXT: s_cbranch_vccnz .LBB15_3
1751 ; GCN1-NEXT: ; %bb.1: ; %Flow
1752 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1753 ; GCN1-NEXT: s_cbranch_vccz .LBB15_4
1754 ; GCN1-NEXT: .LBB15_2: ; %atomicrmw.phi
1755 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1756 ; GCN1-NEXT: .LBB15_3: ; %atomicrmw.global
1757 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1758 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1759 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1760 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
1761 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1762 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1763 ; GCN1-NEXT: buffer_wbinvl1_vol
1764 ; GCN1-NEXT: s_cbranch_execnz .LBB15_2
1765 ; GCN1-NEXT: .LBB15_4: ; %atomicrmw.private
1766 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
1767 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1768 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
1769 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
1770 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
1771 ; GCN1-NEXT: s_add_i32 s34, s34, 4
1772 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1773 ; GCN1-NEXT: v_mov_b32_e32 v0, s7
1774 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
1775 ; GCN1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1776 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1777 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1779 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
1781 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1782 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
1783 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
1784 ; GCN2-NEXT: s_add_u32 s34, s4, 32
1785 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
1786 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1787 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
1788 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
1789 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1790 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
1791 ; GCN2-NEXT: s_cbranch_vccnz .LBB15_3
1792 ; GCN2-NEXT: ; %bb.1: ; %Flow
1793 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1794 ; GCN2-NEXT: s_cbranch_vccz .LBB15_4
1795 ; GCN2-NEXT: .LBB15_2: ; %atomicrmw.phi
1796 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1797 ; GCN2-NEXT: .LBB15_3: ; %atomicrmw.global
1798 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
1799 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1800 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
1801 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
1802 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1803 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1804 ; GCN2-NEXT: buffer_wbinvl1_vol
1805 ; GCN2-NEXT: s_cbranch_execnz .LBB15_2
1806 ; GCN2-NEXT: .LBB15_4: ; %atomicrmw.private
1807 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
1808 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
1809 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1810 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
1811 ; GCN2-NEXT: s_add_i32 s34, s34, 4
1812 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1813 ; GCN2-NEXT: v_mov_b32_e32 v0, s7
1814 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
1815 ; GCN2-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1816 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1817 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1819 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
1821 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1822 ; GCN3-NEXT: s_add_u32 s34, s4, 32
1823 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
1824 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
1825 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
1826 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
1827 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1828 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
1829 ; GCN3-NEXT: s_cbranch_vccnz .LBB15_3
1830 ; GCN3-NEXT: ; %bb.1: ; %Flow
1831 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1832 ; GCN3-NEXT: s_cbranch_vccz .LBB15_4
1833 ; GCN3-NEXT: .LBB15_2: ; %atomicrmw.phi
1834 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1835 ; GCN3-NEXT: .LBB15_3: ; %atomicrmw.global
1836 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
1837 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1838 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
1839 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
1840 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
1841 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1842 ; GCN3-NEXT: buffer_wbinvl1_vol
1843 ; GCN3-NEXT: s_cbranch_execnz .LBB15_2
1844 ; GCN3-NEXT: .LBB15_4: ; %atomicrmw.private
1845 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
1846 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
1847 ; GCN3-NEXT: v_mov_b32_e32 v0, s7
1848 ; GCN3-NEXT: v_mov_b32_e32 v1, s34
1849 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
1850 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1851 ; GCN3-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1852 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1853 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1854 %gep = getelementptr double, ptr %out, i32 4
1855 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer,
; amdgpu_gfx variant with inreg arguments whose function type is double.
; The expected code compares s5 with src_private_base (gfx900 run) or with a
; dword loaded from 0xe4 (bonaire/tonga runs) and branches with s_cbranch_vccz.
; The %atomicrmw.global block issues flat_atomic_swap_x2 with glc, writing
; v[0:1]; the %atomicrmw.private block loads v0/v1 with buffer_load_dword and
; then stores s6/s7 with buffer_store_dword. Both paths meet at %atomicrmw.end.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1859 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) {
1860 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar:
1862 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1863 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
1864 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
1865 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1866 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
1867 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
1868 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1869 ; GCN1-NEXT: s_cbranch_vccz .LBB16_2
1870 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
1871 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1872 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1873 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1874 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
1875 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
1876 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1877 ; GCN1-NEXT: buffer_wbinvl1_vol
1878 ; GCN1-NEXT: s_cbranch_execz .LBB16_3
1879 ; GCN1-NEXT: s_branch .LBB16_4
1880 ; GCN1-NEXT: .LBB16_2:
1881 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1882 ; GCN1-NEXT: .LBB16_3: ; %atomicrmw.private
1883 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
1884 ; GCN1-NEXT: v_mov_b32_e32 v4, s6
1885 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
1886 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
1887 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
1888 ; GCN1-NEXT: s_add_i32 s34, s34, 4
1889 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
1890 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
1891 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
1892 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
1893 ; GCN1-NEXT: v_mov_b32_e32 v2, s7
1894 ; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
1895 ; GCN1-NEXT: .LBB16_4: ; %atomicrmw.end
1896 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1897 ; GCN1-NEXT: s_setpc_b64 s[30:31]
1899 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar:
1901 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1902 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
1903 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
1904 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1905 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
1906 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
1907 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1908 ; GCN2-NEXT: s_cbranch_vccz .LBB16_2
1909 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
1910 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1911 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1912 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1913 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
1914 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
1915 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1916 ; GCN2-NEXT: buffer_wbinvl1_vol
1917 ; GCN2-NEXT: s_cbranch_execz .LBB16_3
1918 ; GCN2-NEXT: s_branch .LBB16_4
1919 ; GCN2-NEXT: .LBB16_2:
1920 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
1921 ; GCN2-NEXT: .LBB16_3: ; %atomicrmw.private
1922 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
1923 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
1924 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
1925 ; GCN2-NEXT: s_add_i32 s34, s34, 4
1926 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
1927 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
1928 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
1929 ; GCN2-NEXT: v_mov_b32_e32 v4, s6
1930 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
1931 ; GCN2-NEXT: v_mov_b32_e32 v2, s7
1932 ; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
1933 ; GCN2-NEXT: .LBB16_4: ; %atomicrmw.end
1934 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1935 ; GCN2-NEXT: s_setpc_b64 s[30:31]
1937 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar:
1939 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1940 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
1941 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
1942 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
1943 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
1944 ; GCN3-NEXT: s_cbranch_vccz .LBB16_2
1945 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
1946 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1947 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1948 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1949 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
1950 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
1951 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1952 ; GCN3-NEXT: buffer_wbinvl1_vol
1953 ; GCN3-NEXT: s_cbranch_execz .LBB16_3
1954 ; GCN3-NEXT: s_branch .LBB16_4
1955 ; GCN3-NEXT: .LBB16_2:
1956 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
1957 ; GCN3-NEXT: .LBB16_3: ; %atomicrmw.private
1958 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
1959 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
1960 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
1961 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
1962 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
1963 ; GCN3-NEXT: v_mov_b32_e32 v3, s6
1964 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
1965 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
1966 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
1967 ; GCN3-NEXT: .LBB16_4: ; %atomicrmw.end
1968 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1969 ; GCN3-NEXT: s_setpc_b64 s[30:31]
1970 %result = atomicrmw xchg ptr %ptr, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer at
; element offset 4 from %out, amdgpu_gfx variant with inreg arguments whose
; function type is double.
; The expected code forms the address in s[34:35] as s[4:5] + 32, then compares
; s35 with src_private_base (gfx900 run) or with a dword loaded from 0xe4
; (bonaire/tonga runs) and branches with s_cbranch_vccz.
; The %atomicrmw.global block issues flat_atomic_swap_x2 with glc, writing
; v[0:1]; the %atomicrmw.private block loads v0/v1 with buffer_load_dword and
; then stores s6/s7 with buffer_store_dword. Both paths meet at %atomicrmw.end.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
1974 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) {
1975 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
1977 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1978 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
1979 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
1980 ; GCN1-NEXT: s_add_u32 s34, s4, 32
1981 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
1982 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1983 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
1984 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
1985 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
1986 ; GCN1-NEXT: s_cbranch_vccz .LBB17_2
1987 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
1988 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
1989 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1990 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
1991 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
1992 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
1993 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1994 ; GCN1-NEXT: buffer_wbinvl1_vol
1995 ; GCN1-NEXT: s_cbranch_execz .LBB17_3
1996 ; GCN1-NEXT: s_branch .LBB17_4
1997 ; GCN1-NEXT: .LBB17_2:
1998 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
1999 ; GCN1-NEXT: .LBB17_3: ; %atomicrmw.private
2000 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
2001 ; GCN1-NEXT: v_mov_b32_e32 v4, s6
2002 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
2003 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
2004 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2005 ; GCN1-NEXT: s_add_i32 s34, s34, 4
2006 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
2007 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
2008 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
2009 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
2010 ; GCN1-NEXT: v_mov_b32_e32 v2, s7
2011 ; GCN1-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
2012 ; GCN1-NEXT: .LBB17_4: ; %atomicrmw.end
2013 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2014 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2016 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
2018 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2019 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
2020 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
2021 ; GCN2-NEXT: s_add_u32 s34, s4, 32
2022 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
2023 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2024 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
2025 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
2026 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
2027 ; GCN2-NEXT: s_cbranch_vccz .LBB17_2
2028 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
2029 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
2030 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2031 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
2032 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
2033 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
2034 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2035 ; GCN2-NEXT: buffer_wbinvl1_vol
2036 ; GCN2-NEXT: s_cbranch_execz .LBB17_3
2037 ; GCN2-NEXT: s_branch .LBB17_4
2038 ; GCN2-NEXT: .LBB17_2:
2039 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2040 ; GCN2-NEXT: .LBB17_3: ; %atomicrmw.private
2041 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
2042 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
2043 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2044 ; GCN2-NEXT: s_add_i32 s34, s34, 4
2045 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
2046 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
2047 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
2048 ; GCN2-NEXT: v_mov_b32_e32 v4, s6
2049 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
2050 ; GCN2-NEXT: v_mov_b32_e32 v2, s7
2051 ; GCN2-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
2052 ; GCN2-NEXT: .LBB17_4: ; %atomicrmw.end
2053 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2054 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2056 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
2058 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2059 ; GCN3-NEXT: s_add_u32 s34, s4, 32
2060 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
2061 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
2062 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
2063 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
2064 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
2065 ; GCN3-NEXT: s_cbranch_vccz .LBB17_2
2066 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
2067 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
2068 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2069 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
2070 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
2071 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
2072 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2073 ; GCN3-NEXT: buffer_wbinvl1_vol
2074 ; GCN3-NEXT: s_cbranch_execz .LBB17_3
2075 ; GCN3-NEXT: s_branch .LBB17_4
2076 ; GCN3-NEXT: .LBB17_2:
2077 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2078 ; GCN3-NEXT: .LBB17_3: ; %atomicrmw.private
2079 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
2080 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
2081 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
2082 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
2083 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
2084 ; GCN3-NEXT: v_mov_b32_e32 v3, s6
2085 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
2086 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
2087 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
2088 ; GCN3-NEXT: .LBB17_4: ; %atomicrmw.end
2089 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2090 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2091 %gep = getelementptr double, ptr %out, i32 4
2092 %result = atomicrmw xchg ptr %gep, double %in seq_cst
; Test case: seq_cst `atomicrmw xchg` of a double through a generic pointer at
; element offset 4 from %out (the expected code adds 32 to the address), void
; variant, with !amdgpu.no.remote.memory metadata attached to the atomicrmw.
; The expected code compares the high address dword with src_private_base
; (gfx900 run) or with a dword loaded from 0xe4 (bonaire/tonga runs), then takes
; either the %atomicrmw.global block (flat_atomic_swap_x2 without glc) or the
; %atomicrmw.private block (two buffer_store_dword of v2/v3, no loads).
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing by hand.
2096 define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
2097 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
2099 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2100 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2101 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2102 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2103 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2104 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2105 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2106 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2107 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2108 ; GCN1-NEXT: s_cbranch_execnz .LBB18_3
2109 ; GCN1-NEXT: ; %bb.1: ; %Flow
2110 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2111 ; GCN1-NEXT: s_cbranch_execnz .LBB18_4
2112 ; GCN1-NEXT: .LBB18_2: ; %atomicrmw.phi
2113 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2114 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2115 ; GCN1-NEXT: .LBB18_3: ; %atomicrmw.global
2116 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
2117 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2118 ; GCN1-NEXT: buffer_wbinvl1_vol
2119 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2120 ; GCN1-NEXT: ; implicit-def: $vgpr2
2121 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2122 ; GCN1-NEXT: s_cbranch_execz .LBB18_2
2123 ; GCN1-NEXT: .LBB18_4: ; %atomicrmw.private
2124 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2125 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2126 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
2127 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v0
2128 ; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
2129 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2130 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2131 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2133 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
2135 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2136 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2137 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2138 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2139 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2140 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2141 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2142 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2143 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2144 ; GCN2-NEXT: s_cbranch_execnz .LBB18_3
2145 ; GCN2-NEXT: ; %bb.1: ; %Flow
2146 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2147 ; GCN2-NEXT: s_cbranch_execnz .LBB18_4
2148 ; GCN2-NEXT: .LBB18_2: ; %atomicrmw.phi
2149 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2150 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2151 ; GCN2-NEXT: .LBB18_3: ; %atomicrmw.global
2152 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
2153 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2154 ; GCN2-NEXT: buffer_wbinvl1_vol
2155 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2156 ; GCN2-NEXT: ; implicit-def: $vgpr2
2157 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2158 ; GCN2-NEXT: s_cbranch_execz .LBB18_2
2159 ; GCN2-NEXT: .LBB18_4: ; %atomicrmw.private
2160 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2161 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2162 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
2163 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v0
2164 ; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
2165 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2166 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2167 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2169 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
2171 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2172 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
2173 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2174 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2175 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
2176 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2177 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2178 ; GCN3-NEXT: s_cbranch_execnz .LBB18_3
2179 ; GCN3-NEXT: ; %bb.1: ; %Flow
2180 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2181 ; GCN3-NEXT: s_cbranch_execnz .LBB18_4
2182 ; GCN3-NEXT: .LBB18_2: ; %atomicrmw.phi
2183 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2184 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2185 ; GCN3-NEXT: .LBB18_3: ; %atomicrmw.global
2186 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
2187 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2188 ; GCN3-NEXT: buffer_wbinvl1_vol
2189 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2190 ; GCN3-NEXT: ; implicit-def: $vgpr3
2191 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2192 ; GCN3-NEXT: s_cbranch_execz .LBB18_2
2193 ; GCN3-NEXT: .LBB18_4: ; %atomicrmw.private
2194 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2195 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2196 ; GCN3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
2197 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
2198 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2199 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2200 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2201 %gep = getelementptr double, ptr %out, i64 4
2202 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
; Exchanges a double at %out advanced by 4 doubles (the checks add 32 to the
; address) using a seq_cst atomicrmw xchg tagged !amdgpu.no.remote.memory.
; The function is declared to return double.
; Every check prefix expects the same two-way split. The high dword of the
; address is compared against a base value (loaded via 0xe4 on GCN1 and GCN2,
; read from src_private_base on GCN3 -- presumably the private aperture), then
;   %atomicrmw.global   issues flat_atomic_swap_x2 with glc, result in v[0:1]
;   %atomicrmw.private  loads the old two dwords into v0 and v1 and stores
;                       v2 and v3 with buffer instructions; a null address is
;                       replaced by -1 before it is used as the offset.
2206 define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
2207 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
2209 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2210 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2211 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2212 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
2213 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2214 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2215 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2216 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2217 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2218 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2219 ; GCN1-NEXT: s_cbranch_execnz .LBB19_3
2220 ; GCN1-NEXT: ; %bb.1: ; %Flow
2221 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2222 ; GCN1-NEXT: s_cbranch_execnz .LBB19_4
2223 ; GCN1-NEXT: .LBB19_2: ; %atomicrmw.phi
2224 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2225 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2226 ; GCN1-NEXT: .LBB19_3: ; %atomicrmw.global
2227 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
2228 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2229 ; GCN1-NEXT: buffer_wbinvl1_vol
2230 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
2231 ; GCN1-NEXT: ; implicit-def: $vgpr2
2232 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2233 ; GCN1-NEXT: s_cbranch_execz .LBB19_2
2234 ; GCN1-NEXT: .LBB19_4: ; %atomicrmw.private
2235 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2236 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2237 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
2238 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2239 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2240 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2241 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2242 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2243 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2244 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2246 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
2248 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2249 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2250 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2251 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
2252 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2253 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2254 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2255 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2256 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2257 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2258 ; GCN2-NEXT: s_cbranch_execnz .LBB19_3
2259 ; GCN2-NEXT: ; %bb.1: ; %Flow
2260 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2261 ; GCN2-NEXT: s_cbranch_execnz .LBB19_4
2262 ; GCN2-NEXT: .LBB19_2: ; %atomicrmw.phi
2263 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2264 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2265 ; GCN2-NEXT: .LBB19_3: ; %atomicrmw.global
2266 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
2267 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2268 ; GCN2-NEXT: buffer_wbinvl1_vol
2269 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
2270 ; GCN2-NEXT: ; implicit-def: $vgpr2
2271 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2272 ; GCN2-NEXT: s_cbranch_execz .LBB19_2
2273 ; GCN2-NEXT: .LBB19_4: ; %atomicrmw.private
2274 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2275 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2276 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
2277 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2278 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2279 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2280 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2281 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2282 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2283 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2285 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
2287 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2288 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
2289 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
2290 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2291 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
2292 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2293 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2294 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2295 ; GCN3-NEXT: s_cbranch_execnz .LBB19_3
2296 ; GCN3-NEXT: ; %bb.1: ; %Flow
2297 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2298 ; GCN3-NEXT: s_cbranch_execnz .LBB19_4
2299 ; GCN3-NEXT: .LBB19_2: ; %atomicrmw.phi
2300 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2301 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2302 ; GCN3-NEXT: .LBB19_3: ; %atomicrmw.global
2303 ; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] glc
2304 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2305 ; GCN3-NEXT: buffer_wbinvl1_vol
2306 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
2307 ; GCN3-NEXT: ; implicit-def: $vgpr2
2308 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2309 ; GCN3-NEXT: s_cbranch_execz .LBB19_2
2310 ; GCN3-NEXT: .LBB19_4: ; %atomicrmw.private
2311 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2312 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2313 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2314 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
2315 ; GCN3-NEXT: s_nop 0
2316 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2317 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
2318 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2319 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2320 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep addresses the element 4 doubles past %out.
2321 %gep = getelementptr double, ptr %out, i64 4
2322 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0
2326 ; ---------------------------------------------------------------------
2328 ; ---------------------------------------------------------------------
; seq_cst atomicrmw add of an i64 applied directly to %ptr. The function is
; declared void and the atomic's value is only bound to %tmp0.
; Every check prefix expects a two-way split. The high dword of the address
; (v1) is compared against a base value (loaded via 0xe4 on GCN1 and GCN2,
; read from src_private_base on GCN3 -- presumably the private aperture), then
;   %atomicrmw.global   issues flat_atomic_add_x2 without glc
;   %atomicrmw.private  performs the add as two dword buffer loads, an
;                       add / add-with-carry pair, and two dword buffer stores;
;                       a null address is replaced by -1 before use.
2330 define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
2331 ; GCN1-LABEL: flat_atomic_add_i64_noret:
2333 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2334 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2335 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2336 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2337 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2338 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2339 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2340 ; GCN1-NEXT: s_cbranch_execnz .LBB20_3
2341 ; GCN1-NEXT: ; %bb.1: ; %Flow
2342 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2343 ; GCN1-NEXT: s_cbranch_execnz .LBB20_4
2344 ; GCN1-NEXT: .LBB20_2: ; %atomicrmw.phi
2345 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2346 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2347 ; GCN1-NEXT: .LBB20_3: ; %atomicrmw.global
2348 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2349 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2350 ; GCN1-NEXT: buffer_wbinvl1_vol
2351 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2352 ; GCN1-NEXT: ; implicit-def: $vgpr2
2353 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2354 ; GCN1-NEXT: s_cbranch_execz .LBB20_2
2355 ; GCN1-NEXT: .LBB20_4: ; %atomicrmw.private
2356 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2357 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2358 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2359 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
2360 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
2361 ; GCN1-NEXT: s_waitcnt vmcnt(1)
2362 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2
2363 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2364 ; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
2365 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2366 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2367 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2368 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2369 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2371 ; GCN2-LABEL: flat_atomic_add_i64_noret:
2373 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2374 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2375 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2376 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2377 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2378 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2379 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2380 ; GCN2-NEXT: s_cbranch_execnz .LBB20_3
2381 ; GCN2-NEXT: ; %bb.1: ; %Flow
2382 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2383 ; GCN2-NEXT: s_cbranch_execnz .LBB20_4
2384 ; GCN2-NEXT: .LBB20_2: ; %atomicrmw.phi
2385 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2386 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2387 ; GCN2-NEXT: .LBB20_3: ; %atomicrmw.global
2388 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2389 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2390 ; GCN2-NEXT: buffer_wbinvl1_vol
2391 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2392 ; GCN2-NEXT: ; implicit-def: $vgpr2
2393 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2394 ; GCN2-NEXT: s_cbranch_execz .LBB20_2
2395 ; GCN2-NEXT: .LBB20_4: ; %atomicrmw.private
2396 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2397 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2398 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2399 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2400 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
2401 ; GCN2-NEXT: s_waitcnt vmcnt(1)
2402 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2
2403 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2404 ; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
2405 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2406 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2407 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2408 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2409 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2411 ; GCN3-LABEL: flat_atomic_add_i64_noret:
2413 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2414 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2415 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
2416 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2417 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2418 ; GCN3-NEXT: s_cbranch_execnz .LBB20_3
2419 ; GCN3-NEXT: ; %bb.1: ; %Flow
2420 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2421 ; GCN3-NEXT: s_cbranch_execnz .LBB20_4
2422 ; GCN3-NEXT: .LBB20_2: ; %atomicrmw.phi
2423 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2424 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2425 ; GCN3-NEXT: .LBB20_3: ; %atomicrmw.global
2426 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2427 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2428 ; GCN3-NEXT: buffer_wbinvl1_vol
2429 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2430 ; GCN3-NEXT: ; implicit-def: $vgpr2
2431 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2432 ; GCN3-NEXT: s_cbranch_execz .LBB20_2
2433 ; GCN3-NEXT: .LBB20_4: ; %atomicrmw.private
2434 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2435 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2436 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2437 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
2438 ; GCN3-NEXT: s_waitcnt vmcnt(1)
2439 ; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
2440 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2441 ; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
2442 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2443 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
2444 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2445 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2446 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2447 %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw add of an i64 at %out advanced by 4 i64 elements (the
; checks add 32 to the address before anything else). The function is declared
; void and the atomic's value is only bound to %tmp0.
; After the address add, every check prefix expects a two-way split. The high
; dword of the address (v1) is compared against a base value (loaded via 0xe4
; on GCN1 and GCN2, read from src_private_base on GCN3 -- presumably the
; private aperture), then
;   %atomicrmw.global   issues flat_atomic_add_x2 without glc
;   %atomicrmw.private  performs the add as two dword buffer loads, an
;                       add / add-with-carry pair, and two dword buffer stores;
;                       a null address is replaced by -1 before use.
2451 define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
2452 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset:
2454 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2455 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2456 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2457 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
2458 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2459 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2460 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2461 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2462 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2463 ; GCN1-NEXT: s_cbranch_execnz .LBB21_3
2464 ; GCN1-NEXT: ; %bb.1: ; %Flow
2465 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2466 ; GCN1-NEXT: s_cbranch_execnz .LBB21_4
2467 ; GCN1-NEXT: .LBB21_2: ; %atomicrmw.phi
2468 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2469 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2470 ; GCN1-NEXT: .LBB21_3: ; %atomicrmw.global
2471 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2472 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2473 ; GCN1-NEXT: buffer_wbinvl1_vol
2474 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2475 ; GCN1-NEXT: ; implicit-def: $vgpr2
2476 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2477 ; GCN1-NEXT: s_cbranch_execz .LBB21_2
2478 ; GCN1-NEXT: .LBB21_4: ; %atomicrmw.private
2479 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2480 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2481 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2482 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
2483 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
2484 ; GCN1-NEXT: s_waitcnt vmcnt(1)
2485 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2
2486 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2487 ; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
2488 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2489 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2490 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2491 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2492 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2494 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset:
2496 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2497 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2498 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2499 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
2500 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2501 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2502 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
2503 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2504 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2505 ; GCN2-NEXT: s_cbranch_execnz .LBB21_3
2506 ; GCN2-NEXT: ; %bb.1: ; %Flow
2507 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2508 ; GCN2-NEXT: s_cbranch_execnz .LBB21_4
2509 ; GCN2-NEXT: .LBB21_2: ; %atomicrmw.phi
2510 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2511 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2512 ; GCN2-NEXT: .LBB21_3: ; %atomicrmw.global
2513 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2514 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2515 ; GCN2-NEXT: buffer_wbinvl1_vol
2516 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2517 ; GCN2-NEXT: ; implicit-def: $vgpr2
2518 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2519 ; GCN2-NEXT: s_cbranch_execz .LBB21_2
2520 ; GCN2-NEXT: .LBB21_4: ; %atomicrmw.private
2521 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2522 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2523 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2524 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2525 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
2526 ; GCN2-NEXT: s_waitcnt vmcnt(1)
2527 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2
2528 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2529 ; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
2530 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2531 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2532 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2533 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2534 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2536 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset:
2538 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2539 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
2540 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2541 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2542 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
2543 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2544 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2545 ; GCN3-NEXT: s_cbranch_execnz .LBB21_3
2546 ; GCN3-NEXT: ; %bb.1: ; %Flow
2547 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2548 ; GCN3-NEXT: s_cbranch_execnz .LBB21_4
2549 ; GCN3-NEXT: .LBB21_2: ; %atomicrmw.phi
2550 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2551 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2552 ; GCN3-NEXT: .LBB21_3: ; %atomicrmw.global
2553 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2554 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2555 ; GCN3-NEXT: buffer_wbinvl1_vol
2556 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2557 ; GCN3-NEXT: ; implicit-def: $vgpr2
2558 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2559 ; GCN3-NEXT: s_cbranch_execz .LBB21_2
2560 ; GCN3-NEXT: .LBB21_4: ; %atomicrmw.private
2561 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
2562 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
2563 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2564 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
2565 ; GCN3-NEXT: s_waitcnt vmcnt(1)
2566 ; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
2567 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2568 ; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
2569 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2570 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
2571 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2572 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2573 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep addresses the element 4 i64s past %out.
2574 %gep = getelementptr i64, ptr %out, i64 4
2575 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
; seq_cst atomicrmw add of an i64 applied directly to %ptr, with the atomic's
; value bound to %result. The function is declared to return i64.
; The checks first copy the incoming pointer from v[0:1] to v[4:5], leaving
; v[0:1] free for the loaded value. Every check prefix then expects a two-way
; split. The high dword of the address (v5) is compared against a base value
; (loaded via 0xe4 on GCN1 and GCN2, read from src_private_base on GCN3 --
; presumably the private aperture), then
;   %atomicrmw.global   issues flat_atomic_add_x2 with glc, result in v[0:1]
;   %atomicrmw.private  loads the old two dwords into v0 and v1, computes the
;                       sum into v2 and v3 with add / add-with-carry, and
;                       stores the sum back with buffer instructions; a null
;                       address is replaced by -1 before use.
2579 define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
2580 ; GCN1-LABEL: flat_atomic_add_i64_ret:
2582 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2583 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2584 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2585 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
2586 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
2587 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2588 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2589 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2590 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2591 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2592 ; GCN1-NEXT: s_cbranch_execnz .LBB22_3
2593 ; GCN1-NEXT: ; %bb.1: ; %Flow
2594 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2595 ; GCN1-NEXT: s_cbranch_execnz .LBB22_4
2596 ; GCN1-NEXT: .LBB22_2: ; %atomicrmw.phi
2597 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2598 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2599 ; GCN1-NEXT: .LBB22_3: ; %atomicrmw.global
2600 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2601 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2602 ; GCN1-NEXT: buffer_wbinvl1_vol
2603 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
2604 ; GCN1-NEXT: ; implicit-def: $vgpr2
2605 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2606 ; GCN1-NEXT: s_cbranch_execz .LBB22_2
2607 ; GCN1-NEXT: .LBB22_4: ; %atomicrmw.private
2608 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2609 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2610 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2611 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
2612 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2613 ; GCN1-NEXT: s_waitcnt vmcnt(1)
2614 ; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2
2615 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2616 ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
2617 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2618 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2619 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2620 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2621 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2623 ; GCN2-LABEL: flat_atomic_add_i64_ret:
2625 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2626 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2627 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2628 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
2629 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
2630 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2631 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2632 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2633 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2634 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2635 ; GCN2-NEXT: s_cbranch_execnz .LBB22_3
2636 ; GCN2-NEXT: ; %bb.1: ; %Flow
2637 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2638 ; GCN2-NEXT: s_cbranch_execnz .LBB22_4
2639 ; GCN2-NEXT: .LBB22_2: ; %atomicrmw.phi
2640 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2641 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2642 ; GCN2-NEXT: .LBB22_3: ; %atomicrmw.global
2643 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2644 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2645 ; GCN2-NEXT: buffer_wbinvl1_vol
2646 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
2647 ; GCN2-NEXT: ; implicit-def: $vgpr2
2648 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2649 ; GCN2-NEXT: s_cbranch_execz .LBB22_2
2650 ; GCN2-NEXT: .LBB22_4: ; %atomicrmw.private
2651 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2652 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2653 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2654 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
2655 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2656 ; GCN2-NEXT: s_waitcnt vmcnt(1)
2657 ; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2
2658 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2659 ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
2660 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2661 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2662 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2663 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2664 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2666 ; GCN3-LABEL: flat_atomic_add_i64_ret:
2668 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2669 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
2670 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2671 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
2672 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
2673 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2674 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2675 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2676 ; GCN3-NEXT: s_cbranch_execnz .LBB22_3
2677 ; GCN3-NEXT: ; %bb.1: ; %Flow
2678 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2679 ; GCN3-NEXT: s_cbranch_execnz .LBB22_4
2680 ; GCN3-NEXT: .LBB22_2: ; %atomicrmw.phi
2681 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2682 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2683 ; GCN3-NEXT: .LBB22_3: ; %atomicrmw.global
2684 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2685 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2686 ; GCN3-NEXT: buffer_wbinvl1_vol
2687 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
2688 ; GCN3-NEXT: ; implicit-def: $vgpr2
2689 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2690 ; GCN3-NEXT: s_cbranch_execz .LBB22_2
2691 ; GCN3-NEXT: .LBB22_4: ; %atomicrmw.private
2692 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2693 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2694 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2695 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
2696 ; GCN3-NEXT: s_waitcnt vmcnt(1)
2697 ; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
2698 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2699 ; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
2700 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2701 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
2702 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2703 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2704 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2705 %result = atomicrmw add ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw add of an i64 at %out advanced by 4 i64 elements, with the
; atomic's value bound to %result. The function is declared to return i64.
; The checks compute the offset address (+32) into v[4:5], leaving v[0:1] free
; for the loaded value. Every check prefix then expects a two-way split. The
; high dword of the address (v5) is compared against a base value (loaded via
; 0xe4 on GCN1 and GCN2, read from src_private_base on GCN3 -- presumably the
; private aperture), then
;   %atomicrmw.global   issues flat_atomic_add_x2 with glc, result in v[0:1]
;   %atomicrmw.private  loads the old two dwords into v0 and v1, computes the
;                       sum into v2 and v3 with add / add-with-carry, and
;                       stores the sum back with buffer instructions; a null
;                       address is replaced by -1 before use.
2709 define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
2710 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset:
2712 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2713 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
2714 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
2715 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
2716 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2717 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2718 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2719 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
2720 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
2721 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2722 ; GCN1-NEXT: s_cbranch_execnz .LBB23_3
2723 ; GCN1-NEXT: ; %bb.1: ; %Flow
2724 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2725 ; GCN1-NEXT: s_cbranch_execnz .LBB23_4
2726 ; GCN1-NEXT: .LBB23_2: ; %atomicrmw.phi
2727 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2728 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2729 ; GCN1-NEXT: .LBB23_3: ; %atomicrmw.global
2730 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2731 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2732 ; GCN1-NEXT: buffer_wbinvl1_vol
2733 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
2734 ; GCN1-NEXT: ; implicit-def: $vgpr2
2735 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2736 ; GCN1-NEXT: s_cbranch_execz .LBB23_2
2737 ; GCN1-NEXT: .LBB23_4: ; %atomicrmw.private
2738 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2739 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2740 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2741 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
2742 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2743 ; GCN1-NEXT: s_waitcnt vmcnt(1)
2744 ; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2
2745 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2746 ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
2747 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2748 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2749 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
2750 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2751 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2753 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset:
2755 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2756 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
2757 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
2758 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
2759 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2760 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2761 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
2762 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
2763 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
2764 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2765 ; GCN2-NEXT: s_cbranch_execnz .LBB23_3
2766 ; GCN2-NEXT: ; %bb.1: ; %Flow
2767 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2768 ; GCN2-NEXT: s_cbranch_execnz .LBB23_4
2769 ; GCN2-NEXT: .LBB23_2: ; %atomicrmw.phi
2770 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2771 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2772 ; GCN2-NEXT: .LBB23_3: ; %atomicrmw.global
2773 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2774 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2775 ; GCN2-NEXT: buffer_wbinvl1_vol
2776 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
2777 ; GCN2-NEXT: ; implicit-def: $vgpr2
2778 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2779 ; GCN2-NEXT: s_cbranch_execz .LBB23_2
2780 ; GCN2-NEXT: .LBB23_4: ; %atomicrmw.private
2781 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2782 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2783 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2784 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
2785 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
2786 ; GCN2-NEXT: s_waitcnt vmcnt(1)
2787 ; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2
2788 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2789 ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
2790 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2791 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
2792 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
2793 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2794 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2796 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset:
2798 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2799 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
2800 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
2801 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
2802 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
2803 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
2804 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
2805 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
2806 ; GCN3-NEXT: s_cbranch_execnz .LBB23_3
2807 ; GCN3-NEXT: ; %bb.1: ; %Flow
2808 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2809 ; GCN3-NEXT: s_cbranch_execnz .LBB23_4
2810 ; GCN3-NEXT: .LBB23_2: ; %atomicrmw.phi
2811 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2812 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2813 ; GCN3-NEXT: .LBB23_3: ; %atomicrmw.global
2814 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
2815 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2816 ; GCN3-NEXT: buffer_wbinvl1_vol
2817 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
2818 ; GCN3-NEXT: ; implicit-def: $vgpr2
2819 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
2820 ; GCN3-NEXT: s_cbranch_execz .LBB23_2
2821 ; GCN3-NEXT: .LBB23_4: ; %atomicrmw.private
2822 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
2823 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
2824 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
2825 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
2826 ; GCN3-NEXT: s_waitcnt vmcnt(1)
2827 ; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
2828 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2829 ; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
2830 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
2831 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
2832 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
2833 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2834 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; %gep addresses the element 4 i64s past %out.
2835 %gep = getelementptr i64, ptr %out, i64 4
2836 %result = atomicrmw add ptr %gep, i64 %in seq_cst
; amdgpu_gfx variant of the no-return i64 add, with both %ptr and %in marked
; inreg. The checks read the pointer from s[4:5] and the operand from s6 and
; s7. The atomic is a seq_cst atomicrmw add whose value is only bound to %tmp0.
; The address test uses scalar compares and vcc branches rather than the exec
; masking seen in the VGPR-argument tests. The pointer high dword (s5) is
; compared against a base value (loaded via 0xe4 on GCN1 and GCN2, read from
; src_private_base on GCN3 -- presumably the private aperture), then
;   %atomicrmw.global   copies pointer and operand into v0..v3 and issues
;                       flat_atomic_add_x2 without glc
;   %atomicrmw.private  selects s4 (or -1 for a null pointer) as the buffer
;                       offset, then performs the add with two dword buffer
;                       loads, add / add-with-carry, and two dword stores.
; The null test differs by target. GCN1 expects v_cmp_ne_u64_e64 followed by
; s_and_b64 with exec, while GCN2 and GCN3 expect s_cmp_lg_u64.
2840 define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
2841 ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar:
2843 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2844 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
2845 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
2846 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2847 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
2848 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
2849 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2850 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
2851 ; GCN1-NEXT: s_cbranch_vccnz .LBB24_3
2852 ; GCN1-NEXT: ; %bb.1: ; %Flow
2853 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2854 ; GCN1-NEXT: s_cbranch_vccz .LBB24_4
2855 ; GCN1-NEXT: .LBB24_2: ; %atomicrmw.phi
2856 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2857 ; GCN1-NEXT: .LBB24_3: ; %atomicrmw.global
2858 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2859 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2860 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2861 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
2862 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2863 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2864 ; GCN1-NEXT: buffer_wbinvl1_vol
2865 ; GCN1-NEXT: s_cbranch_execnz .LBB24_2
2866 ; GCN1-NEXT: .LBB24_4: ; %atomicrmw.private
2867 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
2868 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
2869 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
2870 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
2871 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
2872 ; GCN1-NEXT: s_add_i32 s34, s34, 4
2873 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2874 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
2875 ; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
2876 ; GCN1-NEXT: s_waitcnt vmcnt(1)
2877 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
2878 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2879 ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
2880 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2881 ; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
2882 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2883 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2885 ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar:
2887 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2888 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
2889 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
2890 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2891 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
2892 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
2893 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2894 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
2895 ; GCN2-NEXT: s_cbranch_vccnz .LBB24_3
2896 ; GCN2-NEXT: ; %bb.1: ; %Flow
2897 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2898 ; GCN2-NEXT: s_cbranch_vccz .LBB24_4
2899 ; GCN2-NEXT: .LBB24_2: ; %atomicrmw.phi
2900 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2901 ; GCN2-NEXT: .LBB24_3: ; %atomicrmw.global
2902 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2903 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2904 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2905 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
2906 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2907 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2908 ; GCN2-NEXT: buffer_wbinvl1_vol
2909 ; GCN2-NEXT: s_cbranch_execnz .LBB24_2
2910 ; GCN2-NEXT: .LBB24_4: ; %atomicrmw.private
2911 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
2912 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
2913 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
2914 ; GCN2-NEXT: s_add_i32 s34, s34, 4
2915 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2916 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
2917 ; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
2918 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
2919 ; GCN2-NEXT: s_waitcnt vmcnt(1)
2920 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
2921 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2922 ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
2923 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2924 ; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
2925 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2926 ; GCN2-NEXT: s_setpc_b64 s[30:31]
2928 ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar:
2930 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2931 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
2932 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
2933 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
2934 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2935 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
2936 ; GCN3-NEXT: s_cbranch_vccnz .LBB24_3
2937 ; GCN3-NEXT: ; %bb.1: ; %Flow
2938 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
2939 ; GCN3-NEXT: s_cbranch_vccz .LBB24_4
2940 ; GCN3-NEXT: .LBB24_2: ; %atomicrmw.phi
2941 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2942 ; GCN3-NEXT: .LBB24_3: ; %atomicrmw.global
2943 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2944 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2945 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2946 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
2947 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2948 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2949 ; GCN3-NEXT: buffer_wbinvl1_vol
2950 ; GCN3-NEXT: s_cbranch_execnz .LBB24_2
2951 ; GCN3-NEXT: .LBB24_4: ; %atomicrmw.private
2952 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
2953 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
2954 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
2955 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
2956 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
2957 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
2958 ; GCN3-NEXT: s_waitcnt vmcnt(1)
2959 ; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1
2960 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2961 ; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
2962 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
2963 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
2964 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2965 ; GCN3-NEXT: s_setpc_b64 s[30:31]
2966 %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
; Non-returning 64-bit flat `atomicrmw add` (seq_cst) using the amdgpu_gfx
; calling convention, with both the pointer and the operand passed `inreg`
; and the address formed by a constant GEP.
; For every checked target the expected code splits into an
; %atomicrmw.global block (flat_atomic_add_x2) and an %atomicrmw.private
; block (two buffer loads, a 64-bit add with carry, two buffer stores).
; The path is selected by a scalar compare of the high address dword: the
; bonaire and tonga checks compare against a dword loaded via 0xe4, the
; gfx900 checks against the high half of src_private_base.
2970 define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
2971 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar:
2973 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2974 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
2975 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
2976 ; GCN1-NEXT: s_add_u32 s34, s4, 32
2977 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
2978 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2979 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
2980 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
2981 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
2982 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
2983 ; GCN1-NEXT: s_cbranch_vccnz .LBB25_3
2984 ; GCN1-NEXT: ; %bb.1: ; %Flow
2985 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
2986 ; GCN1-NEXT: s_cbranch_vccz .LBB25_4
2987 ; GCN1-NEXT: .LBB25_2: ; %atomicrmw.phi
2988 ; GCN1-NEXT: s_setpc_b64 s[30:31]
2989 ; GCN1-NEXT: .LBB25_3: ; %atomicrmw.global
2990 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
2991 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2992 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
2993 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
2994 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
2995 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2996 ; GCN1-NEXT: buffer_wbinvl1_vol
2997 ; GCN1-NEXT: s_cbranch_execnz .LBB25_2
2998 ; GCN1-NEXT: .LBB25_4: ; %atomicrmw.private
2999 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
3000 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
3001 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
3002 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
3003 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
3004 ; GCN1-NEXT: s_add_i32 s34, s34, 4
3005 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3006 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
3007 ; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
3008 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3009 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, s6, v1
3010 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3011 ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
3012 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3013 ; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
3014 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3015 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3017 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar:
3019 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3020 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
3021 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
3022 ; GCN2-NEXT: s_add_u32 s34, s4, 32
3023 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3024 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3025 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
3026 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
3027 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3028 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
3029 ; GCN2-NEXT: s_cbranch_vccnz .LBB25_3
3030 ; GCN2-NEXT: ; %bb.1: ; %Flow
3031 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3032 ; GCN2-NEXT: s_cbranch_vccz .LBB25_4
3033 ; GCN2-NEXT: .LBB25_2: ; %atomicrmw.phi
3034 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3035 ; GCN2-NEXT: .LBB25_3: ; %atomicrmw.global
3036 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3037 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3038 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
3039 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
3040 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
3041 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3042 ; GCN2-NEXT: buffer_wbinvl1_vol
3043 ; GCN2-NEXT: s_cbranch_execnz .LBB25_2
3044 ; GCN2-NEXT: .LBB25_4: ; %atomicrmw.private
3045 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
3046 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
3047 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3048 ; GCN2-NEXT: s_add_i32 s34, s34, 4
3049 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3050 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
3051 ; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
3052 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
3053 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3054 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, s6, v1
3055 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3056 ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
3057 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3058 ; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
3059 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3060 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3062 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar:
3064 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3065 ; GCN3-NEXT: s_add_u32 s34, s4, 32
3066 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
3067 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
3068 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
3069 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
3070 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3071 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
3072 ; GCN3-NEXT: s_cbranch_vccnz .LBB25_3
3073 ; GCN3-NEXT: ; %bb.1: ; %Flow
3074 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3075 ; GCN3-NEXT: s_cbranch_vccz .LBB25_4
3076 ; GCN3-NEXT: .LBB25_2: ; %atomicrmw.phi
3077 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3078 ; GCN3-NEXT: .LBB25_3: ; %atomicrmw.global
3079 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
3080 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3081 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
3082 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3083 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
3084 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3085 ; GCN3-NEXT: buffer_wbinvl1_vol
3086 ; GCN3-NEXT: s_cbranch_execnz .LBB25_2
3087 ; GCN3-NEXT: .LBB25_4: ; %atomicrmw.private
3088 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
3089 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
3090 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
3091 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3092 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
3093 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3094 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3095 ; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1
3096 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3097 ; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
3098 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3099 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
3100 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3101 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset; the checks above add 32 to
; the incoming base address before the address-space compare.
3102 %gep = getelementptr i64, ptr %out, i64 4
; The function returns void, so the old value is not handed back; the
; checks expect the flat_atomic_add_x2 form without `glc`.
3103 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
; Value-returning 64-bit flat `atomicrmw add` (seq_cst) using the amdgpu_gfx
; calling convention, with both the pointer and the operand passed `inreg`
; and no address offset.
; For every checked target the expected code has an %atomicrmw.global block
; that issues flat_atomic_add_x2 with `glc` (old value written to v[0:1])
; and an %atomicrmw.private block that loads the old value into v0/v1 with
; buffer loads, adds the operand with carry, and stores the sum back.
; Both paths join at %atomicrmw.end.
3107 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
3108 ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar:
3110 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3111 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
3112 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
3113 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3114 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
3115 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
3116 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
3117 ; GCN1-NEXT: s_cbranch_vccz .LBB26_2
3118 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
3119 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3120 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3121 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3122 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
3123 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3124 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3125 ; GCN1-NEXT: buffer_wbinvl1_vol
3126 ; GCN1-NEXT: s_cbranch_execz .LBB26_3
3127 ; GCN1-NEXT: s_branch .LBB26_4
3128 ; GCN1-NEXT: .LBB26_2:
3129 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3130 ; GCN1-NEXT: .LBB26_3: ; %atomicrmw.private
3131 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
3132 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
3133 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
3134 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
3135 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
3136 ; GCN1-NEXT: s_add_i32 s34, s34, 4
3137 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3138 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
3139 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
3140 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3141 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0
3142 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3143 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
3144 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
3145 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
3146 ; GCN1-NEXT: .LBB26_4: ; %atomicrmw.end
3147 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3148 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3150 ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar:
3152 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3153 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
3154 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
3155 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3156 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
3157 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
3158 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
3159 ; GCN2-NEXT: s_cbranch_vccz .LBB26_2
3160 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
3161 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3162 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3163 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3164 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
3165 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3166 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3167 ; GCN2-NEXT: buffer_wbinvl1_vol
3168 ; GCN2-NEXT: s_cbranch_execz .LBB26_3
3169 ; GCN2-NEXT: s_branch .LBB26_4
3170 ; GCN2-NEXT: .LBB26_2:
3171 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3172 ; GCN2-NEXT: .LBB26_3: ; %atomicrmw.private
3173 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
3174 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
3175 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
3176 ; GCN2-NEXT: s_add_i32 s34, s34, 4
3177 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3178 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
3179 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
3180 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
3181 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3182 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0
3183 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3184 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
3185 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
3186 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
3187 ; GCN2-NEXT: .LBB26_4: ; %atomicrmw.end
3188 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3189 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3191 ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar:
3193 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
3195 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
3196 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
3197 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
3198 ; GCN3-NEXT: s_cbranch_vccz .LBB26_2
3199 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
3200 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3201 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3202 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3203 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3204 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3205 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3206 ; GCN3-NEXT: buffer_wbinvl1_vol
3207 ; GCN3-NEXT: s_cbranch_execz .LBB26_3
3208 ; GCN3-NEXT: s_branch .LBB26_4
3209 ; GCN3-NEXT: .LBB26_2:
3210 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3211 ; GCN3-NEXT: .LBB26_3: ; %atomicrmw.private
3212 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
3213 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
3214 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
3215 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3216 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
3217 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3218 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3219 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0
3220 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3221 ; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
3222 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
3223 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
3224 ; GCN3-NEXT: .LBB26_4: ; %atomicrmw.end
3225 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3226 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The old memory value is captured in %result (declared return type is
; i64); in the checks above it lives in v[0:1] on both code paths.
3227 %result = atomicrmw add ptr %ptr, i64 %in seq_cst
; Value-returning 64-bit flat `atomicrmw add` (seq_cst) using the amdgpu_gfx
; calling convention, with both the pointer and the operand passed `inreg`
; and the address formed by a constant GEP.
; For every checked target the expected code has an %atomicrmw.global block
; that issues flat_atomic_add_x2 with `glc` (old value written to v[0:1])
; and an %atomicrmw.private block that loads the old value into v0/v1 with
; buffer loads, adds the operand with carry, and stores the sum back.
; Both paths join at %atomicrmw.end.
3231 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
3232 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar:
3234 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3235 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
3236 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
3237 ; GCN1-NEXT: s_add_u32 s34, s4, 32
3238 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
3239 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3240 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
3241 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
3242 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3243 ; GCN1-NEXT: s_cbranch_vccz .LBB27_2
3244 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
3245 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
3246 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3247 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
3248 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
3249 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3250 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3251 ; GCN1-NEXT: buffer_wbinvl1_vol
3252 ; GCN1-NEXT: s_cbranch_execz .LBB27_3
3253 ; GCN1-NEXT: s_branch .LBB27_4
3254 ; GCN1-NEXT: .LBB27_2:
3255 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3256 ; GCN1-NEXT: .LBB27_3: ; %atomicrmw.private
3257 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
3258 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
3259 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
3260 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
3261 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
3262 ; GCN1-NEXT: s_add_i32 s34, s34, 4
3263 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3264 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
3265 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
3266 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3267 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, s6, v0
3268 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3269 ; GCN1-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
3270 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
3271 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
3272 ; GCN1-NEXT: .LBB27_4: ; %atomicrmw.end
3273 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3274 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3276 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar:
3278 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3279 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
3280 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
3281 ; GCN2-NEXT: s_add_u32 s34, s4, 32
3282 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
3283 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3284 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
3285 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
3286 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3287 ; GCN2-NEXT: s_cbranch_vccz .LBB27_2
3288 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
3289 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
3290 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3291 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
3292 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
3293 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3294 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3295 ; GCN2-NEXT: buffer_wbinvl1_vol
3296 ; GCN2-NEXT: s_cbranch_execz .LBB27_3
3297 ; GCN2-NEXT: s_branch .LBB27_4
3298 ; GCN2-NEXT: .LBB27_2:
3299 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3300 ; GCN2-NEXT: .LBB27_3: ; %atomicrmw.private
3301 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
3302 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
3303 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
3304 ; GCN2-NEXT: s_add_i32 s34, s34, 4
3305 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3306 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
3307 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
3308 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
3309 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3310 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, s6, v0
3311 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3312 ; GCN2-NEXT: v_addc_u32_e32 v4, vcc, v1, v4, vcc
3313 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
3314 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
3315 ; GCN2-NEXT: .LBB27_4: ; %atomicrmw.end
3316 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3317 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3319 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar:
3321 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322 ; GCN3-NEXT: s_add_u32 s34, s4, 32
3323 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
3324 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
3325 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
3326 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
3327 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
3328 ; GCN3-NEXT: s_cbranch_vccz .LBB27_2
3329 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
3330 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
3331 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3332 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
3333 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3334 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
3335 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3336 ; GCN3-NEXT: buffer_wbinvl1_vol
3337 ; GCN3-NEXT: s_cbranch_execz .LBB27_3
3338 ; GCN3-NEXT: s_branch .LBB27_4
3339 ; GCN3-NEXT: .LBB27_2:
3340 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3341 ; GCN3-NEXT: .LBB27_3: ; %atomicrmw.private
3342 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
3343 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
3344 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
3345 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
3346 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
3347 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
3348 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3349 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, s6, v0
3350 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3351 ; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
3352 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
3353 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
3354 ; GCN3-NEXT: .LBB27_4: ; %atomicrmw.end
3355 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3356 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset; the checks above add 32 to
; the incoming base address before the address-space compare.
3357 %gep = getelementptr i64, ptr %out, i64 4
; The old memory value is captured in %result (declared return type is
; i64); in the checks above it lives in v[0:1] on both code paths.
3358 %result = atomicrmw add ptr %gep, i64 %in seq_cst
; Non-returning 64-bit flat `atomicrmw add` (seq_cst) at a constant GEP
; offset, with !amdgpu.no.remote.memory metadata attached to the atomic.
; The arguments are not marked `inreg`; the checks operate on v[0:1]
; (address) and v[2:3] (operand) directly.
; The expected code uses exec-mask control flow (s_and_saveexec_b64 /
; s_andn2_saveexec_b64) so lanes can take either the %atomicrmw.global
; block (flat_atomic_add_x2) or the %atomicrmw.private block (buffer loads,
; 64-bit add with carry, buffer stores) before exec is restored.
3362 define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
3363 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
3365 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3366 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
3367 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
3368 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
3369 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3370 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3371 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3372 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
3373 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3374 ; GCN1-NEXT: s_cbranch_execnz .LBB28_3
3375 ; GCN1-NEXT: ; %bb.1: ; %Flow
3376 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3377 ; GCN1-NEXT: s_cbranch_execnz .LBB28_4
3378 ; GCN1-NEXT: .LBB28_2: ; %atomicrmw.phi
3379 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3380 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3381 ; GCN1-NEXT: .LBB28_3: ; %atomicrmw.global
3382 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
3383 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3384 ; GCN1-NEXT: buffer_wbinvl1_vol
3385 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3386 ; GCN1-NEXT: ; implicit-def: $vgpr2
3387 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3388 ; GCN1-NEXT: s_cbranch_execz .LBB28_2
3389 ; GCN1-NEXT: .LBB28_4: ; %atomicrmw.private
3390 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3391 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3392 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3393 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
3394 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3395 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3396 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, v1, v2
3397 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3398 ; GCN1-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
3399 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3400 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3401 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3402 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3403 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3405 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
3407 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3408 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
3409 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
3410 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3411 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3412 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3413 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3414 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
3415 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3416 ; GCN2-NEXT: s_cbranch_execnz .LBB28_3
3417 ; GCN2-NEXT: ; %bb.1: ; %Flow
3418 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3419 ; GCN2-NEXT: s_cbranch_execnz .LBB28_4
3420 ; GCN2-NEXT: .LBB28_2: ; %atomicrmw.phi
3421 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3422 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3423 ; GCN2-NEXT: .LBB28_3: ; %atomicrmw.global
3424 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
3425 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3426 ; GCN2-NEXT: buffer_wbinvl1_vol
3427 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3428 ; GCN2-NEXT: ; implicit-def: $vgpr2
3429 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3430 ; GCN2-NEXT: s_cbranch_execz .LBB28_2
3431 ; GCN2-NEXT: .LBB28_4: ; %atomicrmw.private
3432 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3433 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3434 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3435 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
3436 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3437 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3438 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, v1, v2
3439 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3440 ; GCN2-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc
3441 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3442 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3443 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3444 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3445 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3447 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
3449 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3450 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
3451 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3452 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
3453 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
3454 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
3455 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3456 ; GCN3-NEXT: s_cbranch_execnz .LBB28_3
3457 ; GCN3-NEXT: ; %bb.1: ; %Flow
3458 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3459 ; GCN3-NEXT: s_cbranch_execnz .LBB28_4
3460 ; GCN3-NEXT: .LBB28_2: ; %atomicrmw.phi
3461 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3462 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3463 ; GCN3-NEXT: .LBB28_3: ; %atomicrmw.global
3464 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
3465 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3466 ; GCN3-NEXT: buffer_wbinvl1_vol
3467 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3468 ; GCN3-NEXT: ; implicit-def: $vgpr2
3469 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3470 ; GCN3-NEXT: s_cbranch_execz .LBB28_2
3471 ; GCN3-NEXT: .LBB28_4: ; %atomicrmw.private
3472 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3473 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3474 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3475 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
3476 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3477 ; GCN3-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
3478 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3479 ; GCN3-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
3480 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3481 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
3482 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3483 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3484 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset; the checks above add 32 to
; v[0:1] before comparing the high dword of the address.
3485 %gep = getelementptr i64, ptr %out, i64 4
; The function returns void, so the old value is not handed back. The !0
; metadata node is defined outside this part of the file.
3486 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning 64-bit flat `atomicrmw add` (seq_cst) at a constant GEP
; offset, with !amdgpu.no.remote.memory metadata attached to the atomic.
; The arguments are not marked `inreg`; the checks compute the offset
; address into v[4:5] and take the operand from v[2:3].
; The expected code uses exec-mask control flow (s_and_saveexec_b64 /
; s_andn2_saveexec_b64) so lanes can take either the %atomicrmw.global
; block (flat_atomic_add_x2 with `glc`, old value in v[0:1]) or the
; %atomicrmw.private block (old value loaded into v0/v1 by buffer loads,
; then a 64-bit add with carry and buffer stores) before exec is restored.
3490 define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
3491 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
3493 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3494 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
3495 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
3496 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
3497 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3498 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3499 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
3500 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3501 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
3502 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3503 ; GCN1-NEXT: s_cbranch_execnz .LBB29_3
3504 ; GCN1-NEXT: ; %bb.1: ; %Flow
3505 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3506 ; GCN1-NEXT: s_cbranch_execnz .LBB29_4
3507 ; GCN1-NEXT: .LBB29_2: ; %atomicrmw.phi
3508 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3509 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3510 ; GCN1-NEXT: .LBB29_3: ; %atomicrmw.global
3511 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
3512 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3513 ; GCN1-NEXT: buffer_wbinvl1_vol
3514 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
3515 ; GCN1-NEXT: ; implicit-def: $vgpr2
3516 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3517 ; GCN1-NEXT: s_cbranch_execz .LBB29_2
3518 ; GCN1-NEXT: .LBB29_4: ; %atomicrmw.private
3519 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3520 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3521 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3522 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
3523 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
3524 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3525 ; GCN1-NEXT: v_add_i32_e32 v2, vcc, v0, v2
3526 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3527 ; GCN1-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
3528 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3529 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
3530 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3531 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3532 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3534 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
3536 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3537 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
3538 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
3539 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
3540 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
3541 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3542 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
3543 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3544 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
3545 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3546 ; GCN2-NEXT: s_cbranch_execnz .LBB29_3
3547 ; GCN2-NEXT: ; %bb.1: ; %Flow
3548 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3549 ; GCN2-NEXT: s_cbranch_execnz .LBB29_4
3550 ; GCN2-NEXT: .LBB29_2: ; %atomicrmw.phi
3551 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3552 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3553 ; GCN2-NEXT: .LBB29_3: ; %atomicrmw.global
3554 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
3555 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3556 ; GCN2-NEXT: buffer_wbinvl1_vol
3557 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
3558 ; GCN2-NEXT: ; implicit-def: $vgpr2
3559 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3560 ; GCN2-NEXT: s_cbranch_execz .LBB29_2
3561 ; GCN2-NEXT: .LBB29_4: ; %atomicrmw.private
3562 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3563 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3564 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3565 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
3566 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
3567 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3568 ; GCN2-NEXT: v_add_u32_e32 v2, vcc, v0, v2
3569 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3570 ; GCN2-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
3571 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3572 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
3573 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3574 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3575 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3577 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
3579 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
3581 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
3582 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
3583 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
3584 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3585 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
3586 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3587 ; GCN3-NEXT: s_cbranch_execnz .LBB29_3
3588 ; GCN3-NEXT: ; %bb.1: ; %Flow
3589 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3590 ; GCN3-NEXT: s_cbranch_execnz .LBB29_4
3591 ; GCN3-NEXT: .LBB29_2: ; %atomicrmw.phi
3592 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3593 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3594 ; GCN3-NEXT: .LBB29_3: ; %atomicrmw.global
3595 ; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] glc
3596 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3597 ; GCN3-NEXT: buffer_wbinvl1_vol
3598 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
3599 ; GCN3-NEXT: ; implicit-def: $vgpr2
3600 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3601 ; GCN3-NEXT: s_cbranch_execz .LBB29_2
3602 ; GCN3-NEXT: .LBB29_4: ; %atomicrmw.private
3603 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3604 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3605 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3606 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
3607 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3608 ; GCN3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
3609 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3610 ; GCN3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
3611 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3612 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
3613 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3614 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3615 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset; the checks above add 32 to
; the incoming address and keep the sum in v[4:5].
3616 %gep = getelementptr i64, ptr %out, i64 4
; The old memory value is captured in %result (declared return type is
; i64). The !0 metadata node is defined outside this part of the file.
3617 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
3621 ; ---------------------------------------------------------------------
3623 ; ---------------------------------------------------------------------
; Test: seq_cst 64-bit `atomicrmw sub` through a flat pointer, result unused.
; Inputs per the expected code: pointer in v[0:1], operand in v[2:3].
; The expected code for all three RUN targets (bonaire, tonga, gfx900) has
; the same shape:
;   * the high dword of the pointer (v1) is compared against a value that is
;     loaded from address 0xe4 on bonaire/tonga and read from
;     src_private_base on gfx900;
;   * lanes where they differ take %atomicrmw.global and issue
;     flat_atomic_sub_x2 followed by a wait and buffer_wbinvl1_vol;
;   * the remaining lanes take %atomicrmw.private, which performs the 64-bit
;     subtract as two dword buffer loads, a sub/subb pair and two dword
;     buffer stores. A null 64-bit pointer selects -1 as the buffer address.
; The assertion lines are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script rather than editing them by hand.
3625 define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
3626 ; GCN1-LABEL: flat_atomic_sub_i64_noret:
3628 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3629 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
3630 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
3631 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3632 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3633 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
3634 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3635 ; GCN1-NEXT: s_cbranch_execnz .LBB30_3
3636 ; GCN1-NEXT: ; %bb.1: ; %Flow
3637 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3638 ; GCN1-NEXT: s_cbranch_execnz .LBB30_4
3639 ; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi
3640 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3641 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3642 ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global
3643 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3644 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3645 ; GCN1-NEXT: buffer_wbinvl1_vol
3646 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3647 ; GCN1-NEXT: ; implicit-def: $vgpr2
3648 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3649 ; GCN1-NEXT: s_cbranch_execz .LBB30_2
3650 ; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private
3651 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3652 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3653 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3654 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
3655 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3656 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3657 ; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
3658 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3659 ; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
3660 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3661 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3662 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3663 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3664 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3666 ; GCN2-LABEL: flat_atomic_sub_i64_noret:
3668 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3669 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
3670 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
3671 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3672 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3673 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
3674 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3675 ; GCN2-NEXT: s_cbranch_execnz .LBB30_3
3676 ; GCN2-NEXT: ; %bb.1: ; %Flow
3677 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3678 ; GCN2-NEXT: s_cbranch_execnz .LBB30_4
3679 ; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi
3680 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3681 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3682 ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global
3683 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3684 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3685 ; GCN2-NEXT: buffer_wbinvl1_vol
3686 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3687 ; GCN2-NEXT: ; implicit-def: $vgpr2
3688 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3689 ; GCN2-NEXT: s_cbranch_execz .LBB30_2
3690 ; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private
3691 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3692 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3693 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3694 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
3695 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3696 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3697 ; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
3698 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3699 ; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
3700 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3701 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3702 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3703 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3704 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3706 ; GCN3-LABEL: flat_atomic_sub_i64_noret:
3708 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3709 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
3710 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
3711 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
3712 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3713 ; GCN3-NEXT: s_cbranch_execnz .LBB30_3
3714 ; GCN3-NEXT: ; %bb.1: ; %Flow
3715 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3716 ; GCN3-NEXT: s_cbranch_execnz .LBB30_4
3717 ; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi
3718 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3719 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3720 ; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global
3721 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3722 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3723 ; GCN3-NEXT: buffer_wbinvl1_vol
3724 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3725 ; GCN3-NEXT: ; implicit-def: $vgpr2
3726 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3727 ; GCN3-NEXT: s_cbranch_execz .LBB30_2
3728 ; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private
3729 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3730 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3731 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3732 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
3733 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3734 ; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
3735 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3736 ; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc
3737 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3738 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
3739 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3740 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3741 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %tmp0 is not used in the lines shown here.
3742 %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
; Test: same as the no-offset, no-return sub test, but the atomic address is
; %out advanced by four i64 elements (32 bytes).
; The expected code on every target adds 32 to the pointer pair v[0:1] with
; an explicit add / add-with-carry before the address-space check; the
; flat_atomic_sub_x2 itself carries no offset operand, including on gfx900.
; After that the code follows the usual split: %atomicrmw.global issues the
; flat atomic, %atomicrmw.private does two dword buffer loads, sub/subb and
; two dword buffer stores (a null pointer selects address -1).
; The assertion lines are autogenerated; regenerate rather than hand-edit.
3746 define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
3747 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset:
3749 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3750 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
3751 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
3752 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
3753 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3754 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3755 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3756 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
3757 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3758 ; GCN1-NEXT: s_cbranch_execnz .LBB31_3
3759 ; GCN1-NEXT: ; %bb.1: ; %Flow
3760 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3761 ; GCN1-NEXT: s_cbranch_execnz .LBB31_4
3762 ; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi
3763 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3764 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3765 ; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global
3766 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3767 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3768 ; GCN1-NEXT: buffer_wbinvl1_vol
3769 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3770 ; GCN1-NEXT: ; implicit-def: $vgpr2
3771 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3772 ; GCN1-NEXT: s_cbranch_execz .LBB31_2
3773 ; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private
3774 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3775 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3776 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3777 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
3778 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3779 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3780 ; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
3781 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3782 ; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
3783 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3784 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3785 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3786 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3787 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3789 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset:
3791 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3792 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
3793 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
3794 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
3795 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
3796 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3797 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
3798 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
3799 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3800 ; GCN2-NEXT: s_cbranch_execnz .LBB31_3
3801 ; GCN2-NEXT: ; %bb.1: ; %Flow
3802 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3803 ; GCN2-NEXT: s_cbranch_execnz .LBB31_4
3804 ; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi
3805 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3806 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3807 ; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global
3808 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3809 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3810 ; GCN2-NEXT: buffer_wbinvl1_vol
3811 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3812 ; GCN2-NEXT: ; implicit-def: $vgpr2
3813 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3814 ; GCN2-NEXT: s_cbranch_execz .LBB31_2
3815 ; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private
3816 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3817 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3818 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3819 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
3820 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
3821 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3822 ; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
3823 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3824 ; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
3825 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3826 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3827 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3828 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3829 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3831 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset:
3833 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3834 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
3835 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3836 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
3837 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
3838 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
3839 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3840 ; GCN3-NEXT: s_cbranch_execnz .LBB31_3
3841 ; GCN3-NEXT: ; %bb.1: ; %Flow
3842 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3843 ; GCN3-NEXT: s_cbranch_execnz .LBB31_4
3844 ; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi
3845 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3846 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3847 ; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global
3848 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
3849 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3850 ; GCN3-NEXT: buffer_wbinvl1_vol
3851 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3852 ; GCN3-NEXT: ; implicit-def: $vgpr2
3853 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3854 ; GCN3-NEXT: s_cbranch_execz .LBB31_2
3855 ; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private
3856 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
3857 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
3858 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
3859 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
3860 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3861 ; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
3862 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3863 ; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc
3864 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
3865 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
3866 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3867 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3868 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is the 32-byte displacement seen in the expected code.
3869 %gep = getelementptr i64, ptr %out, i64 4
; The operation under test; %tmp0 is not used in the lines shown here.
3870 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
; Test: seq_cst 64-bit `atomicrmw sub` through a flat pointer in a function
; declared to return i64.
; Differences from the no-return variant visible in the expected code:
;   * the incoming pointer is copied from v[0:1] to v[4:5], and v[0:1] is
;     marked implicit-def so it can receive the result;
;   * %atomicrmw.global uses the returning form of the instruction,
;     `flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc`;
;   * %atomicrmw.private loads the two dwords into v0/v1 and leaves them
;     there, writing the difference back to memory from v2/v3.
; The address-space check is the same as in the no-return tests: v5 (high
; pointer dword) is compared against a value loaded from 0xe4 on
; bonaire/tonga or against src_private_base on gfx900.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
3874 define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
3875 ; GCN1-LABEL: flat_atomic_sub_i64_ret:
3877 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3878 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
3879 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
3880 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
3881 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
3882 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
3883 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3884 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
3885 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
3886 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3887 ; GCN1-NEXT: s_cbranch_execnz .LBB32_3
3888 ; GCN1-NEXT: ; %bb.1: ; %Flow
3889 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3890 ; GCN1-NEXT: s_cbranch_execnz .LBB32_4
3891 ; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi
3892 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3893 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3894 ; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global
3895 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
3896 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3897 ; GCN1-NEXT: buffer_wbinvl1_vol
3898 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
3899 ; GCN1-NEXT: ; implicit-def: $vgpr2
3900 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3901 ; GCN1-NEXT: s_cbranch_execz .LBB32_2
3902 ; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private
3903 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3904 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3905 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3906 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
3907 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
3908 ; GCN1-NEXT: s_waitcnt vmcnt(1)
3909 ; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
3910 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3911 ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
3912 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3913 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
3914 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
3915 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3916 ; GCN1-NEXT: s_setpc_b64 s[30:31]
3918 ; GCN2-LABEL: flat_atomic_sub_i64_ret:
3920 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3921 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
3922 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
3923 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
3924 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
3925 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
3926 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3927 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
3928 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
3929 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3930 ; GCN2-NEXT: s_cbranch_execnz .LBB32_3
3931 ; GCN2-NEXT: ; %bb.1: ; %Flow
3932 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3933 ; GCN2-NEXT: s_cbranch_execnz .LBB32_4
3934 ; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi
3935 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3936 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3937 ; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global
3938 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
3939 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3940 ; GCN2-NEXT: buffer_wbinvl1_vol
3941 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
3942 ; GCN2-NEXT: ; implicit-def: $vgpr2
3943 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3944 ; GCN2-NEXT: s_cbranch_execz .LBB32_2
3945 ; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private
3946 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3947 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3948 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3949 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
3950 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
3951 ; GCN2-NEXT: s_waitcnt vmcnt(1)
3952 ; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
3953 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3954 ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
3955 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3956 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
3957 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
3958 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3959 ; GCN2-NEXT: s_setpc_b64 s[30:31]
3961 ; GCN3-LABEL: flat_atomic_sub_i64_ret:
3963 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3964 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
3965 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
3966 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
3967 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
3968 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
3969 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
3970 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
3971 ; GCN3-NEXT: s_cbranch_execnz .LBB32_3
3972 ; GCN3-NEXT: ; %bb.1: ; %Flow
3973 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3974 ; GCN3-NEXT: s_cbranch_execnz .LBB32_4
3975 ; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi
3976 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3977 ; GCN3-NEXT: s_setpc_b64 s[30:31]
3978 ; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global
3979 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
3980 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3981 ; GCN3-NEXT: buffer_wbinvl1_vol
3982 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
3983 ; GCN3-NEXT: ; implicit-def: $vgpr2
3984 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
3985 ; GCN3-NEXT: s_cbranch_execz .LBB32_2
3986 ; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private
3987 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
3988 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
3989 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
3990 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
3991 ; GCN3-NEXT: s_waitcnt vmcnt(1)
3992 ; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
3993 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3994 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
3995 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
3996 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
3997 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
3998 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3999 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test.
; NOTE(review): %result is presumably the returned value - confirm against
; the function's `ret`, which is not shown here.
4000 %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
; Test: returning seq_cst 64-bit `atomicrmw sub` where the address is %out
; advanced by four i64 elements (32 bytes).
; In the expected code the 32-byte add writes the adjusted pointer straight
; into v[4:5] (no separate copy), leaving v[0:1] free for the result. The
; flat atomic is the `glc` form with destination v[0:1] and has no offset
; operand on any target. %atomicrmw.private loads the two dwords into v0/v1
; and stores the difference from v2/v3; a null pointer selects address -1.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
4004 define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
4005 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset:
4007 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4008 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
4009 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
4010 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
4011 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4012 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4013 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
4014 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4015 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
4016 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4017 ; GCN1-NEXT: s_cbranch_execnz .LBB33_3
4018 ; GCN1-NEXT: ; %bb.1: ; %Flow
4019 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4020 ; GCN1-NEXT: s_cbranch_execnz .LBB33_4
4021 ; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi
4022 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4023 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4024 ; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global
4025 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4026 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4027 ; GCN1-NEXT: buffer_wbinvl1_vol
4028 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
4029 ; GCN1-NEXT: ; implicit-def: $vgpr2
4030 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4031 ; GCN1-NEXT: s_cbranch_execz .LBB33_2
4032 ; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private
4033 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4034 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4035 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4036 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
4037 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
4038 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4039 ; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
4040 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4041 ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4042 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4043 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4044 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4045 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4046 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4048 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset:
4050 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4051 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
4052 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
4053 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
4054 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4055 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4056 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
4057 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4058 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
4059 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4060 ; GCN2-NEXT: s_cbranch_execnz .LBB33_3
4061 ; GCN2-NEXT: ; %bb.1: ; %Flow
4062 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4063 ; GCN2-NEXT: s_cbranch_execnz .LBB33_4
4064 ; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi
4065 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4066 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4067 ; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global
4068 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4069 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4070 ; GCN2-NEXT: buffer_wbinvl1_vol
4071 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
4072 ; GCN2-NEXT: ; implicit-def: $vgpr2
4073 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4074 ; GCN2-NEXT: s_cbranch_execz .LBB33_2
4075 ; GCN2-NEXT: .LBB33_4: ; %atomicrmw.private
4076 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4077 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4078 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4079 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
4080 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
4081 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4082 ; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
4083 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4084 ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4085 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4086 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4087 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4088 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4089 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4091 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset:
4093 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4094 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
4095 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4096 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
4097 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
4098 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
4099 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
4100 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4101 ; GCN3-NEXT: s_cbranch_execnz .LBB33_3
4102 ; GCN3-NEXT: ; %bb.1: ; %Flow
4103 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4104 ; GCN3-NEXT: s_cbranch_execnz .LBB33_4
4105 ; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi
4106 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4107 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4108 ; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global
4109 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4110 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4111 ; GCN3-NEXT: buffer_wbinvl1_vol
4112 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
4113 ; GCN3-NEXT: ; implicit-def: $vgpr2
4114 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4115 ; GCN3-NEXT: s_cbranch_execz .LBB33_2
4116 ; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private
4117 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4118 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4119 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4120 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4121 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4122 ; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
4123 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4124 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4125 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4126 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
4127 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4128 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4129 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is the 32-byte displacement seen in the expected code.
4130 %gep = getelementptr i64, ptr %out, i64 4
; The operation under test.
; NOTE(review): %result is presumably the returned value - confirm against
; the function's `ret`, which is not shown here.
4131 %result = atomicrmw sub ptr %gep, i64 %in seq_cst
; Test: no-return seq_cst 64-bit `atomicrmw sub` with the amdgpu_gfx calling
; convention and both arguments marked `inreg`.
; In the expected code the pointer is read from s[4:5] and the operand from
; s[6:7]. Because the address is scalar, the address-space test is a scalar
; compare (s_cmp_eq_u32 on s5) followed by vcc branches, not the exec-mask
; save/restore sequence used by the VGPR-argument tests.
;   * %atomicrmw.global copies s4..s7 into v0..v3 and issues
;     flat_atomic_sub_x2.
;   * %atomicrmw.private selects s4, or -1 when s[4:5] is zero, as the buffer
;     address, then does two dword loads, subrev/subb with s6 and a VGPR copy
;     of s7, and two dword stores.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
4135 define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
4136 ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar:
4138 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4139 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
4140 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
4141 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4142 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
4143 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
4144 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4145 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
4146 ; GCN1-NEXT: s_cbranch_vccnz .LBB34_3
4147 ; GCN1-NEXT: ; %bb.1: ; %Flow
4148 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4149 ; GCN1-NEXT: s_cbranch_vccz .LBB34_4
4150 ; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi
4151 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4152 ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global
4153 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4154 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4155 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4156 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
4157 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4158 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4159 ; GCN1-NEXT: buffer_wbinvl1_vol
4160 ; GCN1-NEXT: s_cbranch_execnz .LBB34_2
4161 ; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private
4162 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
4163 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
4164 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
4165 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
4166 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
4167 ; GCN1-NEXT: s_add_i32 s34, s34, 4
4168 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4169 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4170 ; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
4171 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4172 ; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
4173 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4174 ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
4175 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4176 ; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
4177 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4178 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4180 ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar:
4182 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4183 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
4184 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
4185 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4186 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
4187 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
4188 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4189 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
4190 ; GCN2-NEXT: s_cbranch_vccnz .LBB34_3
4191 ; GCN2-NEXT: ; %bb.1: ; %Flow
4192 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4193 ; GCN2-NEXT: s_cbranch_vccz .LBB34_4
4194 ; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi
4195 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4196 ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global
4197 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4198 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4199 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4200 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
4201 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4202 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4203 ; GCN2-NEXT: buffer_wbinvl1_vol
4204 ; GCN2-NEXT: s_cbranch_execnz .LBB34_2
4205 ; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private
4206 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
4207 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
4208 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
4209 ; GCN2-NEXT: s_add_i32 s34, s34, 4
4210 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4211 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
4212 ; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
4213 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
4214 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4215 ; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
4216 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4217 ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
4218 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4219 ; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
4220 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4221 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4223 ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar:
4225 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4226 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
4227 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
4228 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
4229 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4230 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
4231 ; GCN3-NEXT: s_cbranch_vccnz .LBB34_3
4232 ; GCN3-NEXT: ; %bb.1: ; %Flow
4233 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4234 ; GCN3-NEXT: s_cbranch_vccz .LBB34_4
4235 ; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi
4236 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4237 ; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global
4238 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4239 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4240 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4241 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4242 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4243 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4244 ; GCN3-NEXT: buffer_wbinvl1_vol
4245 ; GCN3-NEXT: s_cbranch_execnz .LBB34_2
4246 ; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private
4247 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
4248 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
4249 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
4250 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4251 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
4252 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4253 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4254 ; GCN3-NEXT: v_subrev_co_u32_e32 v1, vcc, s6, v1
4255 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4256 ; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
4257 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4258 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
4259 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4260 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %tmp0 is not used in the lines shown here.
4261 %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
; Test: no-return seq_cst 64-bit `atomicrmw sub` with amdgpu_gfx / `inreg`
; arguments, where the address is %out advanced by four i64 elements
; (32 bytes).
; In the expected code the 32-byte displacement is applied with scalar
; s_add_u32 / s_addc_u32 into s[34:35], and the address-space test compares
; s35 (the adjusted high dword) using a scalar compare and vcc branches.
;   * %atomicrmw.global copies s34/s35 and s6/s7 into v0..v3 and issues
;     flat_atomic_sub_x2 with no offset operand.
;   * %atomicrmw.private selects s34, or -1 when s[34:35] is zero, as the
;     buffer address, then does two dword loads, subrev/subb and two dword
;     stores.
; The assertion lines are autogenerated; regenerate rather than hand-edit.
4265 define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
4266 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
4268 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4269 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
4270 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
4271 ; GCN1-NEXT: s_add_u32 s34, s4, 32
4272 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4273 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4274 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
4275 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
4276 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4277 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
4278 ; GCN1-NEXT: s_cbranch_vccnz .LBB35_3
4279 ; GCN1-NEXT: ; %bb.1: ; %Flow
4280 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4281 ; GCN1-NEXT: s_cbranch_vccz .LBB35_4
4282 ; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi
4283 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4284 ; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global
4285 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
4286 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4287 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
4288 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
4289 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4290 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4291 ; GCN1-NEXT: buffer_wbinvl1_vol
4292 ; GCN1-NEXT: s_cbranch_execnz .LBB35_2
4293 ; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private
4294 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
4295 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
4296 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
4297 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
4298 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
4299 ; GCN1-NEXT: s_add_i32 s34, s34, 4
4300 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4301 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4302 ; GCN1-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
4303 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4304 ; GCN1-NEXT: v_subrev_i32_e32 v1, vcc, s6, v1
4305 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4306 ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
4307 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4308 ; GCN1-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
4309 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4310 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4312 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
4314 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4315 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
4316 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
4317 ; GCN2-NEXT: s_add_u32 s34, s4, 32
4318 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4319 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4320 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
4321 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
4322 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4323 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
4324 ; GCN2-NEXT: s_cbranch_vccnz .LBB35_3
4325 ; GCN2-NEXT: ; %bb.1: ; %Flow
4326 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4327 ; GCN2-NEXT: s_cbranch_vccz .LBB35_4
4328 ; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi
4329 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4330 ; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global
4331 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
4332 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4333 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
4334 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
4335 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4336 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4337 ; GCN2-NEXT: buffer_wbinvl1_vol
4338 ; GCN2-NEXT: s_cbranch_execnz .LBB35_2
4339 ; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private
4340 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
4341 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
4342 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
4343 ; GCN2-NEXT: s_add_i32 s34, s34, 4
4344 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4345 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
4346 ; GCN2-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
4347 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
4348 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4349 ; GCN2-NEXT: v_subrev_u32_e32 v1, vcc, s6, v1
4350 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4351 ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
4352 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4353 ; GCN2-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
4354 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4355 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4357 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
4359 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4360 ; GCN3-NEXT: s_add_u32 s34, s4, 32
4361 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
4362 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
4363 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
4364 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
4365 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4366 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
4367 ; GCN3-NEXT: s_cbranch_vccnz .LBB35_3
4368 ; GCN3-NEXT: ; %bb.1: ; %Flow
4369 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4370 ; GCN3-NEXT: s_cbranch_vccz .LBB35_4
4371 ; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi
4372 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4373 ; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global
4374 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
4375 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4376 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
4377 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4378 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4379 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4380 ; GCN3-NEXT: buffer_wbinvl1_vol
4381 ; GCN3-NEXT: s_cbranch_execnz .LBB35_2
4382 ; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private
4383 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
4384 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
4385 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
4386 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4387 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
4388 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4389 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4390 ; GCN3-NEXT: v_subrev_co_u32_e32 v1, vcc, s6, v1
4391 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4392 ; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
4393 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4394 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
4395 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4396 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is the 32-byte displacement seen in the expected code.
4397 %gep = getelementptr i64, ptr %out, i64 4
; The operation under test; %tmp0 is not used in the lines shown here.
4398 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
; Returning 64-bit `atomicrmw sub` (seq_cst) through a flat pointer, using the
; amdgpu_gfx calling convention with both the pointer and the operand marked
; inreg. In the expected output they are read from s[4:5] and s[6:7].
;
; For all three targets (GCN1 = bonaire, GCN2 = tonga, GCN3 = gfx900) the
; expected code compares the high dword of the pointer (s5) against either a
; dword fetched with s_load_dword from address 0xe4 (GCN1 and GCN2) or the
; high half of src_private_base (GCN3), and then takes one of two paths:
;   %atomicrmw.global  - one flat_atomic_sub_x2 with glc, result in v[0:1].
;   %atomicrmw.private - two buffer_load_dword, a subtract with borrow, and
;                        two buffer_store_dword; the loaded (pre-subtract)
;                        dwords stay in v[0:1].
4402 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
4403 ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar:
4405 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4406 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
4407 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
4408 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4409 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
4410 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
4411 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4412 ; GCN1-NEXT: s_cbranch_vccz .LBB36_2
4413 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
4414 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4415 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4416 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4417 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
4418 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4419 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4420 ; GCN1-NEXT: buffer_wbinvl1_vol
4421 ; GCN1-NEXT: s_cbranch_execz .LBB36_3
4422 ; GCN1-NEXT: s_branch .LBB36_4
4423 ; GCN1-NEXT: .LBB36_2:
4424 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4425 ; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private
4426 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
4427 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
4428 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
4429 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
4430 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4431 ; GCN1-NEXT: s_add_i32 s34, s34, 4
4432 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4433 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
4434 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
4435 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4436 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0
4437 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4438 ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
4439 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
4440 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
4441 ; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end
4442 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4443 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4445 ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar:
4447 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4448 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
4449 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
4450 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4451 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
4452 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
4453 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4454 ; GCN2-NEXT: s_cbranch_vccz .LBB36_2
4455 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
4456 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4457 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4458 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4459 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
4460 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4461 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4462 ; GCN2-NEXT: buffer_wbinvl1_vol
4463 ; GCN2-NEXT: s_cbranch_execz .LBB36_3
4464 ; GCN2-NEXT: s_branch .LBB36_4
4465 ; GCN2-NEXT: .LBB36_2:
4466 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4467 ; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private
4468 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
4469 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
4470 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
4471 ; GCN2-NEXT: s_add_i32 s34, s34, 4
4472 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4473 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
4474 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
4475 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
4476 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4477 ; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0
4478 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4479 ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
4480 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
4481 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
4482 ; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end
4483 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4484 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4486 ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar:
4488 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4489 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
4490 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
4491 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
4492 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
4493 ; GCN3-NEXT: s_cbranch_vccz .LBB36_2
4494 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
4495 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4496 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4497 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4498 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4499 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4500 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4501 ; GCN3-NEXT: buffer_wbinvl1_vol
4502 ; GCN3-NEXT: s_cbranch_execz .LBB36_3
4503 ; GCN3-NEXT: s_branch .LBB36_4
4504 ; GCN3-NEXT: .LBB36_2:
4505 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
4506 ; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private
4507 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
4508 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
4509 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
4510 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4511 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
4512 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4513 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4514 ; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
4515 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4516 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4517 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
4518 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
4519 ; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end
4520 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4521 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4522 %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
; Returning 64-bit `atomicrmw sub` (seq_cst) through a flat pointer with a
; constant offset, using the amdgpu_gfx calling convention and inreg
; arguments (read from s[4:5] and s[6:7] in the expected output).
;
; The getelementptr indexes i64 element 4, so the expected code first forms
; the address as base + 32 bytes with s_add_u32 / s_addc_u32 into s[34:35].
; The high dword of that sum (s35) is then compared against either a dword
; fetched with s_load_dword from address 0xe4 (GCN1 and GCN2) or the high
; half of src_private_base (GCN3), selecting one of two paths:
;   %atomicrmw.global  - one flat_atomic_sub_x2 with glc, result in v[0:1].
;   %atomicrmw.private - two buffer_load_dword, a subtract with borrow, and
;                        two buffer_store_dword; the loaded (pre-subtract)
;                        dwords stay in v[0:1].
4526 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
4527 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
4529 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4530 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
4531 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
4532 ; GCN1-NEXT: s_add_u32 s34, s4, 32
4533 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
4534 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4535 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
4536 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
4537 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4538 ; GCN1-NEXT: s_cbranch_vccz .LBB37_2
4539 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
4540 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
4541 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4542 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
4543 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
4544 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4545 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4546 ; GCN1-NEXT: buffer_wbinvl1_vol
4547 ; GCN1-NEXT: s_cbranch_execz .LBB37_3
4548 ; GCN1-NEXT: s_branch .LBB37_4
4549 ; GCN1-NEXT: .LBB37_2:
4550 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4551 ; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private
4552 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
4553 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
4554 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
4555 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
4556 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
4557 ; GCN1-NEXT: s_add_i32 s34, s34, 4
4558 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4559 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
4560 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
4561 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4562 ; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0
4563 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4564 ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
4565 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
4566 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
4567 ; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end
4568 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4569 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4571 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
4573 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4574 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
4575 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
4576 ; GCN2-NEXT: s_add_u32 s34, s4, 32
4577 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
4578 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4579 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
4580 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
4581 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4582 ; GCN2-NEXT: s_cbranch_vccz .LBB37_2
4583 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
4584 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
4585 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4586 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
4587 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
4588 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4589 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4590 ; GCN2-NEXT: buffer_wbinvl1_vol
4591 ; GCN2-NEXT: s_cbranch_execz .LBB37_3
4592 ; GCN2-NEXT: s_branch .LBB37_4
4593 ; GCN2-NEXT: .LBB37_2:
4594 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4595 ; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private
4596 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
4597 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
4598 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
4599 ; GCN2-NEXT: s_add_i32 s34, s34, 4
4600 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4601 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
4602 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
4603 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
4604 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4605 ; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v0
4606 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4607 ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
4608 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
4609 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
4610 ; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end
4611 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4612 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4614 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
4616 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4617 ; GCN3-NEXT: s_add_u32 s34, s4, 32
4618 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
4619 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
4620 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
4621 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
4622 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
4623 ; GCN3-NEXT: s_cbranch_vccz .LBB37_2
4624 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
4625 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
4626 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4627 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
4628 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4629 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
4630 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4631 ; GCN3-NEXT: buffer_wbinvl1_vol
4632 ; GCN3-NEXT: s_cbranch_execz .LBB37_3
4633 ; GCN3-NEXT: s_branch .LBB37_4
4634 ; GCN3-NEXT: .LBB37_2:
4635 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
4636 ; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private
4637 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
4638 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
4639 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
4640 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
4641 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
4642 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
4643 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4644 ; GCN3-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
4645 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4646 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4647 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
4648 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
4649 ; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end
4650 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4651 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4652 %gep = getelementptr i64, ptr %out, i64 4
4653 %result = atomicrmw sub ptr %gep, i64 %in seq_cst
; Non-returning 64-bit `atomicrmw sub` (seq_cst) through a flat pointer with a
; constant offset, where the atomicrmw carries !amdgpu.no.remote.memory
; metadata (node !0, which is not defined inside this function).
;
; Arguments use the default calling convention; the expected output reads the
; pointer from v[0:1] and the operand from v[2:3]. The getelementptr indexes
; i64 element 4, which shows up as an add of 32 to the pointer.
;
; The expected code compares the high dword of the address (v1) against either
; a dword fetched with s_load_dword from address 0xe4 (GCN1 and GCN2) or the
; high half of src_private_base (GCN3), and uses exec masking
; (s_and_saveexec_b64 / s_andn2_saveexec_b64) to run two paths:
;   %atomicrmw.global  - one flat_atomic_sub_x2 without glc.
;   %atomicrmw.private - two buffer_load_dword, a subtract with borrow, and
;                        two buffer_store_dword.
4657 define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
4658 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
4660 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4661 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
4662 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
4663 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
4664 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4665 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4666 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
4667 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
4668 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4669 ; GCN1-NEXT: s_cbranch_execnz .LBB38_3
4670 ; GCN1-NEXT: ; %bb.1: ; %Flow
4671 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4672 ; GCN1-NEXT: s_cbranch_execnz .LBB38_4
4673 ; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi
4674 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4675 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4676 ; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global
4677 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4678 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4679 ; GCN1-NEXT: buffer_wbinvl1_vol
4680 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4681 ; GCN1-NEXT: ; implicit-def: $vgpr2
4682 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4683 ; GCN1-NEXT: s_cbranch_execz .LBB38_2
4684 ; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private
4685 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4686 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
4687 ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4688 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
4689 ; GCN1-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
4690 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4691 ; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
4692 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4693 ; GCN1-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
4694 ; GCN1-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4695 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4696 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4697 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4698 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4700 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
4702 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4703 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
4704 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
4705 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
4706 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4707 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4708 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
4709 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
4710 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4711 ; GCN2-NEXT: s_cbranch_execnz .LBB38_3
4712 ; GCN2-NEXT: ; %bb.1: ; %Flow
4713 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4714 ; GCN2-NEXT: s_cbranch_execnz .LBB38_4
4715 ; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi
4716 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4717 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4718 ; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global
4719 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4720 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4721 ; GCN2-NEXT: buffer_wbinvl1_vol
4722 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4723 ; GCN2-NEXT: ; implicit-def: $vgpr2
4724 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4725 ; GCN2-NEXT: s_cbranch_execz .LBB38_2
4726 ; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private
4727 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4728 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
4729 ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4730 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
4731 ; GCN2-NEXT: buffer_load_dword v5, v4, s[0:3], 0 offen
4732 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4733 ; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v2
4734 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4735 ; GCN2-NEXT: v_subb_u32_e32 v2, vcc, v5, v3, vcc
4736 ; GCN2-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4737 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4738 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4739 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4740 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4742 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
4744 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4745 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
4746 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4747 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
4748 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
4749 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
4750 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4751 ; GCN3-NEXT: s_cbranch_execnz .LBB38_3
4752 ; GCN3-NEXT: ; %bb.1: ; %Flow
4753 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4754 ; GCN3-NEXT: s_cbranch_execnz .LBB38_4
4755 ; GCN3-NEXT: .LBB38_2: ; %atomicrmw.phi
4756 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4757 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4758 ; GCN3-NEXT: .LBB38_3: ; %atomicrmw.global
4759 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
4760 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4761 ; GCN3-NEXT: buffer_wbinvl1_vol
4762 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
4763 ; GCN3-NEXT: ; implicit-def: $vgpr2
4764 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4765 ; GCN3-NEXT: s_cbranch_execz .LBB38_2
4766 ; GCN3-NEXT: .LBB38_4: ; %atomicrmw.private
4767 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4768 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
4769 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
4770 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
4771 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4772 ; GCN3-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
4773 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4774 ; GCN3-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v3, vcc
4775 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
4776 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
4777 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4778 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4779 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4780 %gep = getelementptr i64, ptr %out, i64 4
4781 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning 64-bit `atomicrmw sub` (seq_cst) through a flat pointer with a
; constant offset, where the atomicrmw carries !amdgpu.no.remote.memory
; metadata (node !0, which is not defined inside this function).
;
; Arguments use the default calling convention; the expected output reads the
; pointer from v[0:1] and the operand from v[2:3]. The getelementptr indexes
; i64 element 4, which shows up as an add of 32 whose result is placed in
; v[4:5], keeping v[0:1] free for the result.
;
; The expected code compares the high dword of the address (v5) against either
; a dword fetched with s_load_dword from address 0xe4 (GCN1 and GCN2) or the
; high half of src_private_base (GCN3), and uses exec masking
; (s_and_saveexec_b64 / s_andn2_saveexec_b64) to run two paths:
;   %atomicrmw.global  - one flat_atomic_sub_x2 with glc, result in v[0:1].
;   %atomicrmw.private - two buffer_load_dword into v0 and v1, a subtract with
;                        borrow into v2 and v3, and two buffer_store_dword.
4785 define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
4786 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
4788 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4789 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
4790 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
4791 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
4792 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4793 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4794 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
4795 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4796 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
4797 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4798 ; GCN1-NEXT: s_cbranch_execnz .LBB39_3
4799 ; GCN1-NEXT: ; %bb.1: ; %Flow
4800 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4801 ; GCN1-NEXT: s_cbranch_execnz .LBB39_4
4802 ; GCN1-NEXT: .LBB39_2: ; %atomicrmw.phi
4803 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4804 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4805 ; GCN1-NEXT: .LBB39_3: ; %atomicrmw.global
4806 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4807 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4808 ; GCN1-NEXT: buffer_wbinvl1_vol
4809 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
4810 ; GCN1-NEXT: ; implicit-def: $vgpr2
4811 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4812 ; GCN1-NEXT: s_cbranch_execz .LBB39_2
4813 ; GCN1-NEXT: .LBB39_4: ; %atomicrmw.private
4814 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4815 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4816 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4817 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
4818 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
4819 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4820 ; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
4821 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4822 ; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4823 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4824 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4825 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4826 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4827 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4829 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
4831 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4832 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
4833 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
4834 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
4835 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
4836 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4837 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
4838 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4839 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
4840 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4841 ; GCN2-NEXT: s_cbranch_execnz .LBB39_3
4842 ; GCN2-NEXT: ; %bb.1: ; %Flow
4843 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4844 ; GCN2-NEXT: s_cbranch_execnz .LBB39_4
4845 ; GCN2-NEXT: .LBB39_2: ; %atomicrmw.phi
4846 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4847 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4848 ; GCN2-NEXT: .LBB39_3: ; %atomicrmw.global
4849 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4850 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4851 ; GCN2-NEXT: buffer_wbinvl1_vol
4852 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
4853 ; GCN2-NEXT: ; implicit-def: $vgpr2
4854 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4855 ; GCN2-NEXT: s_cbranch_execz .LBB39_2
4856 ; GCN2-NEXT: .LBB39_4: ; %atomicrmw.private
4857 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4858 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4859 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4860 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
4861 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
4862 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4863 ; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
4864 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4865 ; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
4866 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4867 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
4868 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4869 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4870 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4872 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
4874 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4875 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
4876 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4877 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
4878 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
4879 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
4880 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
4881 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4882 ; GCN3-NEXT: s_cbranch_execnz .LBB39_3
4883 ; GCN3-NEXT: ; %bb.1: ; %Flow
4884 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4885 ; GCN3-NEXT: s_cbranch_execnz .LBB39_4
4886 ; GCN3-NEXT: .LBB39_2: ; %atomicrmw.phi
4887 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4888 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4889 ; GCN3-NEXT: .LBB39_3: ; %atomicrmw.global
4890 ; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
4891 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4892 ; GCN3-NEXT: buffer_wbinvl1_vol
4893 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
4894 ; GCN3-NEXT: ; implicit-def: $vgpr2
4895 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4896 ; GCN3-NEXT: s_cbranch_execz .LBB39_2
4897 ; GCN3-NEXT: .LBB39_4: ; %atomicrmw.private
4898 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
4899 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
4900 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
4901 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
4902 ; GCN3-NEXT: s_waitcnt vmcnt(1)
4903 ; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
4904 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4905 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4906 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
4907 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
4908 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
4909 ; GCN3-NEXT: s_waitcnt vmcnt(0)
4910 ; GCN3-NEXT: s_setpc_b64 s[30:31]
4911 %gep = getelementptr i64, ptr %out, i64 4
4912 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
4916 ; ---------------------------------------------------------------------
4918 ; ---------------------------------------------------------------------
; Non-returning 64-bit `atomicrmw and` (seq_cst) through a flat pointer with
; no offset. Arguments use the default calling convention; the expected
; output reads the pointer from v[0:1] and the operand from v[2:3].
;
; The expected code compares the high dword of the pointer (v1) against either
; a dword fetched with s_load_dword from address 0xe4 (GCN1 and GCN2) or the
; high half of src_private_base (GCN3), and uses exec masking
; (s_and_saveexec_b64 / s_andn2_saveexec_b64) to run two paths:
;   %atomicrmw.global  - one flat_atomic_and_x2 without glc.
;   %atomicrmw.private - two buffer_load_dword, one v_and_b32 per dword, and
;                        two buffer_store_dword. On GCN3 the expected order
;                        loads the dword at offset:4 first.
4920 define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
4921 ; GCN1-LABEL: flat_atomic_and_i64_noret:
4923 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4924 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
4925 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
4926 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4927 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
4928 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
4929 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4930 ; GCN1-NEXT: s_cbranch_execnz .LBB40_3
4931 ; GCN1-NEXT: ; %bb.1: ; %Flow
4932 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4933 ; GCN1-NEXT: s_cbranch_execnz .LBB40_4
4934 ; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi
4935 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4936 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4937 ; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global
4938 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
4939 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4940 ; GCN1-NEXT: buffer_wbinvl1_vol
4941 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
4942 ; GCN1-NEXT: ; implicit-def: $vgpr3
4943 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4944 ; GCN1-NEXT: s_cbranch_execz .LBB40_2
4945 ; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private
4946 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4947 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
4948 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
4949 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
4950 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
4951 ; GCN1-NEXT: s_waitcnt vmcnt(1)
4952 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
4953 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4954 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
4955 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
4956 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
4957 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
4958 ; GCN1-NEXT: s_waitcnt vmcnt(0)
4959 ; GCN1-NEXT: s_setpc_b64 s[30:31]
4961 ; GCN2-LABEL: flat_atomic_and_i64_noret:
4963 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4964 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
4965 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
4966 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4967 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
4968 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
4969 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
4970 ; GCN2-NEXT: s_cbranch_execnz .LBB40_3
4971 ; GCN2-NEXT: ; %bb.1: ; %Flow
4972 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4973 ; GCN2-NEXT: s_cbranch_execnz .LBB40_4
4974 ; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi
4975 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4976 ; GCN2-NEXT: s_setpc_b64 s[30:31]
4977 ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global
4978 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
4979 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4980 ; GCN2-NEXT: buffer_wbinvl1_vol
4981 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
4982 ; GCN2-NEXT: ; implicit-def: $vgpr3
4983 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
4984 ; GCN2-NEXT: s_cbranch_execz .LBB40_2
4985 ; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private
4986 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
4987 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
4988 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
4989 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
4990 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
4991 ; GCN2-NEXT: s_waitcnt vmcnt(1)
4992 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
4993 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4994 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
4995 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
4996 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
4997 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
4998 ; GCN2-NEXT: s_waitcnt vmcnt(0)
4999 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5001 ; GCN3-LABEL: flat_atomic_and_i64_noret:
5003 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5004 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
5005 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
5006 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
5007 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5008 ; GCN3-NEXT: s_cbranch_execnz .LBB40_3
5009 ; GCN3-NEXT: ; %bb.1: ; %Flow
5010 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5011 ; GCN3-NEXT: s_cbranch_execnz .LBB40_4
5012 ; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi
5013 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5014 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5015 ; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global
5016 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5017 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5018 ; GCN3-NEXT: buffer_wbinvl1_vol
5019 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5020 ; GCN3-NEXT: ; implicit-def: $vgpr3
5021 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5022 ; GCN3-NEXT: s_cbranch_execz .LBB40_2
5023 ; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private
5024 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5025 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
5026 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
5027 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
5028 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5029 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
5030 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5031 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
5032 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5033 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
5034 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5035 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5036 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5037 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
; Non-returning 64-bit seq_cst `atomicrmw and` through a plain `ptr` argument,
; applied at %out + 4 x i64. The result of the atomicrmw is not used.
; For all three RUN lines the expected code tests the high dword of the address
; and then executes either the %atomicrmw.global block (flat_atomic_and_x2) or
; the %atomicrmw.private block (two dword buffer loads, two v_and_b32, two dword
; buffer stores) under the saved exec mask.
; NOTE(review): the value the address is compared against is presumably the
; private aperture base (s_load_dword from 0xe4 on bonaire/tonga,
; src_private_base on gfx900) - confirm.
5041 define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
5042 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset:
5044 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5045 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
5046 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
5047 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
5048 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5049 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5050 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
5051 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
5052 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5053 ; GCN1-NEXT: s_cbranch_execnz .LBB41_3
5054 ; GCN1-NEXT: ; %bb.1: ; %Flow
5055 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5056 ; GCN1-NEXT: s_cbranch_execnz .LBB41_4
5057 ; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi
5058 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5059 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5060 ; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global
5061 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5062 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5063 ; GCN1-NEXT: buffer_wbinvl1_vol
5064 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5065 ; GCN1-NEXT: ; implicit-def: $vgpr3
5066 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5067 ; GCN1-NEXT: s_cbranch_execz .LBB41_2
5068 ; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private
5069 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5070 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
5071 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
5072 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
5073 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
5074 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5075 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
5076 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5077 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
5078 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5079 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5080 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5081 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5082 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5084 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset:
5086 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5087 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
5088 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
5089 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
5090 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5091 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5092 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
5093 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
5094 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5095 ; GCN2-NEXT: s_cbranch_execnz .LBB41_3
5096 ; GCN2-NEXT: ; %bb.1: ; %Flow
5097 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5098 ; GCN2-NEXT: s_cbranch_execnz .LBB41_4
5099 ; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi
5100 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5101 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5102 ; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global
5103 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5104 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5105 ; GCN2-NEXT: buffer_wbinvl1_vol
5106 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
5107 ; GCN2-NEXT: ; implicit-def: $vgpr3
5108 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5109 ; GCN2-NEXT: s_cbranch_execz .LBB41_2
5110 ; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private
5111 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5112 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
5113 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
5114 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
5115 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
5116 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5117 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
5118 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5119 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
5120 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5121 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5122 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5123 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5124 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5126 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset:
5128 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5129 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
5130 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
5131 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
5132 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
5133 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
5134 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5135 ; GCN3-NEXT: s_cbranch_execnz .LBB41_3
5136 ; GCN3-NEXT: ; %bb.1: ; %Flow
5137 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5138 ; GCN3-NEXT: s_cbranch_execnz .LBB41_4
5139 ; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi
5140 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5141 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5142 ; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global
5143 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5144 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5145 ; GCN3-NEXT: buffer_wbinvl1_vol
5146 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5147 ; GCN3-NEXT: ; implicit-def: $vgpr3
5148 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5149 ; GCN3-NEXT: s_cbranch_execz .LBB41_2
5150 ; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private
5151 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5152 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
5153 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
5154 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
5155 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5156 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
5157 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5158 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
5159 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5160 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
5161 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5162 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5163 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is 32 bytes past %out; this is the constant 32 that the
; expected code adds to v[0:1] before the address test.
5164 %gep = getelementptr i64, ptr %out, i64 4
5165 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
; Returning 64-bit seq_cst `atomicrmw and` through a plain `ptr` argument, applied
; directly at %ptr (no offset). The function's return type is i64.
; The expected code first copies the incoming address from v[0:1] to v[4:5], so
; that v[0:1] can receive the loaded value on both paths:
;   - %atomicrmw.global: flat_atomic_and_x2 with glc writes its result to v[0:1];
;   - %atomicrmw.private: the two dword buffer loads target v0 and v1, and the
;     and-ed values are written back from v2/v3.
; NOTE(review): the value the address high dword is compared against is
; presumably the private aperture base - confirm.
5169 define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
5170 ; GCN1-LABEL: flat_atomic_and_i64_ret:
5172 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5173 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
5174 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
5175 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
5176 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
5177 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5178 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5179 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
5180 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
5181 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5182 ; GCN1-NEXT: s_cbranch_execnz .LBB42_3
5183 ; GCN1-NEXT: ; %bb.1: ; %Flow
5184 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5185 ; GCN1-NEXT: s_cbranch_execnz .LBB42_4
5186 ; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi
5187 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5188 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5189 ; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global
5190 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5191 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5192 ; GCN1-NEXT: buffer_wbinvl1_vol
5193 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
5194 ; GCN1-NEXT: ; implicit-def: $vgpr3
5195 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5196 ; GCN1-NEXT: s_cbranch_execz .LBB42_2
5197 ; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private
5198 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5199 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5200 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
5201 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5202 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
5203 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5204 ; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
5205 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5206 ; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
5207 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5208 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
5209 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5210 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5211 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5213 ; GCN2-LABEL: flat_atomic_and_i64_ret:
5215 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5216 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
5217 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
5218 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
5219 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
5220 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
5221 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5222 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
5223 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
5224 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5225 ; GCN2-NEXT: s_cbranch_execnz .LBB42_3
5226 ; GCN2-NEXT: ; %bb.1: ; %Flow
5227 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5228 ; GCN2-NEXT: s_cbranch_execnz .LBB42_4
5229 ; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi
5230 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5231 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5232 ; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global
5233 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5234 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5235 ; GCN2-NEXT: buffer_wbinvl1_vol
5236 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
5237 ; GCN2-NEXT: ; implicit-def: $vgpr3
5238 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5239 ; GCN2-NEXT: s_cbranch_execz .LBB42_2
5240 ; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private
5241 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5242 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5243 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
5244 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5245 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
5246 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5247 ; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
5248 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5249 ; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
5250 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5251 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
5252 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5253 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5254 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5256 ; GCN3-LABEL: flat_atomic_and_i64_ret:
5258 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5259 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
5260 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
5261 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
5262 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
5263 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5264 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
5265 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5266 ; GCN3-NEXT: s_cbranch_execnz .LBB42_3
5267 ; GCN3-NEXT: ; %bb.1: ; %Flow
5268 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5269 ; GCN3-NEXT: s_cbranch_execnz .LBB42_4
5270 ; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi
5271 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5272 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5273 ; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global
5274 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5275 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5276 ; GCN3-NEXT: buffer_wbinvl1_vol
5277 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
5278 ; GCN3-NEXT: ; implicit-def: $vgpr3
5279 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5280 ; GCN3-NEXT: s_cbranch_execz .LBB42_2
5281 ; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private
5282 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5283 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5284 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5285 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5286 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5287 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
5288 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5289 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
5290 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5291 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
5292 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5293 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5294 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %result holds the value loaded by the atomicrmw.
5295 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
; Returning 64-bit seq_cst `atomicrmw and` through a plain `ptr` argument, applied
; at %out + 4 x i64. The function's return type is i64.
; The expected code forms the offset address in v[4:5] (incoming v[0:1] plus 32),
; leaving v[0:1] free for the loaded value on both paths:
;   - %atomicrmw.global: flat_atomic_and_x2 with glc writes its result to v[0:1];
;   - %atomicrmw.private: the two dword buffer loads target v0 and v1, and the
;     and-ed values are written back from v2/v3.
; NOTE(review): the value the address high dword is compared against is
; presumably the private aperture base - confirm.
5299 define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
5300 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset:
5302 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5303 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
5304 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
5305 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
5306 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5307 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5308 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
5309 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5310 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
5311 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5312 ; GCN1-NEXT: s_cbranch_execnz .LBB43_3
5313 ; GCN1-NEXT: ; %bb.1: ; %Flow
5314 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5315 ; GCN1-NEXT: s_cbranch_execnz .LBB43_4
5316 ; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi
5317 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5318 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5319 ; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global
5320 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5321 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5322 ; GCN1-NEXT: buffer_wbinvl1_vol
5323 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
5324 ; GCN1-NEXT: ; implicit-def: $vgpr3
5325 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5326 ; GCN1-NEXT: s_cbranch_execz .LBB43_2
5327 ; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private
5328 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5329 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5330 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
5331 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5332 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
5333 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5334 ; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
5335 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5336 ; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
5337 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5338 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
5339 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5340 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5341 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5343 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset:
5345 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5346 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
5347 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
5348 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
5349 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
5350 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5351 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
5352 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
5353 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
5354 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5355 ; GCN2-NEXT: s_cbranch_execnz .LBB43_3
5356 ; GCN2-NEXT: ; %bb.1: ; %Flow
5357 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5358 ; GCN2-NEXT: s_cbranch_execnz .LBB43_4
5359 ; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi
5360 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5361 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5362 ; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global
5363 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5364 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5365 ; GCN2-NEXT: buffer_wbinvl1_vol
5366 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
5367 ; GCN2-NEXT: ; implicit-def: $vgpr3
5368 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5369 ; GCN2-NEXT: s_cbranch_execz .LBB43_2
5370 ; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private
5371 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5372 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5373 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
5374 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5375 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
5376 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5377 ; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
5378 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5379 ; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
5380 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5381 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
5382 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
5383 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5384 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5386 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset:
5388 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5389 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
5390 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
5391 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
5392 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
5393 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5394 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
5395 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5396 ; GCN3-NEXT: s_cbranch_execnz .LBB43_3
5397 ; GCN3-NEXT: ; %bb.1: ; %Flow
5398 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5399 ; GCN3-NEXT: s_cbranch_execnz .LBB43_4
5400 ; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi
5401 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5402 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5403 ; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global
5404 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
5405 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5406 ; GCN3-NEXT: buffer_wbinvl1_vol
5407 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
5408 ; GCN3-NEXT: ; implicit-def: $vgpr3
5409 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5410 ; GCN3-NEXT: s_cbranch_execz .LBB43_2
5411 ; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private
5412 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
5413 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
5414 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
5415 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
5416 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5417 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
5418 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5419 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
5420 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
5421 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
5422 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
5423 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5424 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is 32 bytes past %out; this is the constant 32 that the
; expected code adds when forming v[4:5].
5425 %gep = getelementptr i64, ptr %out, i64 4
5426 %result = atomicrmw and ptr %gep, i64 %in seq_cst
; Non-returning 64-bit seq_cst `atomicrmw and` with both arguments marked `inreg`
; under the amdgpu_gfx calling convention, applied directly at %ptr.
; In the expected code the pointer is read from s[4:5] and the operand from
; s[6:7]. The address test is a scalar compare (s_cmp_eq_u32) followed by
; s_cbranch_vcc* branches instead of exec-mask manipulation:
;   - %atomicrmw.global copies the SGPRs into v[0:3] and issues flat_atomic_and_x2;
;   - %atomicrmw.private loads two dwords, ands them with s6/s7 and stores them back.
; The null-pointer select in the private block differs by target: bonaire uses
; v_cmp_ne_u64_e64 + s_and_b64 + s_cselect_b32, tonga and gfx900 use s_cmp_lg_u64.
; NOTE(review): the value the address high dword is compared against is
; presumably the private aperture base - confirm.
5430 define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
5431 ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar:
5433 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5434 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
5435 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
5436 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5437 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
5438 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
5439 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5440 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
5441 ; GCN1-NEXT: s_cbranch_vccnz .LBB44_3
5442 ; GCN1-NEXT: ; %bb.1: ; %Flow
5443 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5444 ; GCN1-NEXT: s_cbranch_vccz .LBB44_4
5445 ; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi
5446 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5447 ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
5448 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5449 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
5450 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5451 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
5452 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5453 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5454 ; GCN1-NEXT: buffer_wbinvl1_vol
5455 ; GCN1-NEXT: s_cbranch_execnz .LBB44_2
5456 ; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private
5457 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
5458 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
5459 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
5460 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
5461 ; GCN1-NEXT: s_add_i32 s34, s34, 4
5462 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
5463 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5464 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
5465 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5466 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
5467 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5468 ; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
5469 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5470 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5471 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5472 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5474 ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar:
5476 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5477 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
5478 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
5479 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5480 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
5481 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
5482 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5483 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
5484 ; GCN2-NEXT: s_cbranch_vccnz .LBB44_3
5485 ; GCN2-NEXT: ; %bb.1: ; %Flow
5486 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5487 ; GCN2-NEXT: s_cbranch_vccz .LBB44_4
5488 ; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi
5489 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5490 ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
5491 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5492 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
5493 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5494 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
5495 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5496 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5497 ; GCN2-NEXT: buffer_wbinvl1_vol
5498 ; GCN2-NEXT: s_cbranch_execnz .LBB44_2
5499 ; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private
5500 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
5501 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
5502 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
5503 ; GCN2-NEXT: s_add_i32 s34, s34, 4
5504 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
5505 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5506 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
5507 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5508 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
5509 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5510 ; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
5511 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5512 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5513 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5514 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5516 ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar:
5518 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5519 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
5520 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
5521 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
5522 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5523 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
5524 ; GCN3-NEXT: s_cbranch_vccnz .LBB44_3
5525 ; GCN3-NEXT: ; %bb.1: ; %Flow
5526 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5527 ; GCN3-NEXT: s_cbranch_vccz .LBB44_4
5528 ; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi
5529 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5530 ; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global
5531 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5532 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
5533 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5534 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
5535 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5536 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5537 ; GCN3-NEXT: buffer_wbinvl1_vol
5538 ; GCN3-NEXT: s_cbranch_execnz .LBB44_2
5539 ; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private
5540 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
5541 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
5542 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
5543 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
5544 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5545 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5546 ; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
5547 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5548 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
5549 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5550 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
5551 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5552 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %tmp0 is not used by any other instruction shown here.
5553 %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
; Non-returning 64-bit seq_cst `atomicrmw and` with both arguments marked `inreg`
; under the amdgpu_gfx calling convention, applied at %out + 4 x i64.
; In the expected code the offset address is formed in s[34:35] with
; s_add_u32/s_addc_u32 (s[4:5] plus 32) and the operand is read from s[6:7].
; The address test is a scalar compare followed by s_cbranch_vcc* branches:
;   - %atomicrmw.global copies s[34:35] and s[6:7] into v[0:3] and issues
;     flat_atomic_and_x2;
;   - %atomicrmw.private loads two dwords, ands them with s6/s7 and stores them back.
; The null-pointer select in the private block differs by target: bonaire uses
; v_cmp_ne_u64_e64 + s_and_b64 + s_cselect_b32, tonga and gfx900 use s_cmp_lg_u64.
; NOTE(review): the value the address high dword is compared against is
; presumably the private aperture base - confirm.
5557 define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
5558 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar:
5560 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5561 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
5562 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
5563 ; GCN1-NEXT: s_add_u32 s34, s4, 32
5564 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5565 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5566 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
5567 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
5568 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5569 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
5570 ; GCN1-NEXT: s_cbranch_vccnz .LBB45_3
5571 ; GCN1-NEXT: ; %bb.1: ; %Flow
5572 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5573 ; GCN1-NEXT: s_cbranch_vccz .LBB45_4
5574 ; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi
5575 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5576 ; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global
5577 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
5578 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
5579 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
5580 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
5581 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5582 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5583 ; GCN1-NEXT: buffer_wbinvl1_vol
5584 ; GCN1-NEXT: s_cbranch_execnz .LBB45_2
5585 ; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private
5586 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
5587 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
5588 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
5589 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
5590 ; GCN1-NEXT: s_add_i32 s34, s34, 4
5591 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
5592 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5593 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
5594 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5595 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
5596 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5597 ; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
5598 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5599 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5600 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5601 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5603 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar:
5605 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5606 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
5607 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
5608 ; GCN2-NEXT: s_add_u32 s34, s4, 32
5609 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5610 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5611 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
5612 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
5613 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5614 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
5615 ; GCN2-NEXT: s_cbranch_vccnz .LBB45_3
5616 ; GCN2-NEXT: ; %bb.1: ; %Flow
5617 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5618 ; GCN2-NEXT: s_cbranch_vccz .LBB45_4
5619 ; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi
5620 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5621 ; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global
5622 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
5623 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
5624 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
5625 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
5626 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5627 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5628 ; GCN2-NEXT: buffer_wbinvl1_vol
5629 ; GCN2-NEXT: s_cbranch_execnz .LBB45_2
5630 ; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private
5631 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
5632 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
5633 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
5634 ; GCN2-NEXT: s_add_i32 s34, s34, 4
5635 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
5636 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5637 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
5638 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5639 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
5640 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5641 ; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
5642 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5643 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5644 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5645 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5647 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar:
5649 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5650 ; GCN3-NEXT: s_add_u32 s34, s4, 32
5651 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
5652 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
5653 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
5654 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
5655 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5656 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
5657 ; GCN3-NEXT: s_cbranch_vccnz .LBB45_3
5658 ; GCN3-NEXT: ; %bb.1: ; %Flow
5659 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5660 ; GCN3-NEXT: s_cbranch_vccz .LBB45_4
5661 ; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi
5662 ; GCN3-NEXT: s_setpc_b64 s[30:31]
5663 ; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global
5664 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
5665 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
5666 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
5667 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
5668 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5669 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5670 ; GCN3-NEXT: buffer_wbinvl1_vol
5671 ; GCN3-NEXT: s_cbranch_execnz .LBB45_2
5672 ; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private
5673 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
5674 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
5675 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
5676 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
5677 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
5678 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5679 ; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
5680 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5681 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
5682 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5683 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
5684 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5685 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is 32 bytes past %out; this is the constant 32 in the
; expected s_add_u32.
5686 %gep = getelementptr i64, ptr %out, i64 4
5687 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
; Returning 64-bit seq_cst `atomicrmw and` with both arguments marked `inreg`
; under the amdgpu_gfx calling convention, applied directly at %ptr. The
; function's return type is i64.
; In the expected code the pointer is read from s[4:5] and the operand from
; s[6:7]; the loaded value ends up in v[0:1] on both paths:
;   - %atomicrmw.global issues flat_atomic_and_x2 with glc, using v[0:1] as both
;     address and result, then branches to %atomicrmw.end;
;   - %atomicrmw.private loads the two dwords into v0/v1 and stores the and-ed
;     values from separate registers, so v0/v1 keep the loaded dwords.
; Unlike the non-returning scalar variant there is no %Flow block here; the
; global block ends with s_cbranch_execz / s_branch.
; NOTE(review): the value the address high dword is compared against is
; presumably the private aperture base - confirm.
5691 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
5692 ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar:
5694 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5695 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
5696 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
5697 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5698 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
5699 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
5700 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5701 ; GCN1-NEXT: s_cbranch_vccz .LBB46_2
5702 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
5703 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5704 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
5705 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5706 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
5707 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5708 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5709 ; GCN1-NEXT: buffer_wbinvl1_vol
5710 ; GCN1-NEXT: s_cbranch_execz .LBB46_3
5711 ; GCN1-NEXT: s_branch .LBB46_4
5712 ; GCN1-NEXT: .LBB46_2:
5713 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5714 ; GCN1-NEXT: .LBB46_3: ; %atomicrmw.private
5715 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
5716 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
5717 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
5718 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
5719 ; GCN1-NEXT: s_add_i32 s34, s34, 4
5720 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
5721 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5722 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
5723 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5724 ; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
5725 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5726 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
5727 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5728 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
5729 ; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end
5730 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5731 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5733 ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar:
5735 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5736 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
5737 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
5738 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5739 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
5740 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
5741 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5742 ; GCN2-NEXT: s_cbranch_vccz .LBB46_2
5743 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
5744 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5745 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
5746 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5747 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
5748 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5749 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5750 ; GCN2-NEXT: buffer_wbinvl1_vol
5751 ; GCN2-NEXT: s_cbranch_execz .LBB46_3
5752 ; GCN2-NEXT: s_branch .LBB46_4
5753 ; GCN2-NEXT: .LBB46_2:
5754 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
5755 ; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private
5756 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
5757 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
5758 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
5759 ; GCN2-NEXT: s_add_i32 s34, s34, 4
5760 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
5761 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5762 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
5763 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5764 ; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
5765 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5766 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
5767 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5768 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
5769 ; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end
5770 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5771 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5773 ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar:
5775 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5776 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
5777 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
5778 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
5779 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
5780 ; GCN3-NEXT: s_cbranch_vccz .LBB46_2
5781 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
5782 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5783 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
5784 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5785 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
5786 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5787 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5788 ; GCN3-NEXT: buffer_wbinvl1_vol
5789 ; GCN3-NEXT: s_cbranch_execz .LBB46_3
5790 ; GCN3-NEXT: s_branch .LBB46_4
5791 ; GCN3-NEXT: .LBB46_2:
5792 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5793 ; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private
5794 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
5795 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
5796 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
5797 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
5798 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5799 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5800 ; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
5801 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5802 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
5803 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5804 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
5805 ; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end
5806 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5807 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The operation under test; %result holds the value loaded by the atomicrmw.
5808 %result = atomicrmw and ptr %ptr, i64 %in seq_cst
; Returning `atomicrmw and` (seq_cst) on an i64 located 4 elements (32 bytes)
; past %out, with the pointer and the operand both passed `inreg` under the
; amdgpu_gfx calling convention.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
; Expected shape for every run line: the high dword of the address is compared
; against a base value (a dword loaded from 0xe4 for the bonaire/tonga runs,
; src_private_base for gfx900 -- presumably the private aperture; confirm).
; %atomicrmw.global then issues flat_atomic_and_x2 with glc, while
; %atomicrmw.private performs buffer loads, v_and and buffer stores.
5812 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
5813 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar:
5815 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5816 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
5817 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
5818 ; GCN1-NEXT: s_add_u32 s34, s4, 32
5819 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
5820 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5821 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
5822 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
5823 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5824 ; GCN1-NEXT: s_cbranch_vccz .LBB47_2
5825 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
5826 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
5827 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
5828 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
5829 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
5830 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5831 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5832 ; GCN1-NEXT: buffer_wbinvl1_vol
5833 ; GCN1-NEXT: s_cbranch_execz .LBB47_3
5834 ; GCN1-NEXT: s_branch .LBB47_4
5835 ; GCN1-NEXT: .LBB47_2:
5836 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5837 ; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private
5838 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
5839 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
5840 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
5841 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
5842 ; GCN1-NEXT: s_add_i32 s34, s34, 4
5843 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
5844 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5845 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
5846 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5847 ; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
5848 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5849 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
5850 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5851 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
5852 ; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end
5853 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5854 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5856 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar:
5858 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5859 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
5860 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
5861 ; GCN2-NEXT: s_add_u32 s34, s4, 32
5862 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
5863 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5864 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
5865 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
5866 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5867 ; GCN2-NEXT: s_cbranch_vccz .LBB47_2
5868 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
5869 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
5870 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
5871 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
5872 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
5873 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5874 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5875 ; GCN2-NEXT: buffer_wbinvl1_vol
5876 ; GCN2-NEXT: s_cbranch_execz .LBB47_3
5877 ; GCN2-NEXT: s_branch .LBB47_4
5878 ; GCN2-NEXT: .LBB47_2:
5879 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
5880 ; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private
5881 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
5882 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
5883 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
5884 ; GCN2-NEXT: s_add_i32 s34, s34, 4
5885 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
5886 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5887 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
5888 ; GCN2-NEXT: s_waitcnt vmcnt(1)
5889 ; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
5890 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5891 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
5892 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5893 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
5894 ; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end
5895 ; GCN2-NEXT: s_waitcnt vmcnt(0)
5896 ; GCN2-NEXT: s_setpc_b64 s[30:31]
5898 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar:
5900 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5901 ; GCN3-NEXT: s_add_u32 s34, s4, 32
5902 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
5903 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
5904 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
5905 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
5906 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
5907 ; GCN3-NEXT: s_cbranch_vccz .LBB47_2
5908 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
5909 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
5910 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
5911 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
5912 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
5913 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
5914 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5915 ; GCN3-NEXT: buffer_wbinvl1_vol
5916 ; GCN3-NEXT: s_cbranch_execz .LBB47_3
5917 ; GCN3-NEXT: s_branch .LBB47_4
5918 ; GCN3-NEXT: .LBB47_2:
5919 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
5920 ; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private
5921 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
5922 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
5923 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
5924 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
5925 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
5926 ; GCN3-NEXT: s_waitcnt vmcnt(1)
5927 ; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
5928 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5929 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
5930 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
5931 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
5932 ; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end
5933 ; GCN3-NEXT: s_waitcnt vmcnt(0)
5934 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; An i64 element index of 4 is a 32-byte offset, which is the constant added
; to the incoming scalar address in the expected output above.
5935 %gep = getelementptr i64, ptr %out, i64 4
5936 %result = atomicrmw and ptr %gep, i64 %in seq_cst
; Non-returning `atomicrmw and` (seq_cst) on an i64 located 4 elements
; (32 bytes) past %out, tagged with !amdgpu.no.remote.memory; the function is
; declared void.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
; Expected shape for every run line: lanes are split with exec masking on
; whether the address high dword differs from a base value (a dword loaded
; from 0xe4 for the bonaire/tonga runs, src_private_base for gfx900 --
; presumably the private aperture; confirm). %atomicrmw.global issues a single
; flat_atomic_and_x2 without glc, and %atomicrmw.private performs buffer
; loads, v_and and buffer stores.
5940 define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
5941 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
5943 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5944 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
5945 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
5946 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
5947 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5948 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5949 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
5950 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
5951 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5952 ; GCN1-NEXT: s_cbranch_execnz .LBB48_3
5953 ; GCN1-NEXT: ; %bb.1: ; %Flow
5954 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5955 ; GCN1-NEXT: s_cbranch_execnz .LBB48_4
5956 ; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi
5957 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5958 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5959 ; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global
5960 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
5961 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5962 ; GCN1-NEXT: buffer_wbinvl1_vol
5963 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
5964 ; GCN1-NEXT: ; implicit-def: $vgpr3
5965 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5966 ; GCN1-NEXT: s_cbranch_execz .LBB48_2
5967 ; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private
5968 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
5969 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
5970 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
5971 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
5972 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
5973 ; GCN1-NEXT: s_waitcnt vmcnt(1)
5974 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
5975 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5976 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
5977 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
5978 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
5979 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
5980 ; GCN1-NEXT: s_waitcnt vmcnt(0)
5981 ; GCN1-NEXT: s_setpc_b64 s[30:31]
5983 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
5985 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5986 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
5987 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
5988 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
5989 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5990 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5991 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
5992 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
5993 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
5994 ; GCN2-NEXT: s_cbranch_execnz .LBB48_3
5995 ; GCN2-NEXT: ; %bb.1: ; %Flow
5996 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
5997 ; GCN2-NEXT: s_cbranch_execnz .LBB48_4
5998 ; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi
5999 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6000 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6001 ; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global
6002 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
6003 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6004 ; GCN2-NEXT: buffer_wbinvl1_vol
6005 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6006 ; GCN2-NEXT: ; implicit-def: $vgpr3
6007 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6008 ; GCN2-NEXT: s_cbranch_execz .LBB48_2
6009 ; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private
6010 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6011 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6012 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
6013 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6014 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6015 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6016 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
6017 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6018 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
6019 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6020 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6021 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6022 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6023 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6025 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
6027 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6028 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
6029 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6030 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6031 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
6032 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6033 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6034 ; GCN3-NEXT: s_cbranch_execnz .LBB48_3
6035 ; GCN3-NEXT: ; %bb.1: ; %Flow
6036 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6037 ; GCN3-NEXT: s_cbranch_execnz .LBB48_4
6038 ; GCN3-NEXT: .LBB48_2: ; %atomicrmw.phi
6039 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6040 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6041 ; GCN3-NEXT: .LBB48_3: ; %atomicrmw.global
6042 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
6043 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6044 ; GCN3-NEXT: buffer_wbinvl1_vol
6045 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6046 ; GCN3-NEXT: ; implicit-def: $vgpr3
6047 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6048 ; GCN3-NEXT: s_cbranch_execz .LBB48_2
6049 ; GCN3-NEXT: .LBB48_4: ; %atomicrmw.private
6050 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6051 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6052 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
6053 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6054 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6055 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
6056 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6057 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
6058 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6059 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
6060 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6061 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6062 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; An i64 element index of 4 is a 32-byte offset, which is the constant added
; to v[0:1] in the expected output above.
6063 %gep = getelementptr i64, ptr %out, i64 4
6064 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Returning `atomicrmw and` (seq_cst) on an i64 located 4 elements (32 bytes)
; past %out, tagged with !amdgpu.no.remote.memory; the function is declared
; to return i64.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
; Expected shape for every run line: the offset address is formed in v[4:5],
; and lanes are split with exec masking on whether its high dword differs from
; a base value (a dword loaded from 0xe4 for the bonaire/tonga runs,
; src_private_base for gfx900 -- presumably the private aperture; confirm).
; %atomicrmw.global issues flat_atomic_and_x2 with glc writing v[0:1], and
; %atomicrmw.private loads the old value into v0/v1 before storing the ANDed
; result back with buffer stores.
6068 define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
6069 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
6071 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6072 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
6073 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
6074 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
6075 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6076 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6077 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
6078 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
6079 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
6080 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6081 ; GCN1-NEXT: s_cbranch_execnz .LBB49_3
6082 ; GCN1-NEXT: ; %bb.1: ; %Flow
6083 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6084 ; GCN1-NEXT: s_cbranch_execnz .LBB49_4
6085 ; GCN1-NEXT: .LBB49_2: ; %atomicrmw.phi
6086 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6087 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6088 ; GCN1-NEXT: .LBB49_3: ; %atomicrmw.global
6089 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
6090 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6091 ; GCN1-NEXT: buffer_wbinvl1_vol
6092 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
6093 ; GCN1-NEXT: ; implicit-def: $vgpr3
6094 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6095 ; GCN1-NEXT: s_cbranch_execz .LBB49_2
6096 ; GCN1-NEXT: .LBB49_4: ; %atomicrmw.private
6097 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6098 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6099 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
6100 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6101 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
6102 ; GCN1-NEXT: s_waitcnt vmcnt(1)
6103 ; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
6104 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6105 ; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
6106 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6107 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
6108 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6109 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6110 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6112 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
6114 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6115 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
6116 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
6117 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
6118 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6119 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6120 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
6121 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6122 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
6123 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6124 ; GCN2-NEXT: s_cbranch_execnz .LBB49_3
6125 ; GCN2-NEXT: ; %bb.1: ; %Flow
6126 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6127 ; GCN2-NEXT: s_cbranch_execnz .LBB49_4
6128 ; GCN2-NEXT: .LBB49_2: ; %atomicrmw.phi
6129 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6130 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6131 ; GCN2-NEXT: .LBB49_3: ; %atomicrmw.global
6132 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
6133 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6134 ; GCN2-NEXT: buffer_wbinvl1_vol
6135 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
6136 ; GCN2-NEXT: ; implicit-def: $vgpr3
6137 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6138 ; GCN2-NEXT: s_cbranch_execz .LBB49_2
6139 ; GCN2-NEXT: .LBB49_4: ; %atomicrmw.private
6140 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6141 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6142 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
6143 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6144 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
6145 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6146 ; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
6147 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6148 ; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
6149 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6150 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
6151 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6152 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6153 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6155 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
6157 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6158 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
6159 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6160 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6161 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
6162 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6163 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6164 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6165 ; GCN3-NEXT: s_cbranch_execnz .LBB49_3
6166 ; GCN3-NEXT: ; %bb.1: ; %Flow
6167 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6168 ; GCN3-NEXT: s_cbranch_execnz .LBB49_4
6169 ; GCN3-NEXT: .LBB49_2: ; %atomicrmw.phi
6170 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6171 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6172 ; GCN3-NEXT: .LBB49_3: ; %atomicrmw.global
6173 ; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
6174 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6175 ; GCN3-NEXT: buffer_wbinvl1_vol
6176 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
6177 ; GCN3-NEXT: ; implicit-def: $vgpr3
6178 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6179 ; GCN3-NEXT: s_cbranch_execz .LBB49_2
6180 ; GCN3-NEXT: .LBB49_4: ; %atomicrmw.private
6181 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6182 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6183 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6184 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6185 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6186 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
6187 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6188 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
6189 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6190 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6191 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6192 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6193 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; An i64 element index of 4 is a 32-byte offset, which is the constant added
; to the incoming address in the expected output above.
6194 %gep = getelementptr i64, ptr %out, i64 4
6195 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
6199 ; ---------------------------------------------------------------------
6201 ; ---------------------------------------------------------------------
; Non-returning `atomicrmw nand` (seq_cst) on an i64 at %ptr with no offset
; and no metadata; the function is declared void.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
; Expected shape for every run line: lanes are split with exec masking on
; whether the address high dword differs from a base value (a dword loaded
; from 0xe4 for the bonaire/tonga runs, src_private_base for gfx900 --
; presumably the private aperture; confirm). %atomicrmw.global loads the
; current value and then runs the %atomicrmw.start retry loop, which computes
; not(loaded & %in) and attempts flat_atomic_cmpswap_x2 until the returned
; value equals the expected one. %atomicrmw.private performs buffer loads,
; v_and, v_not and buffer stores instead.
6203 define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
6204 ; GCN1-LABEL: flat_atomic_nand_i64_noret:
6206 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6207 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
6208 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
6209 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6210 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6211 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
6212 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6213 ; GCN1-NEXT: s_cbranch_execnz .LBB50_3
6214 ; GCN1-NEXT: ; %bb.1: ; %Flow3
6215 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6216 ; GCN1-NEXT: s_cbranch_execnz .LBB50_6
6217 ; GCN1-NEXT: .LBB50_2: ; %atomicrmw.phi
6218 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6219 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6220 ; GCN1-NEXT: .LBB50_3: ; %atomicrmw.global
6221 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
6222 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6223 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
6224 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
6225 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
6226 ; GCN1-NEXT: .LBB50_4: ; %atomicrmw.start
6227 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6228 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6229 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
6230 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
6231 ; GCN1-NEXT: v_not_b32_e32 v5, v4
6232 ; GCN1-NEXT: v_not_b32_e32 v4, v8
6233 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6234 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6235 ; GCN1-NEXT: buffer_wbinvl1_vol
6236 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6237 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
6238 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6239 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
6240 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
6241 ; GCN1-NEXT: s_cbranch_execnz .LBB50_4
6242 ; GCN1-NEXT: ; %bb.5: ; %Flow
6243 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
6244 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
6245 ; GCN1-NEXT: ; implicit-def: $vgpr3
6246 ; GCN1-NEXT: ; implicit-def: $vgpr2
6247 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6248 ; GCN1-NEXT: s_cbranch_execz .LBB50_2
6249 ; GCN1-NEXT: .LBB50_6: ; %atomicrmw.private
6250 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6251 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6252 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
6253 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6254 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6255 ; GCN1-NEXT: s_waitcnt vmcnt(1)
6256 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
6257 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6258 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
6259 ; GCN1-NEXT: v_not_b32_e32 v2, v2
6260 ; GCN1-NEXT: v_not_b32_e32 v3, v3
6261 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6262 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6263 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6264 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6265 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6267 ; GCN2-LABEL: flat_atomic_nand_i64_noret:
6269 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6270 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
6271 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
6272 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6273 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6274 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
6275 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6276 ; GCN2-NEXT: s_cbranch_execnz .LBB50_3
6277 ; GCN2-NEXT: ; %bb.1: ; %Flow3
6278 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6279 ; GCN2-NEXT: s_cbranch_execnz .LBB50_6
6280 ; GCN2-NEXT: .LBB50_2: ; %atomicrmw.phi
6281 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6282 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6283 ; GCN2-NEXT: .LBB50_3: ; %atomicrmw.global
6284 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
6285 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6286 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
6287 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
6288 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
6289 ; GCN2-NEXT: .LBB50_4: ; %atomicrmw.start
6290 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6291 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6292 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
6293 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
6294 ; GCN2-NEXT: v_not_b32_e32 v5, v4
6295 ; GCN2-NEXT: v_not_b32_e32 v4, v8
6296 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6297 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6298 ; GCN2-NEXT: buffer_wbinvl1_vol
6299 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6300 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
6301 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6302 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
6303 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
6304 ; GCN2-NEXT: s_cbranch_execnz .LBB50_4
6305 ; GCN2-NEXT: ; %bb.5: ; %Flow
6306 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
6307 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6308 ; GCN2-NEXT: ; implicit-def: $vgpr3
6309 ; GCN2-NEXT: ; implicit-def: $vgpr2
6310 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6311 ; GCN2-NEXT: s_cbranch_execz .LBB50_2
6312 ; GCN2-NEXT: .LBB50_6: ; %atomicrmw.private
6313 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6314 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6315 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
6316 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6317 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6318 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6319 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
6320 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6321 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
6322 ; GCN2-NEXT: v_not_b32_e32 v2, v2
6323 ; GCN2-NEXT: v_not_b32_e32 v3, v3
6324 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6325 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6326 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6327 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6328 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6330 ; GCN3-LABEL: flat_atomic_nand_i64_noret:
6332 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6333 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6334 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
6335 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6336 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6337 ; GCN3-NEXT: s_cbranch_execnz .LBB50_3
6338 ; GCN3-NEXT: ; %bb.1: ; %Flow3
6339 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6340 ; GCN3-NEXT: s_cbranch_execnz .LBB50_6
6341 ; GCN3-NEXT: .LBB50_2: ; %atomicrmw.phi
6342 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6343 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6344 ; GCN3-NEXT: .LBB50_3: ; %atomicrmw.global
6345 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
6346 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
6347 ; GCN3-NEXT: .LBB50_4: ; %atomicrmw.start
6348 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6349 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6350 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
6351 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
6352 ; GCN3-NEXT: v_not_b32_e32 v5, v4
6353 ; GCN3-NEXT: v_not_b32_e32 v4, v8
6354 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6355 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6356 ; GCN3-NEXT: buffer_wbinvl1_vol
6357 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6358 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6359 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6360 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6361 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
6362 ; GCN3-NEXT: s_cbranch_execnz .LBB50_4
6363 ; GCN3-NEXT: ; %bb.5: ; %Flow
6364 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
6365 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6366 ; GCN3-NEXT: ; implicit-def: $vgpr3
6367 ; GCN3-NEXT: ; implicit-def: $vgpr2
6368 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6369 ; GCN3-NEXT: s_cbranch_execz .LBB50_2
6370 ; GCN3-NEXT: .LBB50_6: ; %atomicrmw.private
6371 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6372 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6373 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
6374 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6375 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6376 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
6377 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6378 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
6379 ; GCN3-NEXT: v_not_b32_e32 v2, v2
6380 ; GCN3-NEXT: v_not_b32_e32 v1, v1
6381 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6382 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
6383 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6384 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6385 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; The atomicrmw result is bound to %tmp0 but the function's return type is
; void, so this exercises the no-return form of the operation.
6386 %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
6390 define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
6391 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset:
6393 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6394 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
6395 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
6396 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
6397 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6398 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6399 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6400 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
6401 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6402 ; GCN1-NEXT: s_cbranch_execnz .LBB51_3
6403 ; GCN1-NEXT: ; %bb.1: ; %Flow3
6404 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6405 ; GCN1-NEXT: s_cbranch_execnz .LBB51_6
6406 ; GCN1-NEXT: .LBB51_2: ; %atomicrmw.phi
6407 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6408 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6409 ; GCN1-NEXT: .LBB51_3: ; %atomicrmw.global
6410 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
6411 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6412 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
6413 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
6414 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
6415 ; GCN1-NEXT: .LBB51_4: ; %atomicrmw.start
6416 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6417 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6418 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
6419 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
6420 ; GCN1-NEXT: v_not_b32_e32 v5, v4
6421 ; GCN1-NEXT: v_not_b32_e32 v4, v8
6422 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6423 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6424 ; GCN1-NEXT: buffer_wbinvl1_vol
6425 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6426 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
6427 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6428 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
6429 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
6430 ; GCN1-NEXT: s_cbranch_execnz .LBB51_4
6431 ; GCN1-NEXT: ; %bb.5: ; %Flow
6432 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
6433 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
6434 ; GCN1-NEXT: ; implicit-def: $vgpr3
6435 ; GCN1-NEXT: ; implicit-def: $vgpr2
6436 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6437 ; GCN1-NEXT: s_cbranch_execz .LBB51_2
6438 ; GCN1-NEXT: .LBB51_6: ; %atomicrmw.private
6439 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6440 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6441 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
6442 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6443 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6444 ; GCN1-NEXT: s_waitcnt vmcnt(1)
6445 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
6446 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6447 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
6448 ; GCN1-NEXT: v_not_b32_e32 v2, v2
6449 ; GCN1-NEXT: v_not_b32_e32 v3, v3
6450 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6451 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6452 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6453 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6454 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6456 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset:
6458 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6459 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
6460 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
6461 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
6462 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
6463 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6464 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6465 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
6466 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6467 ; GCN2-NEXT: s_cbranch_execnz .LBB51_3
6468 ; GCN2-NEXT: ; %bb.1: ; %Flow3
6469 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6470 ; GCN2-NEXT: s_cbranch_execnz .LBB51_6
6471 ; GCN2-NEXT: .LBB51_2: ; %atomicrmw.phi
6472 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6473 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6474 ; GCN2-NEXT: .LBB51_3: ; %atomicrmw.global
6475 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
6476 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6477 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
6478 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
6479 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
6480 ; GCN2-NEXT: .LBB51_4: ; %atomicrmw.start
6481 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6482 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6483 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
6484 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
6485 ; GCN2-NEXT: v_not_b32_e32 v5, v4
6486 ; GCN2-NEXT: v_not_b32_e32 v4, v8
6487 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6488 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6489 ; GCN2-NEXT: buffer_wbinvl1_vol
6490 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6491 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
6492 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6493 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
6494 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
6495 ; GCN2-NEXT: s_cbranch_execnz .LBB51_4
6496 ; GCN2-NEXT: ; %bb.5: ; %Flow
6497 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
6498 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6499 ; GCN2-NEXT: ; implicit-def: $vgpr3
6500 ; GCN2-NEXT: ; implicit-def: $vgpr2
6501 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6502 ; GCN2-NEXT: s_cbranch_execz .LBB51_2
6503 ; GCN2-NEXT: .LBB51_6: ; %atomicrmw.private
6504 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6505 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6506 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
6507 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6508 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6509 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6510 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
6511 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6512 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
6513 ; GCN2-NEXT: v_not_b32_e32 v2, v2
6514 ; GCN2-NEXT: v_not_b32_e32 v3, v3
6515 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6516 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6517 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6518 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6519 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6521 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset:
6523 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6524 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
6525 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6526 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6527 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
6528 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6529 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6530 ; GCN3-NEXT: s_cbranch_execnz .LBB51_3
6531 ; GCN3-NEXT: ; %bb.1: ; %Flow3
6532 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6533 ; GCN3-NEXT: s_cbranch_execnz .LBB51_6
6534 ; GCN3-NEXT: .LBB51_2: ; %atomicrmw.phi
6535 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6536 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6537 ; GCN3-NEXT: .LBB51_3: ; %atomicrmw.global
6538 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
6539 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
6540 ; GCN3-NEXT: .LBB51_4: ; %atomicrmw.start
6541 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6542 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6543 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
6544 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
6545 ; GCN3-NEXT: v_not_b32_e32 v5, v4
6546 ; GCN3-NEXT: v_not_b32_e32 v4, v8
6547 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6548 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6549 ; GCN3-NEXT: buffer_wbinvl1_vol
6550 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6551 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
6552 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6553 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
6554 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
6555 ; GCN3-NEXT: s_cbranch_execnz .LBB51_4
6556 ; GCN3-NEXT: ; %bb.5: ; %Flow
6557 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
6558 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6559 ; GCN3-NEXT: ; implicit-def: $vgpr3
6560 ; GCN3-NEXT: ; implicit-def: $vgpr2
6561 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6562 ; GCN3-NEXT: s_cbranch_execz .LBB51_2
6563 ; GCN3-NEXT: .LBB51_6: ; %atomicrmw.private
6564 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6565 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6566 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
6567 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6568 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6569 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
6570 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6571 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
6572 ; GCN3-NEXT: v_not_b32_e32 v2, v2
6573 ; GCN3-NEXT: v_not_b32_e32 v1, v1
6574 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6575 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
6576 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6577 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6578 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6579 %gep = getelementptr i64, ptr %out, i64 4
6580 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
; flat_atomic_nand_i64_ret: seq_cst `atomicrmw nand` of an i64 through a `ptr`
; argument with no address-space qualifier, in a function declared to return i64.
;
; For all three run lines the expected code has two paths, named by the block
; labels in the checks:
;   * %atomicrmw.global  - loads the current value, then loops in
;     %atomicrmw.start on flat_atomic_cmpswap_x2 with the v_and/v_not result
;     until v_cmp_eq_u64 reports that the swap matched;
;   * %atomicrmw.private - buffer_load of both dwords, v_and/v_not, and
;     buffer_store of the result.
; The bonaire and tonga checks load the two dwords with separate instructions;
; the gfx900 checks use one flat_load_dwordx2 and `offset:4` on the buffer ops.
; In every variant the loaded (old) value is in v0/v1 at s_setpc_b64
; (presumably the i64 return registers - confirm against the calling convention).
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
6584 define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
6585 ; GCN1-LABEL: flat_atomic_nand_i64_ret:
6587 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6588 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
6589 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
6590 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
6591 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6592 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6593 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
6594 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6595 ; GCN1-NEXT: s_cbranch_execz .LBB52_4
6596 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
6597 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
6598 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6599 ; GCN1-NEXT: flat_load_dword v5, v[4:5]
6600 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
6601 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
6602 ; GCN1-NEXT: .LBB52_2: ; %atomicrmw.start
6603 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6604 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6605 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
6606 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
6607 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
6608 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
6609 ; GCN1-NEXT: v_not_b32_e32 v5, v4
6610 ; GCN1-NEXT: v_not_b32_e32 v4, v8
6611 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6612 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6613 ; GCN1-NEXT: buffer_wbinvl1_vol
6614 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6615 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6616 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
6617 ; GCN1-NEXT: s_cbranch_execnz .LBB52_2
6618 ; GCN1-NEXT: ; %bb.3: ; %Flow
6619 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
6620 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
6621 ; GCN1-NEXT: ; implicit-def: $vgpr3
6622 ; GCN1-NEXT: ; implicit-def: $vgpr2
6623 ; GCN1-NEXT: .LBB52_4: ; %Flow3
6624 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6625 ; GCN1-NEXT: s_cbranch_execz .LBB52_6
6626 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
6627 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6628 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6629 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
6630 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6631 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6632 ; GCN1-NEXT: s_waitcnt vmcnt(1)
6633 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
6634 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6635 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
6636 ; GCN1-NEXT: v_not_b32_e32 v2, v2
6637 ; GCN1-NEXT: v_not_b32_e32 v3, v3
6638 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6639 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6640 ; GCN1-NEXT: .LBB52_6: ; %atomicrmw.phi
6641 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6642 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
6643 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
6644 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6645 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6647 ; GCN2-LABEL: flat_atomic_nand_i64_ret:
6649 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6650 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
6651 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
6652 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
6653 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6654 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
6655 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
6656 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6657 ; GCN2-NEXT: s_cbranch_execz .LBB52_4
6658 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
6659 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
6660 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6661 ; GCN2-NEXT: flat_load_dword v5, v[4:5]
6662 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
6663 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
6664 ; GCN2-NEXT: .LBB52_2: ; %atomicrmw.start
6665 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6666 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6667 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
6668 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
6669 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
6670 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
6671 ; GCN2-NEXT: v_not_b32_e32 v5, v4
6672 ; GCN2-NEXT: v_not_b32_e32 v4, v8
6673 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
6674 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6675 ; GCN2-NEXT: buffer_wbinvl1_vol
6676 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
6677 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6678 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
6679 ; GCN2-NEXT: s_cbranch_execnz .LBB52_2
6680 ; GCN2-NEXT: ; %bb.3: ; %Flow
6681 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
6682 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6683 ; GCN2-NEXT: ; implicit-def: $vgpr3
6684 ; GCN2-NEXT: ; implicit-def: $vgpr2
6685 ; GCN2-NEXT: .LBB52_4: ; %Flow3
6686 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6687 ; GCN2-NEXT: s_cbranch_execz .LBB52_6
6688 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
6689 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
6690 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
6691 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
6692 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
6693 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
6694 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6695 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
6696 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6697 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
6698 ; GCN2-NEXT: v_not_b32_e32 v2, v2
6699 ; GCN2-NEXT: v_not_b32_e32 v3, v3
6700 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
6701 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
6702 ; GCN2-NEXT: .LBB52_6: ; %atomicrmw.phi
6703 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6704 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
6705 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
6706 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6707 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6709 ; GCN3-LABEL: flat_atomic_nand_i64_ret:
6711 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6712 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
6713 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6714 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
6715 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
6716 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6717 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6718 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6719 ; GCN3-NEXT: s_cbranch_execnz .LBB52_3
6720 ; GCN3-NEXT: ; %bb.1: ; %Flow3
6721 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6722 ; GCN3-NEXT: s_cbranch_execnz .LBB52_6
6723 ; GCN3-NEXT: .LBB52_2: ; %atomicrmw.phi
6724 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6725 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6726 ; GCN3-NEXT: .LBB52_3: ; %atomicrmw.global
6727 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
6728 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
6729 ; GCN3-NEXT: .LBB52_4: ; %atomicrmw.start
6730 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6731 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6732 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
6733 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
6734 ; GCN3-NEXT: v_and_b32_e32 v0, v9, v3
6735 ; GCN3-NEXT: v_and_b32_e32 v1, v8, v2
6736 ; GCN3-NEXT: v_not_b32_e32 v7, v0
6737 ; GCN3-NEXT: v_not_b32_e32 v6, v1
6738 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6739 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6740 ; GCN3-NEXT: buffer_wbinvl1_vol
6741 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6742 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6743 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
6744 ; GCN3-NEXT: s_cbranch_execnz .LBB52_4
6745 ; GCN3-NEXT: ; %bb.5: ; %Flow
6746 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
6747 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
6748 ; GCN3-NEXT: ; implicit-def: $vgpr3
6749 ; GCN3-NEXT: ; implicit-def: $vgpr2
6750 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6751 ; GCN3-NEXT: s_cbranch_execz .LBB52_2
6752 ; GCN3-NEXT: .LBB52_6: ; %atomicrmw.private
6753 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6754 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6755 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6756 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6757 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6758 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
6759 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6760 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
6761 ; GCN3-NEXT: v_not_b32_e32 v2, v2
6762 ; GCN3-NEXT: v_not_b32_e32 v3, v3
6763 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6764 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6765 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6766 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6767 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6768 %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
; flat_atomic_nand_i64_ret_offset: same seq_cst i64 `atomicrmw nand` pattern,
; but applied to `getelementptr i64, ptr %out, i64 4`, i.e. 4 x 8 = 32 bytes
; past the incoming pointer. The function is declared to return i64.
;
; In the expected code for all three run lines the +32 is added to the
; 64-bit address at function entry (add with carry into v[4:5]) before the
; compare that selects between the two paths:
;   * %atomicrmw.global  - load, then a flat_atomic_cmpswap_x2 retry loop in
;     %atomicrmw.start that tries to store the v_and/v_not result;
;   * %atomicrmw.private - buffer_load, v_and/v_not, buffer_store.
; The gfx900 checks use flat_load_dwordx2 and `offset:4` buffer accesses where
; the bonaire and tonga checks compute the address of the high dword explicitly.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
6772 define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
6773 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset:
6775 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6776 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
6777 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
6778 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
6779 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6780 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6781 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
6782 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
6783 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
6784 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6785 ; GCN1-NEXT: s_cbranch_execnz .LBB53_3
6786 ; GCN1-NEXT: ; %bb.1: ; %Flow3
6787 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6788 ; GCN1-NEXT: s_cbranch_execnz .LBB53_6
6789 ; GCN1-NEXT: .LBB53_2: ; %atomicrmw.phi
6790 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6791 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6792 ; GCN1-NEXT: .LBB53_3: ; %atomicrmw.global
6793 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
6794 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
6795 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
6796 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
6797 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
6798 ; GCN1-NEXT: .LBB53_4: ; %atomicrmw.start
6799 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6800 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6801 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
6802 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
6803 ; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
6804 ; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
6805 ; GCN1-NEXT: v_not_b32_e32 v7, v0
6806 ; GCN1-NEXT: v_not_b32_e32 v6, v1
6807 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6808 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6809 ; GCN1-NEXT: buffer_wbinvl1_vol
6810 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6811 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6812 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
6813 ; GCN1-NEXT: s_cbranch_execnz .LBB53_4
6814 ; GCN1-NEXT: ; %bb.5: ; %Flow
6815 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
6816 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
6817 ; GCN1-NEXT: ; implicit-def: $vgpr3
6818 ; GCN1-NEXT: ; implicit-def: $vgpr2
6819 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6820 ; GCN1-NEXT: s_cbranch_execz .LBB53_2
6821 ; GCN1-NEXT: .LBB53_6: ; %atomicrmw.private
6822 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6823 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6824 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
6825 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6826 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
6827 ; GCN1-NEXT: s_waitcnt vmcnt(1)
6828 ; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
6829 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6830 ; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
6831 ; GCN1-NEXT: v_not_b32_e32 v2, v2
6832 ; GCN1-NEXT: v_not_b32_e32 v3, v3
6833 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6834 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
6835 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
6836 ; GCN1-NEXT: s_waitcnt vmcnt(0)
6837 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6839 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset:
6841 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6842 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
6843 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
6844 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
6845 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
6846 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6847 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
6848 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
6849 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
6850 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6851 ; GCN2-NEXT: s_cbranch_execnz .LBB53_3
6852 ; GCN2-NEXT: ; %bb.1: ; %Flow3
6853 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6854 ; GCN2-NEXT: s_cbranch_execnz .LBB53_6
6855 ; GCN2-NEXT: .LBB53_2: ; %atomicrmw.phi
6856 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6857 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6858 ; GCN2-NEXT: .LBB53_3: ; %atomicrmw.global
6859 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
6860 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
6861 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
6862 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
6863 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
6864 ; GCN2-NEXT: .LBB53_4: ; %atomicrmw.start
6865 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
6866 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6867 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
6868 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
6869 ; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
6870 ; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
6871 ; GCN2-NEXT: v_not_b32_e32 v7, v0
6872 ; GCN2-NEXT: v_not_b32_e32 v6, v1
6873 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6874 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6875 ; GCN2-NEXT: buffer_wbinvl1_vol
6876 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6877 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6878 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
6879 ; GCN2-NEXT: s_cbranch_execnz .LBB53_4
6880 ; GCN2-NEXT: ; %bb.5: ; %Flow
6881 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
6882 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
6883 ; GCN2-NEXT: ; implicit-def: $vgpr3
6884 ; GCN2-NEXT: ; implicit-def: $vgpr2
6885 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6886 ; GCN2-NEXT: s_cbranch_execz .LBB53_2
6887 ; GCN2-NEXT: .LBB53_6: ; %atomicrmw.private
6888 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6889 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6890 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
6891 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6892 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
6893 ; GCN2-NEXT: s_waitcnt vmcnt(1)
6894 ; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
6895 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6896 ; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
6897 ; GCN2-NEXT: v_not_b32_e32 v2, v2
6898 ; GCN2-NEXT: v_not_b32_e32 v3, v3
6899 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6900 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
6901 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
6902 ; GCN2-NEXT: s_waitcnt vmcnt(0)
6903 ; GCN2-NEXT: s_setpc_b64 s[30:31]
6905 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset:
6907 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6908 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
6909 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6910 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
6911 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
6912 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
6913 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
6914 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
6915 ; GCN3-NEXT: s_cbranch_execnz .LBB53_3
6916 ; GCN3-NEXT: ; %bb.1: ; %Flow3
6917 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6918 ; GCN3-NEXT: s_cbranch_execnz .LBB53_6
6919 ; GCN3-NEXT: .LBB53_2: ; %atomicrmw.phi
6920 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6921 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6922 ; GCN3-NEXT: .LBB53_3: ; %atomicrmw.global
6923 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
6924 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
6925 ; GCN3-NEXT: .LBB53_4: ; %atomicrmw.start
6926 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
6927 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6928 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
6929 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
6930 ; GCN3-NEXT: v_and_b32_e32 v0, v9, v3
6931 ; GCN3-NEXT: v_and_b32_e32 v1, v8, v2
6932 ; GCN3-NEXT: v_not_b32_e32 v7, v0
6933 ; GCN3-NEXT: v_not_b32_e32 v6, v1
6934 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
6935 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6936 ; GCN3-NEXT: buffer_wbinvl1_vol
6937 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
6938 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6939 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
6940 ; GCN3-NEXT: s_cbranch_execnz .LBB53_4
6941 ; GCN3-NEXT: ; %bb.5: ; %Flow
6942 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
6943 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
6944 ; GCN3-NEXT: ; implicit-def: $vgpr3
6945 ; GCN3-NEXT: ; implicit-def: $vgpr2
6946 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
6947 ; GCN3-NEXT: s_cbranch_execz .LBB53_2
6948 ; GCN3-NEXT: .LBB53_6: ; %atomicrmw.private
6949 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
6950 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
6951 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
6952 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
6953 ; GCN3-NEXT: s_waitcnt vmcnt(1)
6954 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
6955 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6956 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
6957 ; GCN3-NEXT: v_not_b32_e32 v2, v2
6958 ; GCN3-NEXT: v_not_b32_e32 v3, v3
6959 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
6960 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
6961 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
6962 ; GCN3-NEXT: s_waitcnt vmcnt(0)
6963 ; GCN3-NEXT: s_setpc_b64 s[30:31]
6964 %gep = getelementptr i64, ptr %out, i64 4
6965 %result = atomicrmw nand ptr %gep, i64 %in seq_cst
; flat_atomic_nand_i64_noret_scalar: seq_cst i64 `atomicrmw nand` in an
; `amdgpu_gfx` function returning void, with both the pointer and the value
; passed `inreg`. In the checks the address is read from s[4:5] and the
; operand from s[6:7].
;
; Because the address is in scalar registers, the expected code picks the
; path with s_cmp_eq_u32 / s_cbranch_vccnz rather than by saving and masking
; exec:
;   * %atomicrmw.global  - copies the address to v[4:5], loads the current
;     value, then loops in %atomicrmw.start on flat_atomic_cmpswap_x2 (this
;     retry loop still masks exec via s_andn2_b64) before branching back to
;     %atomicrmw.phi;
;   * %atomicrmw.private - buffer_load, v_and/v_not with s6/s7, buffer_store.
; The gfx900 checks use flat_load_dwordx2 and `offset:4` buffer accesses where
; the bonaire and tonga checks form the high-dword address with explicit adds.
;
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the script instead of editing them by hand.
6969 define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
6970 ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar:
6972 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6973 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
6974 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
6975 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6976 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
6977 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
6978 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
6979 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
6980 ; GCN1-NEXT: s_cbranch_vccnz .LBB54_3
6981 ; GCN1-NEXT: ; %bb.1: ; %Flow3
6982 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
6983 ; GCN1-NEXT: s_cbranch_vccnz .LBB54_6
6984 ; GCN1-NEXT: .LBB54_2: ; %atomicrmw.phi
6985 ; GCN1-NEXT: s_setpc_b64 s[30:31]
6986 ; GCN1-NEXT: .LBB54_3: ; %atomicrmw.global
6987 ; GCN1-NEXT: s_add_u32 s34, s4, 4
6988 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
6989 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
6990 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
6991 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
6992 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
6993 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
6994 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
6995 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
6996 ; GCN1-NEXT: .LBB54_4: ; %atomicrmw.start
6997 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
6998 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6999 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
7000 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
7001 ; GCN1-NEXT: v_not_b32_e32 v1, v0
7002 ; GCN1-NEXT: v_not_b32_e32 v0, v6
7003 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7004 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7005 ; GCN1-NEXT: buffer_wbinvl1_vol
7006 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7007 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7008 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7009 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7010 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
7011 ; GCN1-NEXT: s_cbranch_execnz .LBB54_4
7012 ; GCN1-NEXT: ; %bb.5: ; %Flow
7013 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
7014 ; GCN1-NEXT: s_branch .LBB54_2
7015 ; GCN1-NEXT: .LBB54_6: ; %atomicrmw.private
7016 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
7017 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
7018 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
7019 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7020 ; GCN1-NEXT: s_add_i32 s34, s34, 4
7021 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
7022 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7023 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
7024 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7025 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
7026 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7027 ; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
7028 ; GCN1-NEXT: v_not_b32_e32 v2, v2
7029 ; GCN1-NEXT: v_not_b32_e32 v3, v3
7030 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7031 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7032 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7033 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7035 ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar:
7037 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7038 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
7039 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
7040 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7041 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
7042 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
7043 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
7044 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
7045 ; GCN2-NEXT: s_cbranch_vccnz .LBB54_3
7046 ; GCN2-NEXT: ; %bb.1: ; %Flow3
7047 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
7048 ; GCN2-NEXT: s_cbranch_vccnz .LBB54_6
7049 ; GCN2-NEXT: .LBB54_2: ; %atomicrmw.phi
7050 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7051 ; GCN2-NEXT: .LBB54_3: ; %atomicrmw.global
7052 ; GCN2-NEXT: s_add_u32 s34, s4, 4
7053 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7054 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7055 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7056 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
7057 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
7058 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
7059 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
7060 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
7061 ; GCN2-NEXT: .LBB54_4: ; %atomicrmw.start
7062 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7063 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7064 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
7065 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
7066 ; GCN2-NEXT: v_not_b32_e32 v1, v0
7067 ; GCN2-NEXT: v_not_b32_e32 v0, v6
7068 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7069 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7070 ; GCN2-NEXT: buffer_wbinvl1_vol
7071 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7072 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7073 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7074 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7075 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
7076 ; GCN2-NEXT: s_cbranch_execnz .LBB54_4
7077 ; GCN2-NEXT: ; %bb.5: ; %Flow
7078 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
7079 ; GCN2-NEXT: s_branch .LBB54_2
7080 ; GCN2-NEXT: .LBB54_6: ; %atomicrmw.private
7081 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
7082 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
7083 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7084 ; GCN2-NEXT: s_add_i32 s34, s34, 4
7085 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
7086 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7087 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
7088 ; GCN2-NEXT: s_waitcnt vmcnt(1)
7089 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
7090 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7091 ; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
7092 ; GCN2-NEXT: v_not_b32_e32 v2, v2
7093 ; GCN2-NEXT: v_not_b32_e32 v3, v3
7094 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7095 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7096 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7097 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7099 ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar:
7101 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7102 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
7103 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
7104 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
7105 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
7106 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
7107 ; GCN3-NEXT: s_cbranch_vccnz .LBB54_3
7108 ; GCN3-NEXT: ; %bb.1: ; %Flow3
7109 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
7110 ; GCN3-NEXT: s_cbranch_vccnz .LBB54_6
7111 ; GCN3-NEXT: .LBB54_2: ; %atomicrmw.phi
7112 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7113 ; GCN3-NEXT: .LBB54_3: ; %atomicrmw.global
7114 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
7115 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
7116 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
7117 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
7118 ; GCN3-NEXT: .LBB54_4: ; %atomicrmw.start
7119 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7120 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7121 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
7122 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
7123 ; GCN3-NEXT: v_not_b32_e32 v1, v0
7124 ; GCN3-NEXT: v_not_b32_e32 v0, v6
7125 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7126 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7127 ; GCN3-NEXT: buffer_wbinvl1_vol
7128 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7129 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7130 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7131 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7132 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
7133 ; GCN3-NEXT: s_cbranch_execnz .LBB54_4
7134 ; GCN3-NEXT: ; %bb.5: ; %Flow
7135 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
7136 ; GCN3-NEXT: s_branch .LBB54_2
7137 ; GCN3-NEXT: .LBB54_6: ; %atomicrmw.private
7138 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
7139 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
7140 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
7141 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
7142 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7143 ; GCN3-NEXT: s_waitcnt vmcnt(1)
7144 ; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
7145 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7146 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
7147 ; GCN3-NEXT: v_not_b32_e32 v2, v2
7148 ; GCN3-NEXT: v_not_b32_e32 v1, v1
7149 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7150 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
7151 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7152 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7153 %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst
7157 define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
7158 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
7160 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7161 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
7162 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
7163 ; GCN1-NEXT: s_add_u32 s34, s4, 32
7164 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7165 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7166 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
7167 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
7168 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7169 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
7170 ; GCN1-NEXT: s_cbranch_vccnz .LBB55_3
7171 ; GCN1-NEXT: ; %bb.1: ; %Flow3
7172 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
7173 ; GCN1-NEXT: s_cbranch_vccnz .LBB55_6
7174 ; GCN1-NEXT: .LBB55_2: ; %atomicrmw.phi
7175 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7176 ; GCN1-NEXT: .LBB55_3: ; %atomicrmw.global
7177 ; GCN1-NEXT: s_add_u32 s36, s34, 4
7178 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
7179 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
7180 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
7181 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
7182 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
7183 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
7184 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
7185 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
7186 ; GCN1-NEXT: .LBB55_4: ; %atomicrmw.start
7187 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7188 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7189 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v3
7190 ; GCN1-NEXT: v_and_b32_e32 v6, s6, v2
7191 ; GCN1-NEXT: v_not_b32_e32 v1, v0
7192 ; GCN1-NEXT: v_not_b32_e32 v0, v6
7193 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7194 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7195 ; GCN1-NEXT: buffer_wbinvl1_vol
7196 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7197 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
7198 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7199 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
7200 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
7201 ; GCN1-NEXT: s_cbranch_execnz .LBB55_4
7202 ; GCN1-NEXT: ; %bb.5: ; %Flow
7203 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
7204 ; GCN1-NEXT: s_branch .LBB55_2
7205 ; GCN1-NEXT: .LBB55_6: ; %atomicrmw.private
7206 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
7207 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
7208 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
7209 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7210 ; GCN1-NEXT: s_add_i32 s34, s34, 4
7211 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
7212 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7213 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
7214 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7215 ; GCN1-NEXT: v_and_b32_e32 v2, s6, v2
7216 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7217 ; GCN1-NEXT: v_and_b32_e32 v3, s7, v3
7218 ; GCN1-NEXT: v_not_b32_e32 v2, v2
7219 ; GCN1-NEXT: v_not_b32_e32 v3, v3
7220 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7221 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7222 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7223 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7225 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
7227 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7228 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
7229 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
7230 ; GCN2-NEXT: s_add_u32 s34, s4, 32
7231 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7232 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7233 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
7234 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
7235 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7236 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
7237 ; GCN2-NEXT: s_cbranch_vccnz .LBB55_3
7238 ; GCN2-NEXT: ; %bb.1: ; %Flow3
7239 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
7240 ; GCN2-NEXT: s_cbranch_vccnz .LBB55_6
7241 ; GCN2-NEXT: .LBB55_2: ; %atomicrmw.phi
7242 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7243 ; GCN2-NEXT: .LBB55_3: ; %atomicrmw.global
7244 ; GCN2-NEXT: s_add_u32 s36, s34, 4
7245 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
7246 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
7247 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
7248 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
7249 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
7250 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
7251 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
7252 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
7253 ; GCN2-NEXT: .LBB55_4: ; %atomicrmw.start
7254 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7255 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7256 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v3
7257 ; GCN2-NEXT: v_and_b32_e32 v6, s6, v2
7258 ; GCN2-NEXT: v_not_b32_e32 v1, v0
7259 ; GCN2-NEXT: v_not_b32_e32 v0, v6
7260 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7261 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7262 ; GCN2-NEXT: buffer_wbinvl1_vol
7263 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7264 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
7265 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7266 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
7267 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
7268 ; GCN2-NEXT: s_cbranch_execnz .LBB55_4
7269 ; GCN2-NEXT: ; %bb.5: ; %Flow
7270 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
7271 ; GCN2-NEXT: s_branch .LBB55_2
7272 ; GCN2-NEXT: .LBB55_6: ; %atomicrmw.private
7273 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
7274 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
7275 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7276 ; GCN2-NEXT: s_add_i32 s34, s34, 4
7277 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
7278 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7279 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
7280 ; GCN2-NEXT: s_waitcnt vmcnt(1)
7281 ; GCN2-NEXT: v_and_b32_e32 v2, s6, v2
7282 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7283 ; GCN2-NEXT: v_and_b32_e32 v3, s7, v3
7284 ; GCN2-NEXT: v_not_b32_e32 v2, v2
7285 ; GCN2-NEXT: v_not_b32_e32 v3, v3
7286 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7287 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7288 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7289 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7291 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
7293 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7294 ; GCN3-NEXT: s_add_u32 s34, s4, 32
7295 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
7296 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
7297 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
7298 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
7299 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7300 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
7301 ; GCN3-NEXT: s_cbranch_vccnz .LBB55_3
7302 ; GCN3-NEXT: ; %bb.1: ; %Flow3
7303 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
7304 ; GCN3-NEXT: s_cbranch_vccnz .LBB55_6
7305 ; GCN3-NEXT: .LBB55_2: ; %atomicrmw.phi
7306 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7307 ; GCN3-NEXT: .LBB55_3: ; %atomicrmw.global
7308 ; GCN3-NEXT: v_mov_b32_e32 v4, s34
7309 ; GCN3-NEXT: v_mov_b32_e32 v5, s35
7310 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
7311 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
7312 ; GCN3-NEXT: .LBB55_4: ; %atomicrmw.start
7313 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7314 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7315 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v3
7316 ; GCN3-NEXT: v_and_b32_e32 v6, s6, v2
7317 ; GCN3-NEXT: v_not_b32_e32 v1, v0
7318 ; GCN3-NEXT: v_not_b32_e32 v0, v6
7319 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
7320 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7321 ; GCN3-NEXT: buffer_wbinvl1_vol
7322 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
7323 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
7324 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7325 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
7326 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
7327 ; GCN3-NEXT: s_cbranch_execnz .LBB55_4
7328 ; GCN3-NEXT: ; %bb.5: ; %Flow
7329 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
7330 ; GCN3-NEXT: s_branch .LBB55_2
7331 ; GCN3-NEXT: .LBB55_6: ; %atomicrmw.private
7332 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
7333 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
7334 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
7335 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
7336 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
7337 ; GCN3-NEXT: s_waitcnt vmcnt(1)
7338 ; GCN3-NEXT: v_and_b32_e32 v1, s7, v1
7339 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7340 ; GCN3-NEXT: v_and_b32_e32 v2, s6, v2
7341 ; GCN3-NEXT: v_not_b32_e32 v2, v2
7342 ; GCN3-NEXT: v_not_b32_e32 v1, v1
7343 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7344 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
7345 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7346 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7347 %gep = getelementptr i64, ptr %out, i64 4
7348 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw nand` of an i64 through a flat `ptr`. The pointer
; and the operand are both passed `inreg` under the amdgpu_gfx calling
; convention, and the function is declared to return i64.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand. For all three RUN lines they show the same overall shape:
;   * Entry: the high dword of the pointer (s5) is compared against a value
;     that is loaded from address 0xe4 on bonaire/tonga and taken from
;     src_private_base on gfx900 -- presumably the private aperture base;
;     TODO confirm against the AMDGPU backend documentation.
;   * %atomicrmw.global: a flat_atomic_cmpswap_x2 retry loop. Each iteration
;     computes ~(old & in) with v_and/v_not, and lanes leave the loop once the
;     value returned by the cmpswap equals the expected one.
;   * %atomicrmw.private: the low 32 bits of the pointer (or -1 when the
;     pointer compares equal to 0) are used as a buffer offset, and the
;     operation is done with plain buffer_load_dword / v_and / v_not /
;     buffer_store_dword on each half; no atomic instruction is used there.
7352 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
7353 ; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar:
7355 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7356 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
7357 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
7358 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7359 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
7360 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
7361 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
7362 ; GCN1-NEXT: s_cbranch_vccz .LBB56_4
7363 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
7364 ; GCN1-NEXT: s_add_u32 s34, s4, 4
7365 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7366 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
7367 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
7368 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7369 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
7370 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
7371 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
7372 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
7373 ; GCN1-NEXT: .LBB56_2: ; %atomicrmw.start
7374 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7375 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7376 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
7377 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
7378 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
7379 ; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
7380 ; GCN1-NEXT: v_not_b32_e32 v5, v0
7381 ; GCN1-NEXT: v_not_b32_e32 v4, v1
7382 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7383 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7384 ; GCN1-NEXT: buffer_wbinvl1_vol
7385 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7386 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7387 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
7388 ; GCN1-NEXT: s_cbranch_execnz .LBB56_2
7389 ; GCN1-NEXT: ; %bb.3: ; %Flow
7390 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
7391 ; GCN1-NEXT: s_branch .LBB56_6
7392 ; GCN1-NEXT: .LBB56_4:
7393 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
7394 ; GCN1-NEXT: s_cbranch_execz .LBB56_6
7395 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
7396 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
7397 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
7398 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
7399 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7400 ; GCN1-NEXT: s_add_i32 s34, s34, 4
7401 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
7402 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7403 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
7404 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7405 ; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
7406 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7407 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
7408 ; GCN1-NEXT: v_not_b32_e32 v4, v4
7409 ; GCN1-NEXT: v_not_b32_e32 v5, v5
7410 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7411 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
7412 ; GCN1-NEXT: .LBB56_6: ; %atomicrmw.phi
7413 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7414 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7416 ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar:
7418 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7419 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
7420 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
7421 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7422 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
7423 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
7424 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
7425 ; GCN2-NEXT: s_cbranch_vccz .LBB56_4
7426 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
7427 ; GCN2-NEXT: s_add_u32 s34, s4, 4
7428 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7429 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
7430 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
7431 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7432 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
7433 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
7434 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
7435 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
7436 ; GCN2-NEXT: .LBB56_2: ; %atomicrmw.start
7437 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7438 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7439 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
7440 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
7441 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
7442 ; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
7443 ; GCN2-NEXT: v_not_b32_e32 v5, v0
7444 ; GCN2-NEXT: v_not_b32_e32 v4, v1
7445 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7446 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7447 ; GCN2-NEXT: buffer_wbinvl1_vol
7448 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7449 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7450 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
7451 ; GCN2-NEXT: s_cbranch_execnz .LBB56_2
7452 ; GCN2-NEXT: ; %bb.3: ; %Flow
7453 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
7454 ; GCN2-NEXT: s_branch .LBB56_6
7455 ; GCN2-NEXT: .LBB56_4:
7456 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
7457 ; GCN2-NEXT: s_cbranch_execz .LBB56_6
7458 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
7459 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
7460 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
7461 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7462 ; GCN2-NEXT: s_add_i32 s34, s34, 4
7463 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
7464 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7465 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
7466 ; GCN2-NEXT: s_waitcnt vmcnt(1)
7467 ; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
7468 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7469 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
7470 ; GCN2-NEXT: v_not_b32_e32 v4, v4
7471 ; GCN2-NEXT: v_not_b32_e32 v5, v5
7472 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7473 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
7474 ; GCN2-NEXT: .LBB56_6: ; %atomicrmw.phi
7475 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7476 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7478 ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar:
7480 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7481 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
7482 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
7483 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
7484 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
7485 ; GCN3-NEXT: s_cbranch_vccz .LBB56_4
7486 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
7487 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7488 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
7489 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
7490 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
7491 ; GCN3-NEXT: .LBB56_2: ; %atomicrmw.start
7492 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7493 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7494 ; GCN3-NEXT: v_mov_b32_e32 v7, v1
7495 ; GCN3-NEXT: v_mov_b32_e32 v6, v0
7496 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
7497 ; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
7498 ; GCN3-NEXT: v_not_b32_e32 v5, v0
7499 ; GCN3-NEXT: v_not_b32_e32 v4, v1
7500 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7501 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7502 ; GCN3-NEXT: buffer_wbinvl1_vol
7503 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7504 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
7505 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
7506 ; GCN3-NEXT: s_cbranch_execnz .LBB56_2
7507 ; GCN3-NEXT: ; %bb.3: ; %Flow
7508 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
7509 ; GCN3-NEXT: s_branch .LBB56_6
7510 ; GCN3-NEXT: .LBB56_4:
7511 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
7512 ; GCN3-NEXT: s_cbranch_execz .LBB56_6
7513 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
7514 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
7515 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
7516 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
7517 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
7518 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7519 ; GCN3-NEXT: s_waitcnt vmcnt(1)
7520 ; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
7521 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7522 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
7523 ; GCN3-NEXT: v_not_b32_e32 v4, v4
7524 ; GCN3-NEXT: v_not_b32_e32 v3, v3
7525 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7526 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
7527 ; GCN3-NEXT: .LBB56_6: ; %atomicrmw.phi
7528 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7529 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the nand is applied directly to %ptr, with no offset.
7530 %result = atomicrmw nand ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw nand` of an i64 through a flat `ptr` at a constant
; offset. %out and %in are both passed `inreg` under the amdgpu_gfx calling
; convention, and the function is declared to return i64.
;
; The IR addresses element 4 of an i64 array starting at %out; in the checks
; this shows up as a 64-bit scalar add of 32 to s[4:5] (s_add_u32 / s_addc_u32
; into s[34:35]) before anything else uses the pointer.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand. For all three RUN lines they show the same overall shape:
;   * Entry: the high dword of the offset pointer (s35) is compared against a
;     value that is loaded from address 0xe4 on bonaire/tonga and taken from
;     src_private_base on gfx900 -- presumably the private aperture base;
;     TODO confirm against the AMDGPU backend documentation.
;   * %atomicrmw.global: a flat_atomic_cmpswap_x2 retry loop. Each iteration
;     computes ~(old & in) with v_and/v_not, and lanes leave the loop once the
;     value returned by the cmpswap equals the expected one.
;   * %atomicrmw.private: the low 32 bits of the offset pointer (or -1 when it
;     compares equal to 0) are used as a buffer offset, and the operation is
;     done with plain buffer_load_dword / v_and / v_not / buffer_store_dword
;     on each half; no atomic instruction is used there.
7534 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
7535 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
7537 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7538 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
7539 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
7540 ; GCN1-NEXT: s_add_u32 s34, s4, 32
7541 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
7542 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7543 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
7544 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
7545 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7546 ; GCN1-NEXT: s_cbranch_vccz .LBB57_4
7547 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
7548 ; GCN1-NEXT: s_add_u32 s36, s34, 4
7549 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
7550 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
7551 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
7552 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7553 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
7554 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
7555 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
7556 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
7557 ; GCN1-NEXT: .LBB57_2: ; %atomicrmw.start
7558 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7559 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7560 ; GCN1-NEXT: v_mov_b32_e32 v7, v1
7561 ; GCN1-NEXT: v_mov_b32_e32 v6, v0
7562 ; GCN1-NEXT: v_and_b32_e32 v0, s7, v7
7563 ; GCN1-NEXT: v_and_b32_e32 v1, s6, v6
7564 ; GCN1-NEXT: v_not_b32_e32 v5, v0
7565 ; GCN1-NEXT: v_not_b32_e32 v4, v1
7566 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7567 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7568 ; GCN1-NEXT: buffer_wbinvl1_vol
7569 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7570 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7571 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
7572 ; GCN1-NEXT: s_cbranch_execnz .LBB57_2
7573 ; GCN1-NEXT: ; %bb.3: ; %Flow
7574 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
7575 ; GCN1-NEXT: s_branch .LBB57_6
7576 ; GCN1-NEXT: .LBB57_4:
7577 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
7578 ; GCN1-NEXT: s_cbranch_execz .LBB57_6
7579 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
7580 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
7581 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
7582 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
7583 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
7584 ; GCN1-NEXT: s_add_i32 s34, s34, 4
7585 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
7586 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7587 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
7588 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7589 ; GCN1-NEXT: v_and_b32_e32 v4, s6, v0
7590 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7591 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
7592 ; GCN1-NEXT: v_not_b32_e32 v4, v4
7593 ; GCN1-NEXT: v_not_b32_e32 v5, v5
7594 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7595 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
7596 ; GCN1-NEXT: .LBB57_6: ; %atomicrmw.phi
7597 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7598 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7600 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
7602 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7603 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
7604 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
7605 ; GCN2-NEXT: s_add_u32 s34, s4, 32
7606 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
7607 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7608 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
7609 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
7610 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7611 ; GCN2-NEXT: s_cbranch_vccz .LBB57_4
7612 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
7613 ; GCN2-NEXT: s_add_u32 s36, s34, 4
7614 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
7615 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
7616 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
7617 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7618 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
7619 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
7620 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
7621 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
7622 ; GCN2-NEXT: .LBB57_2: ; %atomicrmw.start
7623 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7624 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7625 ; GCN2-NEXT: v_mov_b32_e32 v7, v1
7626 ; GCN2-NEXT: v_mov_b32_e32 v6, v0
7627 ; GCN2-NEXT: v_and_b32_e32 v0, s7, v7
7628 ; GCN2-NEXT: v_and_b32_e32 v1, s6, v6
7629 ; GCN2-NEXT: v_not_b32_e32 v5, v0
7630 ; GCN2-NEXT: v_not_b32_e32 v4, v1
7631 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7632 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7633 ; GCN2-NEXT: buffer_wbinvl1_vol
7634 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7635 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7636 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
7637 ; GCN2-NEXT: s_cbranch_execnz .LBB57_2
7638 ; GCN2-NEXT: ; %bb.3: ; %Flow
7639 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
7640 ; GCN2-NEXT: s_branch .LBB57_6
7641 ; GCN2-NEXT: .LBB57_4:
7642 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
7643 ; GCN2-NEXT: s_cbranch_execz .LBB57_6
7644 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
7645 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
7646 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
7647 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
7648 ; GCN2-NEXT: s_add_i32 s34, s34, 4
7649 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
7650 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7651 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
7652 ; GCN2-NEXT: s_waitcnt vmcnt(1)
7653 ; GCN2-NEXT: v_and_b32_e32 v4, s6, v0
7654 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7655 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
7656 ; GCN2-NEXT: v_not_b32_e32 v4, v4
7657 ; GCN2-NEXT: v_not_b32_e32 v5, v5
7658 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7659 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
7660 ; GCN2-NEXT: .LBB57_6: ; %atomicrmw.phi
7661 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7662 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7664 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
7666 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7667 ; GCN3-NEXT: s_add_u32 s34, s4, 32
7668 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
7669 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
7670 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
7671 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
7672 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
7673 ; GCN3-NEXT: s_cbranch_vccz .LBB57_4
7674 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
7675 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
7676 ; GCN3-NEXT: v_mov_b32_e32 v3, s35
7677 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
7678 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
7679 ; GCN3-NEXT: .LBB57_2: ; %atomicrmw.start
7680 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7681 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7682 ; GCN3-NEXT: v_mov_b32_e32 v7, v1
7683 ; GCN3-NEXT: v_mov_b32_e32 v6, v0
7684 ; GCN3-NEXT: v_and_b32_e32 v0, s7, v7
7685 ; GCN3-NEXT: v_and_b32_e32 v1, s6, v6
7686 ; GCN3-NEXT: v_not_b32_e32 v5, v0
7687 ; GCN3-NEXT: v_not_b32_e32 v4, v1
7688 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
7689 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7690 ; GCN3-NEXT: buffer_wbinvl1_vol
7691 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
7692 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
7693 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
7694 ; GCN3-NEXT: s_cbranch_execnz .LBB57_2
7695 ; GCN3-NEXT: ; %bb.3: ; %Flow
7696 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
7697 ; GCN3-NEXT: s_branch .LBB57_6
7698 ; GCN3-NEXT: .LBB57_4:
7699 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
7700 ; GCN3-NEXT: s_cbranch_execz .LBB57_6
7701 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
7702 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
7703 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
7704 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
7705 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
7706 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
7707 ; GCN3-NEXT: s_waitcnt vmcnt(1)
7708 ; GCN3-NEXT: v_and_b32_e32 v3, s7, v1
7709 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7710 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
7711 ; GCN3-NEXT: v_not_b32_e32 v4, v4
7712 ; GCN3-NEXT: v_not_b32_e32 v3, v3
7713 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
7714 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
7715 ; GCN3-NEXT: .LBB57_6: ; %atomicrmw.phi
7716 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7717 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: %gep is element 4 of an i64 array at %out (the +32 byte add
; seen in the checks), and the nand is applied to that address.
7718 %gep = getelementptr i64, ptr %out, i64 4
7719 %result = atomicrmw nand ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw nand` of an i64 through a flat `ptr` at a constant
; offset, where the atomicrmw carries `!amdgpu.no.remote.memory !0` metadata
; and its result (%tmp0) is not referenced in the visible IR. The function
; returns void and takes %out and %in without `inreg`; the checks read the
; pointer from v[0:1] and the operand from v[2:3].
; NOTE(review): the definition of !0 is outside this part of the file.
;
; The IR addresses element 4 of an i64 array starting at %out; in the checks
; this shows up as a 64-bit vector add of 32 to v[0:1].
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand. With the metadata present, all three RUN lines still show a
; compare-and-swap loop rather than a single atomic instruction:
;   * Entry: the high dword of the offset pointer (v1) is compared against a
;     value that is loaded from address 0xe4 on bonaire/tonga and taken from
;     src_private_base on gfx900 -- presumably the private aperture base;
;     TODO confirm against the AMDGPU backend documentation. The two paths are
;     selected by exec masking (s_and_saveexec_b64 / s_xor_b64 /
;     s_andn2_saveexec_b64) rather than by a scalar condition branch alone.
;   * %atomicrmw.global: a flat_atomic_cmpswap_x2 retry loop. Each iteration
;     computes ~(old & in) with v_and/v_not, and lanes leave the loop once the
;     value returned by the cmpswap equals the expected one.
;   * %atomicrmw.private: v_cndmask selects the low 32 bits of the pointer (or
;     -1 when the pointer compares equal to 0) as a buffer offset, and the
;     operation is done with plain buffer_load_dword / v_and / v_not /
;     buffer_store_dword on each half; no atomic instruction is used there.
7723 define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
7724 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
7726 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7727 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
7728 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
7729 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
7730 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7731 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7732 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
7733 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
7734 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
7735 ; GCN1-NEXT: s_cbranch_execnz .LBB58_3
7736 ; GCN1-NEXT: ; %bb.1: ; %Flow3
7737 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7738 ; GCN1-NEXT: s_cbranch_execnz .LBB58_6
7739 ; GCN1-NEXT: .LBB58_2: ; %atomicrmw.phi
7740 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7741 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7742 ; GCN1-NEXT: .LBB58_3: ; %atomicrmw.global
7743 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
7744 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
7745 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
7746 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
7747 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
7748 ; GCN1-NEXT: .LBB58_4: ; %atomicrmw.start
7749 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7750 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7751 ; GCN1-NEXT: v_and_b32_e32 v4, v7, v3
7752 ; GCN1-NEXT: v_and_b32_e32 v8, v6, v2
7753 ; GCN1-NEXT: v_not_b32_e32 v5, v4
7754 ; GCN1-NEXT: v_not_b32_e32 v4, v8
7755 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7756 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7757 ; GCN1-NEXT: buffer_wbinvl1_vol
7758 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7759 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
7760 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7761 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
7762 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
7763 ; GCN1-NEXT: s_cbranch_execnz .LBB58_4
7764 ; GCN1-NEXT: ; %bb.5: ; %Flow
7765 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
7766 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
7767 ; GCN1-NEXT: ; implicit-def: $vgpr3
7768 ; GCN1-NEXT: ; implicit-def: $vgpr2
7769 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7770 ; GCN1-NEXT: s_cbranch_execz .LBB58_2
7771 ; GCN1-NEXT: .LBB58_6: ; %atomicrmw.private
7772 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7773 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
7774 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
7775 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
7776 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
7777 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7778 ; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
7779 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7780 ; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
7781 ; GCN1-NEXT: v_not_b32_e32 v2, v2
7782 ; GCN1-NEXT: v_not_b32_e32 v3, v3
7783 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7784 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7785 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7786 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7787 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7789 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
7791 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7792 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
7793 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
7794 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
7795 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
7796 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7797 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
7798 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
7799 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
7800 ; GCN2-NEXT: s_cbranch_execnz .LBB58_3
7801 ; GCN2-NEXT: ; %bb.1: ; %Flow3
7802 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7803 ; GCN2-NEXT: s_cbranch_execnz .LBB58_6
7804 ; GCN2-NEXT: .LBB58_2: ; %atomicrmw.phi
7805 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
7806 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7807 ; GCN2-NEXT: .LBB58_3: ; %atomicrmw.global
7808 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
7809 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
7810 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
7811 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
7812 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
7813 ; GCN2-NEXT: .LBB58_4: ; %atomicrmw.start
7814 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
7815 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7816 ; GCN2-NEXT: v_and_b32_e32 v4, v7, v3
7817 ; GCN2-NEXT: v_and_b32_e32 v8, v6, v2
7818 ; GCN2-NEXT: v_not_b32_e32 v5, v4
7819 ; GCN2-NEXT: v_not_b32_e32 v4, v8
7820 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7821 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7822 ; GCN2-NEXT: buffer_wbinvl1_vol
7823 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7824 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
7825 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7826 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
7827 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
7828 ; GCN2-NEXT: s_cbranch_execnz .LBB58_4
7829 ; GCN2-NEXT: ; %bb.5: ; %Flow
7830 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
7831 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
7832 ; GCN2-NEXT: ; implicit-def: $vgpr3
7833 ; GCN2-NEXT: ; implicit-def: $vgpr2
7834 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7835 ; GCN2-NEXT: s_cbranch_execz .LBB58_2
7836 ; GCN2-NEXT: .LBB58_6: ; %atomicrmw.private
7837 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7838 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
7839 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
7840 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
7841 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
7842 ; GCN2-NEXT: s_waitcnt vmcnt(1)
7843 ; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
7844 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7845 ; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
7846 ; GCN2-NEXT: v_not_b32_e32 v2, v2
7847 ; GCN2-NEXT: v_not_b32_e32 v3, v3
7848 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7849 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
7850 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
7851 ; GCN2-NEXT: s_waitcnt vmcnt(0)
7852 ; GCN2-NEXT: s_setpc_b64 s[30:31]
7854 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
7856 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7857 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
7858 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7859 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
7860 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
7861 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
7862 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
7863 ; GCN3-NEXT: s_cbranch_execnz .LBB58_3
7864 ; GCN3-NEXT: ; %bb.1: ; %Flow3
7865 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7866 ; GCN3-NEXT: s_cbranch_execnz .LBB58_6
7867 ; GCN3-NEXT: .LBB58_2: ; %atomicrmw.phi
7868 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
7869 ; GCN3-NEXT: s_setpc_b64 s[30:31]
7870 ; GCN3-NEXT: .LBB58_3: ; %atomicrmw.global
7871 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
7872 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
7873 ; GCN3-NEXT: .LBB58_4: ; %atomicrmw.start
7874 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
7875 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7876 ; GCN3-NEXT: v_and_b32_e32 v4, v7, v3
7877 ; GCN3-NEXT: v_and_b32_e32 v8, v6, v2
7878 ; GCN3-NEXT: v_not_b32_e32 v5, v4
7879 ; GCN3-NEXT: v_not_b32_e32 v4, v8
7880 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
7881 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7882 ; GCN3-NEXT: buffer_wbinvl1_vol
7883 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
7884 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
7885 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7886 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
7887 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
7888 ; GCN3-NEXT: s_cbranch_execnz .LBB58_4
7889 ; GCN3-NEXT: ; %bb.5: ; %Flow
7890 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
7891 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
7892 ; GCN3-NEXT: ; implicit-def: $vgpr3
7893 ; GCN3-NEXT: ; implicit-def: $vgpr2
7894 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7895 ; GCN3-NEXT: s_cbranch_execz .LBB58_2
7896 ; GCN3-NEXT: .LBB58_6: ; %atomicrmw.private
7897 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
7898 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
7899 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
7900 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
7901 ; GCN3-NEXT: s_waitcnt vmcnt(1)
7902 ; GCN3-NEXT: v_and_b32_e32 v1, v1, v3
7903 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7904 ; GCN3-NEXT: v_and_b32_e32 v2, v4, v2
7905 ; GCN3-NEXT: v_not_b32_e32 v2, v2
7906 ; GCN3-NEXT: v_not_b32_e32 v1, v1
7907 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
7908 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
7909 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
7910 ; GCN3-NEXT: s_waitcnt vmcnt(0)
7911 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: %gep is element 4 of an i64 array at %out (the +32 byte add
; seen in the checks); the atomicrmw on it is tagged with
; !amdgpu.no.remote.memory.
7912 %gep = getelementptr i64, ptr %out, i64 4
7913 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
7917 define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
7918 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
7920 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7921 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
7922 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
7923 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
7924 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
7925 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7926 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
7927 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
7928 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
7929 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
7930 ; GCN1-NEXT: s_cbranch_execnz .LBB59_3
7931 ; GCN1-NEXT: ; %bb.1: ; %Flow3
7932 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7933 ; GCN1-NEXT: s_cbranch_execnz .LBB59_6
7934 ; GCN1-NEXT: .LBB59_2: ; %atomicrmw.phi
7935 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7936 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7937 ; GCN1-NEXT: .LBB59_3: ; %atomicrmw.global
7938 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
7939 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
7940 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
7941 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
7942 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
7943 ; GCN1-NEXT: .LBB59_4: ; %atomicrmw.start
7944 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
7945 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7946 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
7947 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
7948 ; GCN1-NEXT: v_and_b32_e32 v0, v9, v3
7949 ; GCN1-NEXT: v_and_b32_e32 v1, v8, v2
7950 ; GCN1-NEXT: v_not_b32_e32 v7, v0
7951 ; GCN1-NEXT: v_not_b32_e32 v6, v1
7952 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
7953 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7954 ; GCN1-NEXT: buffer_wbinvl1_vol
7955 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
7956 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7957 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
7958 ; GCN1-NEXT: s_cbranch_execnz .LBB59_4
7959 ; GCN1-NEXT: ; %bb.5: ; %Flow
7960 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
7961 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
7962 ; GCN1-NEXT: ; implicit-def: $vgpr3
7963 ; GCN1-NEXT: ; implicit-def: $vgpr2
7964 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7965 ; GCN1-NEXT: s_cbranch_execz .LBB59_2
7966 ; GCN1-NEXT: .LBB59_6: ; %atomicrmw.private
7967 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
7968 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
7969 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
7970 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
7971 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
7972 ; GCN1-NEXT: s_waitcnt vmcnt(1)
7973 ; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
7974 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7975 ; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
7976 ; GCN1-NEXT: v_not_b32_e32 v2, v2
7977 ; GCN1-NEXT: v_not_b32_e32 v3, v3
7978 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
7979 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
7980 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
7981 ; GCN1-NEXT: s_waitcnt vmcnt(0)
7982 ; GCN1-NEXT: s_setpc_b64 s[30:31]
7984 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
7986 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7987 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
7988 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
7989 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
7990 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
7991 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7992 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
7993 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
7994 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
7995 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
7996 ; GCN2-NEXT: s_cbranch_execnz .LBB59_3
7997 ; GCN2-NEXT: ; %bb.1: ; %Flow3
7998 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
7999 ; GCN2-NEXT: s_cbranch_execnz .LBB59_6
8000 ; GCN2-NEXT: .LBB59_2: ; %atomicrmw.phi
8001 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8002 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8003 ; GCN2-NEXT: .LBB59_3: ; %atomicrmw.global
8004 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
8005 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
8006 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
8007 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
8008 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
8009 ; GCN2-NEXT: .LBB59_4: ; %atomicrmw.start
8010 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
8011 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8012 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
8013 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
8014 ; GCN2-NEXT: v_and_b32_e32 v0, v9, v3
8015 ; GCN2-NEXT: v_and_b32_e32 v1, v8, v2
8016 ; GCN2-NEXT: v_not_b32_e32 v7, v0
8017 ; GCN2-NEXT: v_not_b32_e32 v6, v1
8018 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
8019 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8020 ; GCN2-NEXT: buffer_wbinvl1_vol
8021 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8022 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8023 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
8024 ; GCN2-NEXT: s_cbranch_execnz .LBB59_4
8025 ; GCN2-NEXT: ; %bb.5: ; %Flow
8026 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
8027 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
8028 ; GCN2-NEXT: ; implicit-def: $vgpr3
8029 ; GCN2-NEXT: ; implicit-def: $vgpr2
8030 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8031 ; GCN2-NEXT: s_cbranch_execz .LBB59_2
8032 ; GCN2-NEXT: .LBB59_6: ; %atomicrmw.private
8033 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8034 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8035 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
8036 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8037 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
8038 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8039 ; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
8040 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8041 ; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
8042 ; GCN2-NEXT: v_not_b32_e32 v2, v2
8043 ; GCN2-NEXT: v_not_b32_e32 v3, v3
8044 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8045 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
8046 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8047 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8048 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8050 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
8052 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8053 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
8054 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
8055 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
8056 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
8057 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8058 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
8059 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8060 ; GCN3-NEXT: s_cbranch_execnz .LBB59_3
8061 ; GCN3-NEXT: ; %bb.1: ; %Flow3
8062 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8063 ; GCN3-NEXT: s_cbranch_execnz .LBB59_6
8064 ; GCN3-NEXT: .LBB59_2: ; %atomicrmw.phi
8065 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8066 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8067 ; GCN3-NEXT: .LBB59_3: ; %atomicrmw.global
8068 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
8069 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
8070 ; GCN3-NEXT: .LBB59_4: ; %atomicrmw.start
8071 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
8072 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8073 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
8074 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
8075 ; GCN3-NEXT: v_and_b32_e32 v0, v9, v3
8076 ; GCN3-NEXT: v_and_b32_e32 v1, v8, v2
8077 ; GCN3-NEXT: v_not_b32_e32 v7, v0
8078 ; GCN3-NEXT: v_not_b32_e32 v6, v1
8079 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
8080 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8081 ; GCN3-NEXT: buffer_wbinvl1_vol
8082 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
8083 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
8084 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
8085 ; GCN3-NEXT: s_cbranch_execnz .LBB59_4
8086 ; GCN3-NEXT: ; %bb.5: ; %Flow
8087 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
8088 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
8089 ; GCN3-NEXT: ; implicit-def: $vgpr3
8090 ; GCN3-NEXT: ; implicit-def: $vgpr2
8091 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8092 ; GCN3-NEXT: s_cbranch_execz .LBB59_2
8093 ; GCN3-NEXT: .LBB59_6: ; %atomicrmw.private
8094 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8095 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8096 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
8097 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8098 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8099 ; GCN3-NEXT: v_and_b32_e32 v3, v1, v3
8100 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8101 ; GCN3-NEXT: v_and_b32_e32 v2, v0, v2
8102 ; GCN3-NEXT: v_not_b32_e32 v2, v2
8103 ; GCN3-NEXT: v_not_b32_e32 v3, v3
8104 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8105 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
8106 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8107 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8108 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8109 %gep = getelementptr i64, ptr %out, i64 4
8110 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
8114 ; ---------------------------------------------------------------------
8116 ; ---------------------------------------------------------------------
; Test case: seq_cst `atomicrmw or` of an i64 through the flat pointer %ptr in
; a function returning void.
; The expected code for every run line compares the pointer's high dword (v1)
; against a base value -- a dword loaded through s[4:5] = 0xe4 for the first
; two run lines, s5 of src_private_base for gfx900 -- and splits exec in two:
;   %atomicrmw.global  - a single flat_atomic_or_x2, then buffer_wbinvl1_vol
;   %atomicrmw.private - a zero pointer is replaced by -1, then the two dwords
;                        are loaded, or'ed with %in and stored back via s[0:3]
8118 define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
8119 ; GCN1-LABEL: flat_atomic_or_i64_noret:
8121 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8122 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
8123 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
8124 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8125 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
8126 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
8127 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8128 ; GCN1-NEXT: s_cbranch_execnz .LBB60_3
8129 ; GCN1-NEXT: ; %bb.1: ; %Flow
8130 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8131 ; GCN1-NEXT: s_cbranch_execnz .LBB60_4
8132 ; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi
8133 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8134 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8135 ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global
8136 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8137 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8138 ; GCN1-NEXT: buffer_wbinvl1_vol
8139 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
8140 ; GCN1-NEXT: ; implicit-def: $vgpr3
8141 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8142 ; GCN1-NEXT: s_cbranch_execz .LBB60_2
8143 ; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private
8144 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8145 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8146 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
8147 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8148 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
8149 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8150 ; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
8151 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8152 ; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
8153 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8154 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8155 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8156 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8157 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8159 ; GCN2-LABEL: flat_atomic_or_i64_noret:
8161 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8162 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
8163 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
8164 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8165 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
8166 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
8167 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8168 ; GCN2-NEXT: s_cbranch_execnz .LBB60_3
8169 ; GCN2-NEXT: ; %bb.1: ; %Flow
8170 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8171 ; GCN2-NEXT: s_cbranch_execnz .LBB60_4
8172 ; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi
8173 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8174 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8175 ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global
8176 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8177 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8178 ; GCN2-NEXT: buffer_wbinvl1_vol
8179 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
8180 ; GCN2-NEXT: ; implicit-def: $vgpr3
8181 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8182 ; GCN2-NEXT: s_cbranch_execz .LBB60_2
8183 ; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private
8184 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8185 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8186 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
8187 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8188 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
8189 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8190 ; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
8191 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8192 ; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
8193 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8194 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8195 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8196 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8197 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8199 ; GCN3-LABEL: flat_atomic_or_i64_noret:
8201 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8202 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
8203 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
8204 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
8205 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8206 ; GCN3-NEXT: s_cbranch_execnz .LBB60_3
8207 ; GCN3-NEXT: ; %bb.1: ; %Flow
8208 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8209 ; GCN3-NEXT: s_cbranch_execnz .LBB60_4
8210 ; GCN3-NEXT: .LBB60_2: ; %atomicrmw.phi
8211 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8212 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8213 ; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global
8214 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8215 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8216 ; GCN3-NEXT: buffer_wbinvl1_vol
8217 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8218 ; GCN3-NEXT: ; implicit-def: $vgpr3
8219 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8220 ; GCN3-NEXT: s_cbranch_execz .LBB60_2
8221 ; GCN3-NEXT: .LBB60_4: ; %atomicrmw.private
8222 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8223 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8224 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
8225 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8226 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8227 ; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
8228 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8229 ; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
8230 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8231 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
8232 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8233 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8234 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8235 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
; Test case: seq_cst `atomicrmw or` of an i64 at %out + 4 elements (the
; getelementptr indexes i64 by 4) in a function returning void.
; The expected code first adds 32 to the 64-bit pointer in v[0:1]
; (add + add-with-carry), then follows the two-path shape: the high dword is
; compared against a base value (dword loaded through s[4:5] = 0xe4, or s5 of
; src_private_base for gfx900) to choose between
;   %atomicrmw.global  - a single flat_atomic_or_x2, then buffer_wbinvl1_vol
;   %atomicrmw.private - a zero pointer is replaced by -1, then the two dwords
;                        are loaded, or'ed with %in and stored back via s[0:3]
8239 define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
8240 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset:
8242 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8243 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
8244 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
8245 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
8246 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8247 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8248 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
8249 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
8250 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8251 ; GCN1-NEXT: s_cbranch_execnz .LBB61_3
8252 ; GCN1-NEXT: ; %bb.1: ; %Flow
8253 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8254 ; GCN1-NEXT: s_cbranch_execnz .LBB61_4
8255 ; GCN1-NEXT: .LBB61_2: ; %atomicrmw.phi
8256 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8257 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8258 ; GCN1-NEXT: .LBB61_3: ; %atomicrmw.global
8259 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8260 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8261 ; GCN1-NEXT: buffer_wbinvl1_vol
8262 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
8263 ; GCN1-NEXT: ; implicit-def: $vgpr3
8264 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8265 ; GCN1-NEXT: s_cbranch_execz .LBB61_2
8266 ; GCN1-NEXT: .LBB61_4: ; %atomicrmw.private
8267 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8268 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8269 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
8270 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8271 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
8272 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8273 ; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
8274 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8275 ; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
8276 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8277 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8278 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8279 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8280 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8282 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset:
8284 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8285 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
8286 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
8287 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
8288 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
8289 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8290 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
8291 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
8292 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8293 ; GCN2-NEXT: s_cbranch_execnz .LBB61_3
8294 ; GCN2-NEXT: ; %bb.1: ; %Flow
8295 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8296 ; GCN2-NEXT: s_cbranch_execnz .LBB61_4
8297 ; GCN2-NEXT: .LBB61_2: ; %atomicrmw.phi
8298 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8299 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8300 ; GCN2-NEXT: .LBB61_3: ; %atomicrmw.global
8301 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8302 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8303 ; GCN2-NEXT: buffer_wbinvl1_vol
8304 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
8305 ; GCN2-NEXT: ; implicit-def: $vgpr3
8306 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8307 ; GCN2-NEXT: s_cbranch_execz .LBB61_2
8308 ; GCN2-NEXT: .LBB61_4: ; %atomicrmw.private
8309 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8310 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8311 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
8312 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8313 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
8314 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8315 ; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
8316 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8317 ; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
8318 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8319 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8320 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8321 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8322 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8324 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset:
8326 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8327 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
8328 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8329 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
8330 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
8331 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
8332 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8333 ; GCN3-NEXT: s_cbranch_execnz .LBB61_3
8334 ; GCN3-NEXT: ; %bb.1: ; %Flow
8335 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8336 ; GCN3-NEXT: s_cbranch_execnz .LBB61_4
8337 ; GCN3-NEXT: .LBB61_2: ; %atomicrmw.phi
8338 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8339 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8340 ; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global
8341 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8342 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8343 ; GCN3-NEXT: buffer_wbinvl1_vol
8344 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8345 ; GCN3-NEXT: ; implicit-def: $vgpr3
8346 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8347 ; GCN3-NEXT: s_cbranch_execz .LBB61_2
8348 ; GCN3-NEXT: .LBB61_4: ; %atomicrmw.private
8349 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
8350 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
8351 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
8352 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
8353 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8354 ; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
8355 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8356 ; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
8357 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8358 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
8359 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8360 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8361 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8362 %gep = getelementptr i64, ptr %out, i64 4
8363 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
; Test case: seq_cst `atomicrmw or` of an i64 through the flat pointer %ptr in
; a function declared to return i64; the atomic's value is named %result.
; The expected code moves the pointer to v[4:5] so v[0:1] can receive the old
; value, compares the pointer's high dword (v5) against a base value (dword
; loaded through s[4:5] = 0xe4, or s5 of src_private_base for gfx900), and
; splits exec in two:
;   %atomicrmw.global  - flat_atomic_or_x2 with glc writing v[0:1], then
;                        buffer_wbinvl1_vol
;   %atomicrmw.private - a zero pointer is replaced by -1; the old dwords are
;                        loaded into v0/v1 and the or'ed values in v2/v3 are
;                        stored back via s[0:3]
8367 define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
8368 ; GCN1-LABEL: flat_atomic_or_i64_ret:
8370 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8371 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
8372 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
8373 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
8374 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
8375 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
8376 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8377 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
8378 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
8379 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8380 ; GCN1-NEXT: s_cbranch_execnz .LBB62_3
8381 ; GCN1-NEXT: ; %bb.1: ; %Flow
8382 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8383 ; GCN1-NEXT: s_cbranch_execnz .LBB62_4
8384 ; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi
8385 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8386 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8387 ; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global
8388 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8389 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8390 ; GCN1-NEXT: buffer_wbinvl1_vol
8391 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
8392 ; GCN1-NEXT: ; implicit-def: $vgpr3
8393 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8394 ; GCN1-NEXT: s_cbranch_execz .LBB62_2
8395 ; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private
8396 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8397 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8398 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
8399 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8400 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
8401 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8402 ; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
8403 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8404 ; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
8405 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8406 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
8407 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8408 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8409 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8411 ; GCN2-LABEL: flat_atomic_or_i64_ret:
8413 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8414 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
8415 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
8416 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
8417 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
8418 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
8419 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8420 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
8421 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
8422 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8423 ; GCN2-NEXT: s_cbranch_execnz .LBB62_3
8424 ; GCN2-NEXT: ; %bb.1: ; %Flow
8425 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8426 ; GCN2-NEXT: s_cbranch_execnz .LBB62_4
8427 ; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi
8428 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8429 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8430 ; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global
8431 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8432 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8433 ; GCN2-NEXT: buffer_wbinvl1_vol
8434 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
8435 ; GCN2-NEXT: ; implicit-def: $vgpr3
8436 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8437 ; GCN2-NEXT: s_cbranch_execz .LBB62_2
8438 ; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private
8439 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8440 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8441 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
8442 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8443 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
8444 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8445 ; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
8446 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8447 ; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
8448 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8449 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
8450 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8451 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8452 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8454 ; GCN3-LABEL: flat_atomic_or_i64_ret:
8456 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8457 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
8458 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
8459 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
8460 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
8461 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8462 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
8463 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8464 ; GCN3-NEXT: s_cbranch_execnz .LBB62_3
8465 ; GCN3-NEXT: ; %bb.1: ; %Flow
8466 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8467 ; GCN3-NEXT: s_cbranch_execnz .LBB62_4
8468 ; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi
8469 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8470 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8471 ; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global
8472 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8473 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8474 ; GCN3-NEXT: buffer_wbinvl1_vol
8475 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
8476 ; GCN3-NEXT: ; implicit-def: $vgpr3
8477 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8478 ; GCN3-NEXT: s_cbranch_execz .LBB62_2
8479 ; GCN3-NEXT: .LBB62_4: ; %atomicrmw.private
8480 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8481 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8482 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
8483 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8484 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8485 ; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
8486 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8487 ; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
8488 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8489 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
8490 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8491 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8492 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8493 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
; Test case: seq_cst `atomicrmw or` of an i64 at %out + 4 elements (the
; getelementptr indexes i64 by 4) in a function declared to return i64; the
; atomic's value is named %result.
; The expected code forms the offset pointer in v[4:5] by adding 32 to v[0:1]
; (add + add-with-carry), leaving v[0:1] free for the old value, compares the
; high dword (v5) against a base value (dword loaded through s[4:5] = 0xe4, or
; s5 of src_private_base for gfx900), and splits exec in two:
;   %atomicrmw.global  - flat_atomic_or_x2 with glc writing v[0:1], then
;                        buffer_wbinvl1_vol
;   %atomicrmw.private - a zero pointer is replaced by -1; the old dwords are
;                        loaded into v0/v1 and the or'ed values in v2/v3 are
;                        stored back via s[0:3]
8497 define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
8498 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset:
8500 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8501 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
8502 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
8503 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
8504 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
8505 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8506 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
8507 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
8508 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
8509 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8510 ; GCN1-NEXT: s_cbranch_execnz .LBB63_3
8511 ; GCN1-NEXT: ; %bb.1: ; %Flow
8512 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8513 ; GCN1-NEXT: s_cbranch_execnz .LBB63_4
8514 ; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi
8515 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8516 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8517 ; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global
8518 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8519 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8520 ; GCN1-NEXT: buffer_wbinvl1_vol
8521 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
8522 ; GCN1-NEXT: ; implicit-def: $vgpr3
8523 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8524 ; GCN1-NEXT: s_cbranch_execz .LBB63_2
8525 ; GCN1-NEXT: .LBB63_4: ; %atomicrmw.private
8526 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8527 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8528 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
8529 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8530 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
8531 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8532 ; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
8533 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8534 ; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
8535 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8536 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
8537 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
8538 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8539 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8541 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset:
8543 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8544 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
8545 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
8546 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
8547 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
8548 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8549 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
8550 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
8551 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
8552 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8553 ; GCN2-NEXT: s_cbranch_execnz .LBB63_3
8554 ; GCN2-NEXT: ; %bb.1: ; %Flow
8555 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8556 ; GCN2-NEXT: s_cbranch_execnz .LBB63_4
8557 ; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi
8558 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8559 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8560 ; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global
8561 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8562 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8563 ; GCN2-NEXT: buffer_wbinvl1_vol
8564 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
8565 ; GCN2-NEXT: ; implicit-def: $vgpr3
8566 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8567 ; GCN2-NEXT: s_cbranch_execz .LBB63_2
8568 ; GCN2-NEXT: .LBB63_4: ; %atomicrmw.private
8569 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8570 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8571 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
8572 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8573 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
8574 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8575 ; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
8576 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8577 ; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
8578 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8579 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
8580 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
8581 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8582 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8584 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset:
8586 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8587 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
8588 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
8589 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
8590 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
8591 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8592 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
8593 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
8594 ; GCN3-NEXT: s_cbranch_execnz .LBB63_3
8595 ; GCN3-NEXT: ; %bb.1: ; %Flow
8596 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8597 ; GCN3-NEXT: s_cbranch_execnz .LBB63_4
8598 ; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi
8599 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8600 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8601 ; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global
8602 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
8603 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8604 ; GCN3-NEXT: buffer_wbinvl1_vol
8605 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
8606 ; GCN3-NEXT: ; implicit-def: $vgpr3
8607 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
8608 ; GCN3-NEXT: s_cbranch_execz .LBB63_2
8609 ; GCN3-NEXT: .LBB63_4: ; %atomicrmw.private
8610 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
8611 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
8612 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
8613 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
8614 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8615 ; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
8616 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8617 ; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
8618 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
8619 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
8620 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
8621 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8622 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8623 %gep = getelementptr i64, ptr %out, i64 4
8624 %result = atomicrmw or ptr %gep, i64 %in seq_cst
8628 define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
8629 ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar:
8631 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8632 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
8633 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
8634 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8635 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
8636 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
8637 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8638 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
8639 ; GCN1-NEXT: s_cbranch_vccnz .LBB64_3
8640 ; GCN1-NEXT: ; %bb.1: ; %Flow
8641 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8642 ; GCN1-NEXT: s_cbranch_vccz .LBB64_4
8643 ; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi
8644 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8645 ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
8646 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
8647 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
8648 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
8649 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
8650 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8651 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8652 ; GCN1-NEXT: buffer_wbinvl1_vol
8653 ; GCN1-NEXT: s_cbranch_execnz .LBB64_2
8654 ; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private
8655 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
8656 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
8657 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
8658 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
8659 ; GCN1-NEXT: s_add_i32 s34, s34, 4
8660 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
8661 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8662 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
8663 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8664 ; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
8665 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8666 ; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
8667 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8668 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8669 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8670 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8672 ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar:
8674 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8675 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
8676 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
8677 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8678 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
8679 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
8680 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8681 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
8682 ; GCN2-NEXT: s_cbranch_vccnz .LBB64_3
8683 ; GCN2-NEXT: ; %bb.1: ; %Flow
8684 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8685 ; GCN2-NEXT: s_cbranch_vccz .LBB64_4
8686 ; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi
8687 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8688 ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
8689 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
8690 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
8691 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
8692 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
8693 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8694 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8695 ; GCN2-NEXT: buffer_wbinvl1_vol
8696 ; GCN2-NEXT: s_cbranch_execnz .LBB64_2
8697 ; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private
8698 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
8699 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
8700 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
8701 ; GCN2-NEXT: s_add_i32 s34, s34, 4
8702 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
8703 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8704 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
8705 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8706 ; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
8707 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8708 ; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
8709 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8710 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8711 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8712 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8714 ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar:
8716 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8717 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
8718 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
8719 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
8720 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8721 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
8722 ; GCN3-NEXT: s_cbranch_vccnz .LBB64_3
8723 ; GCN3-NEXT: ; %bb.1: ; %Flow
8724 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8725 ; GCN3-NEXT: s_cbranch_vccz .LBB64_4
8726 ; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi
8727 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8728 ; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global
8729 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
8730 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
8731 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
8732 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
8733 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8734 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8735 ; GCN3-NEXT: buffer_wbinvl1_vol
8736 ; GCN3-NEXT: s_cbranch_execnz .LBB64_2
8737 ; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private
8738 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
8739 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
8740 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
8741 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
8742 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8743 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8744 ; GCN3-NEXT: v_or_b32_e32 v1, s7, v1
8745 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8746 ; GCN3-NEXT: v_or_b32_e32 v2, s6, v2
8747 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8748 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
8749 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8750 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8751 %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
; flat_atomic_or_i64_noret_offset_scalar: seq_cst 64-bit `atomicrmw or` on a
; flat pointer advanced by 4 i64 elements (the +32 in the expected s_add_u32 /
; s_addc_u32 pair). The function is `amdgpu_gfx void` and takes both the
; pointer and the operand as `inreg` arguments.
; The expected code for every run line branches between an %atomicrmw.global
; path (a single flat_atomic_or_x2) and an %atomicrmw.private path that does
; buffer_load_dword / v_or_b32 / buffer_store_dword on each 32-bit half.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
8755 define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
8756 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar:
8758 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8759 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
8760 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
8761 ; GCN1-NEXT: s_add_u32 s34, s4, 32
8762 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
8763 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8764 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
8765 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
8766 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8767 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
8768 ; GCN1-NEXT: s_cbranch_vccnz .LBB65_3
8769 ; GCN1-NEXT: ; %bb.1: ; %Flow
8770 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8771 ; GCN1-NEXT: s_cbranch_vccz .LBB65_4
8772 ; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi
8773 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8774 ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
8775 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
8776 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
8777 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
8778 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
8779 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8780 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8781 ; GCN1-NEXT: buffer_wbinvl1_vol
8782 ; GCN1-NEXT: s_cbranch_execnz .LBB65_2
8783 ; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private
8784 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
8785 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
8786 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
8787 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
8788 ; GCN1-NEXT: s_add_i32 s34, s34, 4
8789 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
8790 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8791 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
8792 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8793 ; GCN1-NEXT: v_or_b32_e32 v2, s6, v2
8794 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8795 ; GCN1-NEXT: v_or_b32_e32 v3, s7, v3
8796 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8797 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8798 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8799 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8801 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar:
8803 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8804 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
8805 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
8806 ; GCN2-NEXT: s_add_u32 s34, s4, 32
8807 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
8808 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8809 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
8810 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
8811 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8812 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
8813 ; GCN2-NEXT: s_cbranch_vccnz .LBB65_3
8814 ; GCN2-NEXT: ; %bb.1: ; %Flow
8815 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8816 ; GCN2-NEXT: s_cbranch_vccz .LBB65_4
8817 ; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi
8818 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8819 ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
8820 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
8821 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
8822 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
8823 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
8824 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8825 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8826 ; GCN2-NEXT: buffer_wbinvl1_vol
8827 ; GCN2-NEXT: s_cbranch_execnz .LBB65_2
8828 ; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private
8829 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
8830 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
8831 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
8832 ; GCN2-NEXT: s_add_i32 s34, s34, 4
8833 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
8834 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8835 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
8836 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8837 ; GCN2-NEXT: v_or_b32_e32 v2, s6, v2
8838 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8839 ; GCN2-NEXT: v_or_b32_e32 v3, s7, v3
8840 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8841 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
8842 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8843 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8845 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar:
8847 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8848 ; GCN3-NEXT: s_add_u32 s34, s4, 32
8849 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
8850 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
8851 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
8852 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
8853 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8854 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
8855 ; GCN3-NEXT: s_cbranch_vccnz .LBB65_3
8856 ; GCN3-NEXT: ; %bb.1: ; %Flow
8857 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
8858 ; GCN3-NEXT: s_cbranch_vccz .LBB65_4
8859 ; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi
8860 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8861 ; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global
8862 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
8863 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
8864 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
8865 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
8866 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
8867 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8868 ; GCN3-NEXT: buffer_wbinvl1_vol
8869 ; GCN3-NEXT: s_cbranch_execnz .LBB65_2
8870 ; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private
8871 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
8872 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
8873 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
8874 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
8875 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
8876 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8877 ; GCN3-NEXT: v_or_b32_e32 v1, s7, v1
8878 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8879 ; GCN3-NEXT: v_or_b32_e32 v2, s6, v2
8880 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
8881 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
8882 ; GCN3-NEXT: s_waitcnt vmcnt(0)
8883 ; GCN3-NEXT: s_setpc_b64 s[30:31]
8884 %gep = getelementptr i64, ptr %out, i64 4
8885 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
; flat_atomic_or_i64_ret_scalar: seq_cst 64-bit `atomicrmw or` directly on the
; incoming flat pointer (no offset). The function is `amdgpu_gfx i64` and
; takes the pointer and the operand as `inreg` arguments.
; In the expected code the %atomicrmw.global path uses the `glc` form of
; flat_atomic_or_x2 writing v[0:1], and the %atomicrmw.private path loads the
; two halves into v0/v1 before storing the or-ed values, so both paths leave
; the loaded value in v[0:1] when they reach %atomicrmw.end.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
8889 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
8890 ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar:
8892 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8893 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
8894 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
8895 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8896 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
8897 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
8898 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8899 ; GCN1-NEXT: s_cbranch_vccz .LBB66_2
8900 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
8901 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
8902 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
8903 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
8904 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
8905 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
8906 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8907 ; GCN1-NEXT: buffer_wbinvl1_vol
8908 ; GCN1-NEXT: s_cbranch_execz .LBB66_3
8909 ; GCN1-NEXT: s_branch .LBB66_4
8910 ; GCN1-NEXT: .LBB66_2:
8911 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
8912 ; GCN1-NEXT: .LBB66_3: ; %atomicrmw.private
8913 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
8914 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
8915 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
8916 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
8917 ; GCN1-NEXT: s_add_i32 s34, s34, 4
8918 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
8919 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
8920 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
8921 ; GCN1-NEXT: s_waitcnt vmcnt(1)
8922 ; GCN1-NEXT: v_or_b32_e32 v4, s6, v0
8923 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8924 ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
8925 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
8926 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
8927 ; GCN1-NEXT: .LBB66_4: ; %atomicrmw.end
8928 ; GCN1-NEXT: s_waitcnt vmcnt(0)
8929 ; GCN1-NEXT: s_setpc_b64 s[30:31]
8931 ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar:
8933 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8934 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
8935 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
8936 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8937 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
8938 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
8939 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8940 ; GCN2-NEXT: s_cbranch_vccz .LBB66_2
8941 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
8942 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
8943 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
8944 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
8945 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
8946 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
8947 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8948 ; GCN2-NEXT: buffer_wbinvl1_vol
8949 ; GCN2-NEXT: s_cbranch_execz .LBB66_3
8950 ; GCN2-NEXT: s_branch .LBB66_4
8951 ; GCN2-NEXT: .LBB66_2:
8952 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
8953 ; GCN2-NEXT: .LBB66_3: ; %atomicrmw.private
8954 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
8955 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
8956 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
8957 ; GCN2-NEXT: s_add_i32 s34, s34, 4
8958 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
8959 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
8960 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
8961 ; GCN2-NEXT: s_waitcnt vmcnt(1)
8962 ; GCN2-NEXT: v_or_b32_e32 v4, s6, v0
8963 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8964 ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
8965 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
8966 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
8967 ; GCN2-NEXT: .LBB66_4: ; %atomicrmw.end
8968 ; GCN2-NEXT: s_waitcnt vmcnt(0)
8969 ; GCN2-NEXT: s_setpc_b64 s[30:31]
8971 ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar:
8973 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8974 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
8975 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
8976 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
8977 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
8978 ; GCN3-NEXT: s_cbranch_vccz .LBB66_2
8979 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
8980 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
8981 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
8982 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
8983 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
8984 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
8985 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8986 ; GCN3-NEXT: buffer_wbinvl1_vol
8987 ; GCN3-NEXT: s_cbranch_execz .LBB66_3
8988 ; GCN3-NEXT: s_branch .LBB66_4
8989 ; GCN3-NEXT: .LBB66_2:
8990 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
8991 ; GCN3-NEXT: .LBB66_3: ; %atomicrmw.private
8992 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
8993 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
8994 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
8995 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
8996 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
8997 ; GCN3-NEXT: s_waitcnt vmcnt(1)
8998 ; GCN3-NEXT: v_or_b32_e32 v3, s7, v1
8999 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9000 ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
9001 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
9002 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
9003 ; GCN3-NEXT: .LBB66_4: ; %atomicrmw.end
9004 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9005 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9006 %result = atomicrmw or ptr %ptr, i64 %in seq_cst
; flat_atomic_or_i64_ret_offset_scalar: seq_cst 64-bit `atomicrmw or` on a
; flat pointer advanced by 4 i64 elements (the +32 in the expected s_add_u32 /
; s_addc_u32 pair). The function is `amdgpu_gfx i64` and takes the pointer
; and the operand as `inreg` arguments.
; In the expected code the %atomicrmw.global path uses the `glc` form of
; flat_atomic_or_x2 writing v[0:1], and the %atomicrmw.private path loads the
; two halves into v0/v1 before storing the or-ed values, so both paths leave
; the loaded value in v[0:1] when they reach %atomicrmw.end.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
9010 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
9011 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar:
9013 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9014 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
9015 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
9016 ; GCN1-NEXT: s_add_u32 s34, s4, 32
9017 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
9018 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9019 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
9020 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
9021 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
9022 ; GCN1-NEXT: s_cbranch_vccz .LBB67_2
9023 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
9024 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
9025 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
9026 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
9027 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
9028 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
9029 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9030 ; GCN1-NEXT: buffer_wbinvl1_vol
9031 ; GCN1-NEXT: s_cbranch_execz .LBB67_3
9032 ; GCN1-NEXT: s_branch .LBB67_4
9033 ; GCN1-NEXT: .LBB67_2:
9034 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9035 ; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private
9036 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
9037 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
9038 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
9039 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
9040 ; GCN1-NEXT: s_add_i32 s34, s34, 4
9041 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
9042 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
9043 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
9044 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9045 ; GCN1-NEXT: v_or_b32_e32 v4, s6, v0
9046 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9047 ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
9048 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
9049 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
9050 ; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end
9051 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9052 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9054 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar:
9056 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9057 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
9058 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
9059 ; GCN2-NEXT: s_add_u32 s34, s4, 32
9060 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
9061 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9062 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
9063 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
9064 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
9065 ; GCN2-NEXT: s_cbranch_vccz .LBB67_2
9066 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
9067 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
9068 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
9069 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
9070 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
9071 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
9072 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9073 ; GCN2-NEXT: buffer_wbinvl1_vol
9074 ; GCN2-NEXT: s_cbranch_execz .LBB67_3
9075 ; GCN2-NEXT: s_branch .LBB67_4
9076 ; GCN2-NEXT: .LBB67_2:
9077 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9078 ; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private
9079 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
9080 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
9081 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
9082 ; GCN2-NEXT: s_add_i32 s34, s34, 4
9083 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
9084 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
9085 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
9086 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9087 ; GCN2-NEXT: v_or_b32_e32 v4, s6, v0
9088 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9089 ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
9090 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
9091 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
9092 ; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end
9093 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9094 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9096 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar:
9098 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9099 ; GCN3-NEXT: s_add_u32 s34, s4, 32
9100 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
9101 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
9102 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
9103 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
9104 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
9105 ; GCN3-NEXT: s_cbranch_vccz .LBB67_2
9106 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
9107 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
9108 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
9109 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
9110 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
9111 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
9112 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9113 ; GCN3-NEXT: buffer_wbinvl1_vol
9114 ; GCN3-NEXT: s_cbranch_execz .LBB67_3
9115 ; GCN3-NEXT: s_branch .LBB67_4
9116 ; GCN3-NEXT: .LBB67_2:
9117 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9118 ; GCN3-NEXT: .LBB67_3: ; %atomicrmw.private
9119 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
9120 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
9121 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
9122 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
9123 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
9124 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9125 ; GCN3-NEXT: v_or_b32_e32 v3, s7, v1
9126 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9127 ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
9128 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
9129 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
9130 ; GCN3-NEXT: .LBB67_4: ; %atomicrmw.end
9131 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9132 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9133 %gep = getelementptr i64, ptr %out, i64 4
9134 %result = atomicrmw or ptr %gep, i64 %in seq_cst
; flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: seq_cst 64-bit
; `atomicrmw or` tagged with `!amdgpu.no.remote.memory !0`, on a flat pointer
; advanced by 4 i64 elements (the +32 in the expected 64-bit add). The
; function is a plain `define void` whose arguments are not `inreg`.
; In the expected code the pointer lives in v[0:1], and the choice between
; the %atomicrmw.global path (flat_atomic_or_x2) and the %atomicrmw.private
; path (buffer_load_dword / v_or_b32 / buffer_store_dword per half) is made
; by masking exec (s_and_saveexec_b64 / s_andn2_saveexec_b64).
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
9138 define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
9139 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
9141 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9142 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9143 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9144 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
9145 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9146 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9147 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9148 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9149 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9150 ; GCN1-NEXT: s_cbranch_execnz .LBB68_3
9151 ; GCN1-NEXT: ; %bb.1: ; %Flow
9152 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9153 ; GCN1-NEXT: s_cbranch_execnz .LBB68_4
9154 ; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi
9155 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9156 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9157 ; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global
9158 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
9159 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9160 ; GCN1-NEXT: buffer_wbinvl1_vol
9161 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9162 ; GCN1-NEXT: ; implicit-def: $vgpr3
9163 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9164 ; GCN1-NEXT: s_cbranch_execz .LBB68_2
9165 ; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private
9166 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9167 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9168 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
9169 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9170 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9171 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9172 ; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
9173 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9174 ; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
9175 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9176 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9177 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9178 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9179 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9181 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
9183 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9184 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9185 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9186 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
9187 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9188 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9189 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9190 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9191 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9192 ; GCN2-NEXT: s_cbranch_execnz .LBB68_3
9193 ; GCN2-NEXT: ; %bb.1: ; %Flow
9194 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9195 ; GCN2-NEXT: s_cbranch_execnz .LBB68_4
9196 ; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi
9197 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9198 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9199 ; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global
9200 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
9201 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9202 ; GCN2-NEXT: buffer_wbinvl1_vol
9203 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9204 ; GCN2-NEXT: ; implicit-def: $vgpr3
9205 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9206 ; GCN2-NEXT: s_cbranch_execz .LBB68_2
9207 ; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private
9208 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9209 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9210 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
9211 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9212 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9213 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9214 ; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
9215 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9216 ; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
9217 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9218 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9219 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9220 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9221 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9223 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
9225 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9226 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
9227 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9228 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9229 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
9230 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9231 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9232 ; GCN3-NEXT: s_cbranch_execnz .LBB68_3
9233 ; GCN3-NEXT: ; %bb.1: ; %Flow
9234 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9235 ; GCN3-NEXT: s_cbranch_execnz .LBB68_4
9236 ; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi
9237 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9238 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9239 ; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global
9240 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
9241 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9242 ; GCN3-NEXT: buffer_wbinvl1_vol
9243 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9244 ; GCN3-NEXT: ; implicit-def: $vgpr3
9245 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9246 ; GCN3-NEXT: s_cbranch_execz .LBB68_2
9247 ; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private
9248 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9249 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9250 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
9251 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9252 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9253 ; GCN3-NEXT: v_or_b32_e32 v1, v1, v3
9254 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9255 ; GCN3-NEXT: v_or_b32_e32 v2, v4, v2
9256 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9257 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
9258 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9259 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9260 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9261 %gep = getelementptr i64, ptr %out, i64 4
9262 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: seq_cst 64-bit
; `atomicrmw or` tagged with `!amdgpu.no.remote.memory !0`, on a flat pointer
; advanced by 4 i64 elements (the +32 in the expected 64-bit add). The
; function is a plain `define i64` whose arguments are not `inreg`.
; In the expected code the offset pointer is built in v[4:5]. The
; %atomicrmw.global path uses the `glc` form of flat_atomic_or_x2 writing
; v[0:1], and the %atomicrmw.private path loads the two halves into v0/v1
; before storing the or-ed values. The choice between the paths is made by
; masking exec (s_and_saveexec_b64 / s_andn2_saveexec_b64).
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
9266 define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
9267 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
9269 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9270 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9271 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9272 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
9273 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
9274 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9275 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9276 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9277 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9278 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9279 ; GCN1-NEXT: s_cbranch_execnz .LBB69_3
9280 ; GCN1-NEXT: ; %bb.1: ; %Flow
9281 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9282 ; GCN1-NEXT: s_cbranch_execnz .LBB69_4
9283 ; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi
9284 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9285 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9286 ; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global
9287 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
9288 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9289 ; GCN1-NEXT: buffer_wbinvl1_vol
9290 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
9291 ; GCN1-NEXT: ; implicit-def: $vgpr3
9292 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9293 ; GCN1-NEXT: s_cbranch_execz .LBB69_2
9294 ; GCN1-NEXT: .LBB69_4: ; %atomicrmw.private
9295 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9296 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9297 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
9298 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9299 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9300 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9301 ; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
9302 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9303 ; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
9304 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9305 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9306 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9307 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9308 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9310 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
9312 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9313 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9314 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9315 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
9316 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
9317 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9318 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9319 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9320 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9321 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9322 ; GCN2-NEXT: s_cbranch_execnz .LBB69_3
9323 ; GCN2-NEXT: ; %bb.1: ; %Flow
9324 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9325 ; GCN2-NEXT: s_cbranch_execnz .LBB69_4
9326 ; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi
9327 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9328 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9329 ; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global
9330 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
9331 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9332 ; GCN2-NEXT: buffer_wbinvl1_vol
9333 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
9334 ; GCN2-NEXT: ; implicit-def: $vgpr3
9335 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9336 ; GCN2-NEXT: s_cbranch_execz .LBB69_2
9337 ; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private
9338 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9339 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9340 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
9341 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9342 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9343 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9344 ; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
9345 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9346 ; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
9347 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9348 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9349 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9350 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9351 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9353 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
9355 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9356 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
9357 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
9358 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9359 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
9360 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9361 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9362 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9363 ; GCN3-NEXT: s_cbranch_execnz .LBB69_3
9364 ; GCN3-NEXT: ; %bb.1: ; %Flow
9365 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9366 ; GCN3-NEXT: s_cbranch_execnz .LBB69_4
9367 ; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi
9368 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9369 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9370 ; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global
9371 ; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
9372 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9373 ; GCN3-NEXT: buffer_wbinvl1_vol
9374 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
9375 ; GCN3-NEXT: ; implicit-def: $vgpr3
9376 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9377 ; GCN3-NEXT: s_cbranch_execz .LBB69_2
9378 ; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private
9379 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9380 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9381 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
9382 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9383 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9384 ; GCN3-NEXT: v_or_b32_e32 v3, v1, v3
9385 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9386 ; GCN3-NEXT: v_or_b32_e32 v2, v0, v2
9387 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9388 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
9389 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9390 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9391 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9392 %gep = getelementptr i64, ptr %out, i64 4
9393 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
9397 ; ---------------------------------------------------------------------
9399 ; ---------------------------------------------------------------------
; seq_cst atomicrmw xor of an i64 through a plain `ptr` argument in a void
; function (%tmp0 has no visible use).
; Expected lowering, identical in shape for all three check prefixes:
;   * The pointer's high dword (v1) is compared with an aperture value. On the
;     GCN1 and GCN2 runs that value comes from s_load_dword at address 0xe4;
;     on the GCN3 run it is the high half of src_private_base.
;   * Lanes whose high dword differs run %atomicrmw.global: one
;     flat_atomic_xor_x2, then s_waitcnt and buffer_wbinvl1_vol.
;   * The remaining lanes run %atomicrmw.private: a non-atomic sequence of two
;     buffer_load_dword, two v_xor_b32 and two buffer_store_dword through
;     s[0:3]. v_cndmask substitutes -1 for the offset when the pointer is null.
;   * Lane selection uses exec masking (s_and_saveexec / s_andn2_saveexec).
9401 define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
9402 ; GCN1-LABEL: flat_atomic_xor_i64_noret:
9404 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9405 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9406 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9407 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9408 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9409 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9410 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9411 ; GCN1-NEXT: s_cbranch_execnz .LBB70_3
9412 ; GCN1-NEXT: ; %bb.1: ; %Flow
9413 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9414 ; GCN1-NEXT: s_cbranch_execnz .LBB70_4
9415 ; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi
9416 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9417 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9418 ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global
9419 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9420 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9421 ; GCN1-NEXT: buffer_wbinvl1_vol
9422 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9423 ; GCN1-NEXT: ; implicit-def: $vgpr3
9424 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9425 ; GCN1-NEXT: s_cbranch_execz .LBB70_2
9426 ; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private
9427 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9428 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9429 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
9430 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9431 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9432 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9433 ; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
9434 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9435 ; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
9436 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9437 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9438 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9439 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9440 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9442 ; GCN2-LABEL: flat_atomic_xor_i64_noret:
9444 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9445 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9446 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9447 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9448 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9449 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9450 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9451 ; GCN2-NEXT: s_cbranch_execnz .LBB70_3
9452 ; GCN2-NEXT: ; %bb.1: ; %Flow
9453 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9454 ; GCN2-NEXT: s_cbranch_execnz .LBB70_4
9455 ; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi
9456 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9457 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9458 ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global
9459 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9460 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9461 ; GCN2-NEXT: buffer_wbinvl1_vol
9462 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9463 ; GCN2-NEXT: ; implicit-def: $vgpr3
9464 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9465 ; GCN2-NEXT: s_cbranch_execz .LBB70_2
9466 ; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private
9467 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9468 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9469 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
9470 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9471 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9472 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9473 ; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
9474 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9475 ; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
9476 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9477 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9478 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9479 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9480 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9482 ; GCN3-LABEL: flat_atomic_xor_i64_noret:
9484 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9485 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9486 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
9487 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9488 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9489 ; GCN3-NEXT: s_cbranch_execnz .LBB70_3
9490 ; GCN3-NEXT: ; %bb.1: ; %Flow
9491 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9492 ; GCN3-NEXT: s_cbranch_execnz .LBB70_4
9493 ; GCN3-NEXT: .LBB70_2: ; %atomicrmw.phi
9494 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9495 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9496 ; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global
9497 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9498 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9499 ; GCN3-NEXT: buffer_wbinvl1_vol
9500 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9501 ; GCN3-NEXT: ; implicit-def: $vgpr3
9502 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9503 ; GCN3-NEXT: s_cbranch_execz .LBB70_2
9504 ; GCN3-NEXT: .LBB70_4: ; %atomicrmw.private
9505 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9506 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9507 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
9508 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9509 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9510 ; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3
9511 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9512 ; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2
9513 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9514 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
9515 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9516 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9517 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9518 %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw xor of an i64 at %out + 4 elements (getelementptr i64 ...,
; i64 4) in a void function; %tmp0 has no visible use.
; Expected lowering for all three check prefixes:
;   * The checks expect 32 to be added to the 64-bit pointer in v[0:1] with an
;     add / add-with-carry pair before anything else uses the address.
;   * The high dword of the adjusted pointer is compared with an aperture
;     value: s_load_dword from 0xe4 on the GCN1 and GCN2 runs, the high half of
;     src_private_base on the GCN3 run.
;   * Lanes whose high dword differs run %atomicrmw.global (flat_atomic_xor_x2
;     on the adjusted pointer, no offset field on the instruction).
;   * The other lanes run %atomicrmw.private: dword loads, v_xor_b32 and dword
;     stores through s[0:3], with -1 selected as the offset for a null pointer.
9522 define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
9523 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset:
9525 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9526 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9527 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9528 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
9529 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9530 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9531 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9532 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9533 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9534 ; GCN1-NEXT: s_cbranch_execnz .LBB71_3
9535 ; GCN1-NEXT: ; %bb.1: ; %Flow
9536 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9537 ; GCN1-NEXT: s_cbranch_execnz .LBB71_4
9538 ; GCN1-NEXT: .LBB71_2: ; %atomicrmw.phi
9539 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9540 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9541 ; GCN1-NEXT: .LBB71_3: ; %atomicrmw.global
9542 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9543 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9544 ; GCN1-NEXT: buffer_wbinvl1_vol
9545 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9546 ; GCN1-NEXT: ; implicit-def: $vgpr3
9547 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9548 ; GCN1-NEXT: s_cbranch_execz .LBB71_2
9549 ; GCN1-NEXT: .LBB71_4: ; %atomicrmw.private
9550 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9551 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9552 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
9553 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9554 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9555 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9556 ; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
9557 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9558 ; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
9559 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9560 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9561 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9562 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9563 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9565 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset:
9567 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9568 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9569 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9570 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
9571 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
9572 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9573 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
9574 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9575 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9576 ; GCN2-NEXT: s_cbranch_execnz .LBB71_3
9577 ; GCN2-NEXT: ; %bb.1: ; %Flow
9578 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9579 ; GCN2-NEXT: s_cbranch_execnz .LBB71_4
9580 ; GCN2-NEXT: .LBB71_2: ; %atomicrmw.phi
9581 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9582 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9583 ; GCN2-NEXT: .LBB71_3: ; %atomicrmw.global
9584 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9585 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9586 ; GCN2-NEXT: buffer_wbinvl1_vol
9587 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9588 ; GCN2-NEXT: ; implicit-def: $vgpr3
9589 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9590 ; GCN2-NEXT: s_cbranch_execz .LBB71_2
9591 ; GCN2-NEXT: .LBB71_4: ; %atomicrmw.private
9592 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9593 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9594 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
9595 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9596 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
9597 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9598 ; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
9599 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9600 ; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
9601 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9602 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9603 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9604 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9605 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9607 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset:
9609 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9610 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
9611 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9612 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9613 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
9614 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9615 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9616 ; GCN3-NEXT: s_cbranch_execnz .LBB71_3
9617 ; GCN3-NEXT: ; %bb.1: ; %Flow
9618 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9619 ; GCN3-NEXT: s_cbranch_execnz .LBB71_4
9620 ; GCN3-NEXT: .LBB71_2: ; %atomicrmw.phi
9621 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9622 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9623 ; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global
9624 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9625 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9626 ; GCN3-NEXT: buffer_wbinvl1_vol
9627 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9628 ; GCN3-NEXT: ; implicit-def: $vgpr3
9629 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9630 ; GCN3-NEXT: s_cbranch_execz .LBB71_2
9631 ; GCN3-NEXT: .LBB71_4: ; %atomicrmw.private
9632 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
9633 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
9634 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
9635 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
9636 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9637 ; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3
9638 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9639 ; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2
9640 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9641 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
9642 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9643 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9644 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9645 %gep = getelementptr i64, ptr %out, i64 4
9646 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
; seq_cst atomicrmw xor of an i64 through a plain `ptr` argument in a function
; declared to return i64; the old value is named %result.
; Expected lowering for all three check prefixes:
;   * The pointer is copied from v[0:1] to v[4:5], and v[0:1] is marked
;     implicit-def before the split; both paths then write the loaded value
;     into v[0:1].
;   * The pointer's high dword (v5) is compared with an aperture value:
;     s_load_dword from 0xe4 on the GCN1 and GCN2 runs, the high half of
;     src_private_base on the GCN3 run.
;   * Lanes whose high dword differs run %atomicrmw.global: flat_atomic_xor_x2
;     with the glc bit and destination v[0:1].
;   * The other lanes run %atomicrmw.private: the two dwords are loaded into
;     v0 and v1, xor-ed into v2 and v3, and v2/v3 are stored back through
;     s[0:3]. A null pointer selects -1 as the buffer offset.
9650 define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
9651 ; GCN1-LABEL: flat_atomic_xor_i64_ret:
9653 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9654 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9655 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9656 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
9657 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
9658 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9659 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9660 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9661 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9662 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9663 ; GCN1-NEXT: s_cbranch_execnz .LBB72_3
9664 ; GCN1-NEXT: ; %bb.1: ; %Flow
9665 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9666 ; GCN1-NEXT: s_cbranch_execnz .LBB72_4
9667 ; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi
9668 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9669 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9670 ; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global
9671 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9672 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9673 ; GCN1-NEXT: buffer_wbinvl1_vol
9674 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
9675 ; GCN1-NEXT: ; implicit-def: $vgpr3
9676 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9677 ; GCN1-NEXT: s_cbranch_execz .LBB72_2
9678 ; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private
9679 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9680 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9681 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
9682 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9683 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9684 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9685 ; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2
9686 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9687 ; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3
9688 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9689 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9690 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9691 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9692 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9694 ; GCN2-LABEL: flat_atomic_xor_i64_ret:
9696 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9697 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9698 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9699 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
9700 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
9701 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9702 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9703 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9704 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9705 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9706 ; GCN2-NEXT: s_cbranch_execnz .LBB72_3
9707 ; GCN2-NEXT: ; %bb.1: ; %Flow
9708 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9709 ; GCN2-NEXT: s_cbranch_execnz .LBB72_4
9710 ; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi
9711 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9712 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9713 ; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global
9714 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9715 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9716 ; GCN2-NEXT: buffer_wbinvl1_vol
9717 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
9718 ; GCN2-NEXT: ; implicit-def: $vgpr3
9719 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9720 ; GCN2-NEXT: s_cbranch_execz .LBB72_2
9721 ; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private
9722 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9723 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9724 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
9725 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9726 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9727 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9728 ; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2
9729 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9730 ; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3
9731 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9732 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9733 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9734 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9735 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9737 ; GCN3-LABEL: flat_atomic_xor_i64_ret:
9739 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9740 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
9741 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9742 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
9743 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
9744 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9745 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9746 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9747 ; GCN3-NEXT: s_cbranch_execnz .LBB72_3
9748 ; GCN3-NEXT: ; %bb.1: ; %Flow
9749 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9750 ; GCN3-NEXT: s_cbranch_execnz .LBB72_4
9751 ; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi
9752 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9753 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9754 ; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global
9755 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9756 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9757 ; GCN3-NEXT: buffer_wbinvl1_vol
9758 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
9759 ; GCN3-NEXT: ; implicit-def: $vgpr3
9760 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9761 ; GCN3-NEXT: s_cbranch_execz .LBB72_2
9762 ; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private
9763 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9764 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9765 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
9766 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9767 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9768 ; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3
9769 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9770 ; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2
9771 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9772 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
9773 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9774 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9775 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9776 %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw xor of an i64 at %out + 4 elements (getelementptr i64 ...,
; i64 4) in a function declared to return i64; the old value is named %result.
; Expected lowering for all three check prefixes:
;   * The checks expect the adjusted pointer to be computed as v[0:1] + 32 into
;     v[4:5] with an add / add-with-carry pair, and v[0:1] to be marked
;     implicit-def before the split.
;   * The high dword of the adjusted pointer (v5) is compared with an aperture
;     value: s_load_dword from 0xe4 on the GCN1 and GCN2 runs, the high half of
;     src_private_base on the GCN3 run.
;   * Lanes whose high dword differs run %atomicrmw.global: flat_atomic_xor_x2
;     with the glc bit, address v[4:5], destination v[0:1].
;   * The other lanes run %atomicrmw.private: the two dwords are loaded into
;     v0 and v1, xor-ed into v2 and v3, and v2/v3 are stored back through
;     s[0:3]. A null pointer selects -1 as the buffer offset.
9780 define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
9781 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset:
9783 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9784 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
9785 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
9786 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
9787 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
9788 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9789 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9790 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
9791 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
9792 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9793 ; GCN1-NEXT: s_cbranch_execnz .LBB73_3
9794 ; GCN1-NEXT: ; %bb.1: ; %Flow
9795 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9796 ; GCN1-NEXT: s_cbranch_execnz .LBB73_4
9797 ; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi
9798 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9799 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9800 ; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global
9801 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9802 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9803 ; GCN1-NEXT: buffer_wbinvl1_vol
9804 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
9805 ; GCN1-NEXT: ; implicit-def: $vgpr3
9806 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9807 ; GCN1-NEXT: s_cbranch_execz .LBB73_2
9808 ; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private
9809 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9810 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9811 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
9812 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9813 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9814 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9815 ; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2
9816 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9817 ; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3
9818 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9819 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9820 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
9821 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9822 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9824 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset:
9826 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9827 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
9828 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
9829 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
9830 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
9831 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9832 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
9833 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
9834 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
9835 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9836 ; GCN2-NEXT: s_cbranch_execnz .LBB73_3
9837 ; GCN2-NEXT: ; %bb.1: ; %Flow
9838 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9839 ; GCN2-NEXT: s_cbranch_execnz .LBB73_4
9840 ; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi
9841 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9842 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9843 ; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global
9844 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9845 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9846 ; GCN2-NEXT: buffer_wbinvl1_vol
9847 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
9848 ; GCN2-NEXT: ; implicit-def: $vgpr3
9849 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9850 ; GCN2-NEXT: s_cbranch_execz .LBB73_2
9851 ; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private
9852 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9853 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9854 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
9855 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9856 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
9857 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9858 ; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2
9859 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9860 ; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3
9861 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9862 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
9863 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
9864 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9865 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9867 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset:
9869 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9870 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
9871 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
9872 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
9873 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
9874 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
9875 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
9876 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
9877 ; GCN3-NEXT: s_cbranch_execnz .LBB73_3
9878 ; GCN3-NEXT: ; %bb.1: ; %Flow
9879 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9880 ; GCN3-NEXT: s_cbranch_execnz .LBB73_4
9881 ; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi
9882 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9883 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9884 ; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global
9885 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
9886 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9887 ; GCN3-NEXT: buffer_wbinvl1_vol
9888 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
9889 ; GCN3-NEXT: ; implicit-def: $vgpr3
9890 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
9891 ; GCN3-NEXT: s_cbranch_execz .LBB73_2
9892 ; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private
9893 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
9894 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
9895 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
9896 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
9897 ; GCN3-NEXT: s_waitcnt vmcnt(1)
9898 ; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3
9899 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9900 ; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2
9901 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
9902 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
9903 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
9904 ; GCN3-NEXT: s_waitcnt vmcnt(0)
9905 ; GCN3-NEXT: s_setpc_b64 s[30:31]
9906 %gep = getelementptr i64, ptr %out, i64 4
9907 %result = atomicrmw xor ptr %gep, i64 %in seq_cst
; seq_cst atomicrmw xor of an i64 with both arguments marked inreg under the
; amdgpu_gfx calling convention; void function, %tmp0 has no visible use.
; The checks show the pointer in s[4:5] and the operand in s[6:7].
; Expected lowering for all three check prefixes:
;   * The pointer's high dword (s5) is compared with an aperture value using
;     s_cmp_eq_u32: s_load_dword from 0xe4 on the GCN1 and GCN2 runs, the high
;     half of src_private_base on the GCN3 run.
;   * The split is a scalar branch on vcc (s_cbranch_vccnz / s_cbranch_vccz)
;     rather than exec masking.
;   * %atomicrmw.global copies the scalars into v[0:3] and issues
;     flat_atomic_xor_x2.
;   * %atomicrmw.private selects s4, or -1 when the pointer is null, as the
;     buffer offset, then does dword loads, v_xor_b32 with s6/s7, and dword
;     stores through s[0:3]. The null test is s_cmp_lg_u64 on the GCN2 and GCN3
;     runs and v_cmp_ne_u64_e64 plus s_and_b64 with exec on the GCN1 run.
9911 define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
9912 ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar:
9914 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9915 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
9916 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
9917 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
9918 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
9919 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
9920 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
9921 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
9922 ; GCN1-NEXT: s_cbranch_vccnz .LBB74_3
9923 ; GCN1-NEXT: ; %bb.1: ; %Flow
9924 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
9925 ; GCN1-NEXT: s_cbranch_vccz .LBB74_4
9926 ; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi
9927 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9928 ; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global
9929 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
9930 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
9931 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
9932 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
9933 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9934 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9935 ; GCN1-NEXT: buffer_wbinvl1_vol
9936 ; GCN1-NEXT: s_cbranch_execnz .LBB74_2
9937 ; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private
9938 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
9939 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
9940 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
9941 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
9942 ; GCN1-NEXT: s_add_i32 s34, s34, 4
9943 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
9944 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
9945 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
9946 ; GCN1-NEXT: s_waitcnt vmcnt(1)
9947 ; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2
9948 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9949 ; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3
9950 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9951 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9952 ; GCN1-NEXT: s_waitcnt vmcnt(0)
9953 ; GCN1-NEXT: s_setpc_b64 s[30:31]
9955 ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar:
9957 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9958 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
9959 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
9960 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
9961 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
9962 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
9963 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
9964 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
9965 ; GCN2-NEXT: s_cbranch_vccnz .LBB74_3
9966 ; GCN2-NEXT: ; %bb.1: ; %Flow
9967 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
9968 ; GCN2-NEXT: s_cbranch_vccz .LBB74_4
9969 ; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi
9970 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9971 ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global
9972 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
9973 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
9974 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
9975 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
9976 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
9977 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9978 ; GCN2-NEXT: buffer_wbinvl1_vol
9979 ; GCN2-NEXT: s_cbranch_execnz .LBB74_2
9980 ; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private
9981 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
9982 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
9983 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
9984 ; GCN2-NEXT: s_add_i32 s34, s34, 4
9985 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
9986 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
9987 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
9988 ; GCN2-NEXT: s_waitcnt vmcnt(1)
9989 ; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2
9990 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9991 ; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3
9992 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
9993 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
9994 ; GCN2-NEXT: s_waitcnt vmcnt(0)
9995 ; GCN2-NEXT: s_setpc_b64 s[30:31]
9997 ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar:
9999 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10000 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
10001 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
10002 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
10003 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
10004 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
10005 ; GCN3-NEXT: s_cbranch_vccnz .LBB74_3
10006 ; GCN3-NEXT: ; %bb.1: ; %Flow
10007 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
10008 ; GCN3-NEXT: s_cbranch_vccz .LBB74_4
10009 ; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi
10010 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10011 ; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global
10012 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
10013 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
10014 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
10015 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
10016 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10017 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10018 ; GCN3-NEXT: buffer_wbinvl1_vol
10019 ; GCN3-NEXT: s_cbranch_execnz .LBB74_2
10020 ; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private
10021 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
10022 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
10023 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
10024 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
10025 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
10026 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10027 ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1
10028 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10029 ; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2
10030 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10031 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
10032 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10033 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10034 %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
; seq_cst atomicrmw xor of an i64 at %out + 4 elements (getelementptr i64 ...,
; i64 4), with both arguments marked inreg under the amdgpu_gfx calling
; convention; void function, %tmp0 has no visible use.
; The checks show the incoming pointer in s[4:5] and the operand in s[6:7].
; Expected lowering for all three check prefixes:
;   * The checks expect the adjusted pointer to be formed in s[34:35] as
;     s[4:5] + 32 with s_add_u32 / s_addc_u32.
;   * Its high dword (s35) is compared with an aperture value using
;     s_cmp_eq_u32: s_load_dword from 0xe4 (into s36) on the GCN1 and GCN2
;     runs, the high half of src_private_base (s37) on the GCN3 run.
;   * The split is a scalar branch on vcc rather than exec masking.
;   * %atomicrmw.global copies s34/s35 and s6/s7 into v[0:3] and issues
;     flat_atomic_xor_x2.
;   * %atomicrmw.private selects s34, or -1 when the adjusted pointer is null,
;     as the buffer offset, then does dword loads, v_xor_b32 with s6/s7, and
;     dword stores through s[0:3].
10038 define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
10039 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
10041 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10042 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
10043 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
10044 ; GCN1-NEXT: s_add_u32 s34, s4, 32
10045 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
10046 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10047 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
10048 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
10049 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10050 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
10051 ; GCN1-NEXT: s_cbranch_vccnz .LBB75_3
10052 ; GCN1-NEXT: ; %bb.1: ; %Flow
10053 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10054 ; GCN1-NEXT: s_cbranch_vccz .LBB75_4
10055 ; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi
10056 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10057 ; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global
10058 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
10059 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
10060 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
10061 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
10062 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10063 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10064 ; GCN1-NEXT: buffer_wbinvl1_vol
10065 ; GCN1-NEXT: s_cbranch_execnz .LBB75_2
10066 ; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private
10067 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
10068 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
10069 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
10070 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
10071 ; GCN1-NEXT: s_add_i32 s34, s34, 4
10072 ; GCN1-NEXT: v_mov_b32_e32 v1, s34
10073 ; GCN1-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
10074 ; GCN1-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
10075 ; GCN1-NEXT: s_waitcnt vmcnt(1)
10076 ; GCN1-NEXT: v_xor_b32_e32 v2, s6, v2
10077 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10078 ; GCN1-NEXT: v_xor_b32_e32 v3, s7, v3
10079 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10080 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
10081 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10082 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10084 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
10086 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10087 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
10088 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
10089 ; GCN2-NEXT: s_add_u32 s34, s4, 32
10090 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
10091 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10092 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
10093 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
10094 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10095 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
10096 ; GCN2-NEXT: s_cbranch_vccnz .LBB75_3
10097 ; GCN2-NEXT: ; %bb.1: ; %Flow
10098 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10099 ; GCN2-NEXT: s_cbranch_vccz .LBB75_4
10100 ; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi
10101 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10102 ; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global
10103 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
10104 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
10105 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
10106 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
10107 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10108 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10109 ; GCN2-NEXT: buffer_wbinvl1_vol
10110 ; GCN2-NEXT: s_cbranch_execnz .LBB75_2
10111 ; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private
10112 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
10113 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
10114 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
10115 ; GCN2-NEXT: s_add_i32 s34, s34, 4
10116 ; GCN2-NEXT: v_mov_b32_e32 v1, s34
10117 ; GCN2-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
10118 ; GCN2-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
10119 ; GCN2-NEXT: s_waitcnt vmcnt(1)
10120 ; GCN2-NEXT: v_xor_b32_e32 v2, s6, v2
10121 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10122 ; GCN2-NEXT: v_xor_b32_e32 v3, s7, v3
10123 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10124 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
10125 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10126 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10128 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
10130 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10131 ; GCN3-NEXT: s_add_u32 s34, s4, 32
10132 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
10133 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
10134 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
10135 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
10136 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10137 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
10138 ; GCN3-NEXT: s_cbranch_vccnz .LBB75_3
10139 ; GCN3-NEXT: ; %bb.1: ; %Flow
10140 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10141 ; GCN3-NEXT: s_cbranch_vccz .LBB75_4
10142 ; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi
10143 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10144 ; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global
10145 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
10146 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
10147 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
10148 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
10149 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10150 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10151 ; GCN3-NEXT: buffer_wbinvl1_vol
10152 ; GCN3-NEXT: s_cbranch_execnz .LBB75_2
10153 ; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private
10154 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
10155 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
10156 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
10157 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
10158 ; GCN3-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
10159 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10160 ; GCN3-NEXT: v_xor_b32_e32 v1, s7, v1
10161 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10162 ; GCN3-NEXT: v_xor_b32_e32 v2, s6, v2
10163 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10164 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
10165 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10166 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10167 %gep = getelementptr i64, ptr %out, i64 4
10168 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw xor` of an i64 through a flat pointer, with both the
; pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention, and the atomicrmw result named %result.
;
; Expected code, all three run lines (bonaire, tonga, gfx900):
;  - The high dword of the pointer (s5) is compared against either a dword
;    loaded from address 0xe4 (bonaire/tonga) or the high half of
;    src_private_base (gfx900). NOTE(review): presumably 0xe4 holds the same
;    private-aperture value - confirm.
;  - Not equal: %atomicrmw.global issues `flat_atomic_xor_x2 ... glc`.
;  - Equal: %atomicrmw.private replaces a null pointer's low dword with -1
;    (s_cmp_lg_u64 / s_cselect_b32) and performs the xor with two
;    buffer_load_dword / v_xor_b32 / buffer_store_dword pairs; the loaded
;    (old) value is left in v0/v1.
10172 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
10173 ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar:
10175 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10176 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
10177 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
10178 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10179 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
10180 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
10181 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
10182 ; GCN1-NEXT: s_cbranch_vccz .LBB76_2
10183 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
10184 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
10185 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
10186 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
10187 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
10188 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10189 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10190 ; GCN1-NEXT: buffer_wbinvl1_vol
10191 ; GCN1-NEXT: s_cbranch_execz .LBB76_3
10192 ; GCN1-NEXT: s_branch .LBB76_4
10193 ; GCN1-NEXT: .LBB76_2:
10194 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10195 ; GCN1-NEXT: .LBB76_3: ; %atomicrmw.private
10196 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
10197 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
10198 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
10199 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
10200 ; GCN1-NEXT: s_add_i32 s34, s34, 4
10201 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
10202 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10203 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
10204 ; GCN1-NEXT: s_waitcnt vmcnt(1)
10205 ; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0
10206 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10207 ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1
10208 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10209 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
10210 ; GCN1-NEXT: .LBB76_4: ; %atomicrmw.end
10211 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10212 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10214 ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar:
10216 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10217 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
10218 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
10219 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10220 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
10221 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
10222 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
10223 ; GCN2-NEXT: s_cbranch_vccz .LBB76_2
10224 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
10225 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
10226 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
10227 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
10228 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
10229 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10230 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10231 ; GCN2-NEXT: buffer_wbinvl1_vol
10232 ; GCN2-NEXT: s_cbranch_execz .LBB76_3
10233 ; GCN2-NEXT: s_branch .LBB76_4
10234 ; GCN2-NEXT: .LBB76_2:
10235 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10236 ; GCN2-NEXT: .LBB76_3: ; %atomicrmw.private
10237 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
10238 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
10239 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
10240 ; GCN2-NEXT: s_add_i32 s34, s34, 4
10241 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
10242 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10243 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
10244 ; GCN2-NEXT: s_waitcnt vmcnt(1)
10245 ; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0
10246 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10247 ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1
10248 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10249 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
10250 ; GCN2-NEXT: .LBB76_4: ; %atomicrmw.end
10251 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10252 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10254 ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar:
10256 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10257 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
10258 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
10259 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
10260 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
10261 ; GCN3-NEXT: s_cbranch_vccz .LBB76_2
10262 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
10263 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
10264 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
10265 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
10266 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
10267 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10268 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10269 ; GCN3-NEXT: buffer_wbinvl1_vol
10270 ; GCN3-NEXT: s_cbranch_execz .LBB76_3
10271 ; GCN3-NEXT: s_branch .LBB76_4
10272 ; GCN3-NEXT: .LBB76_2:
10273 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
10274 ; GCN3-NEXT: .LBB76_3: ; %atomicrmw.private
10275 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
10276 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
10277 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
10278 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
10279 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10280 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10281 ; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1
10282 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10283 ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0
10284 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10285 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
10286 ; GCN3-NEXT: .LBB76_4: ; %atomicrmw.end
10287 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10288 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operation under test. atomicrmw yields the value that was in memory before
; the xor was applied.
10289 %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
; Test: same shape as the scalar (inreg, amdgpu_gfx) returning xor test, but
; the atomicrmw address is %out advanced by four i64 elements.
;
; Expected code, all three run lines (bonaire, tonga, gfx900):
;  - The offset is materialised up front as a 64-bit scalar add of 32
;    (s_add_u32 / s_addc_u32 into s[34:35]).
;  - The high dword of the sum (s35) is compared against a dword loaded from
;    address 0xe4 (bonaire/tonga) or the high half of src_private_base
;    (gfx900).
;  - Not equal: %atomicrmw.global issues `flat_atomic_xor_x2 ... glc` on the
;    summed address.
;  - Equal: %atomicrmw.private substitutes -1 for a null address and performs
;    the xor with buffer loads and stores, leaving the loaded value in v0/v1.
10293 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
10294 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
10296 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10297 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
10298 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
10299 ; GCN1-NEXT: s_add_u32 s34, s4, 32
10300 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
10301 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10302 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
10303 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
10304 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10305 ; GCN1-NEXT: s_cbranch_vccz .LBB77_2
10306 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
10307 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
10308 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
10309 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
10310 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
10311 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10312 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10313 ; GCN1-NEXT: buffer_wbinvl1_vol
10314 ; GCN1-NEXT: s_cbranch_execz .LBB77_3
10315 ; GCN1-NEXT: s_branch .LBB77_4
10316 ; GCN1-NEXT: .LBB77_2:
10317 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10318 ; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private
10319 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
10320 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
10321 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
10322 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
10323 ; GCN1-NEXT: s_add_i32 s34, s34, 4
10324 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
10325 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10326 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
10327 ; GCN1-NEXT: s_waitcnt vmcnt(1)
10328 ; GCN1-NEXT: v_xor_b32_e32 v4, s6, v0
10329 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10330 ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1
10331 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10332 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
10333 ; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end
10334 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10335 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10337 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
10339 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10340 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
10341 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
10342 ; GCN2-NEXT: s_add_u32 s34, s4, 32
10343 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
10344 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10345 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
10346 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
10347 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10348 ; GCN2-NEXT: s_cbranch_vccz .LBB77_2
10349 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
10350 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
10351 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
10352 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
10353 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
10354 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10355 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10356 ; GCN2-NEXT: buffer_wbinvl1_vol
10357 ; GCN2-NEXT: s_cbranch_execz .LBB77_3
10358 ; GCN2-NEXT: s_branch .LBB77_4
10359 ; GCN2-NEXT: .LBB77_2:
10360 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10361 ; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private
10362 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
10363 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
10364 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
10365 ; GCN2-NEXT: s_add_i32 s34, s34, 4
10366 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
10367 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10368 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
10369 ; GCN2-NEXT: s_waitcnt vmcnt(1)
10370 ; GCN2-NEXT: v_xor_b32_e32 v4, s6, v0
10371 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10372 ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1
10373 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10374 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
10375 ; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end
10376 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10377 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10379 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
10381 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10382 ; GCN3-NEXT: s_add_u32 s34, s4, 32
10383 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
10384 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
10385 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
10386 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
10387 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
10388 ; GCN3-NEXT: s_cbranch_vccz .LBB77_2
10389 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
10390 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
10391 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
10392 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
10393 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
10394 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
10395 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10396 ; GCN3-NEXT: buffer_wbinvl1_vol
10397 ; GCN3-NEXT: s_cbranch_execz .LBB77_3
10398 ; GCN3-NEXT: s_branch .LBB77_4
10399 ; GCN3-NEXT: .LBB77_2:
10400 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
10401 ; GCN3-NEXT: .LBB77_3: ; %atomicrmw.private
10402 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
10403 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
10404 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
10405 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
10406 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
10407 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10408 ; GCN3-NEXT: v_xor_b32_e32 v3, s7, v1
10409 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10410 ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0
10411 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
10412 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
10413 ; GCN3-NEXT: .LBB77_4: ; %atomicrmw.end
10414 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10415 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is a 32-byte displacement, matching the `+ 32` scalar add in
; the expected output above.
10416 %gep = getelementptr i64, ptr %out, i64 4
; Operation under test. atomicrmw yields the value that was in memory before
; the xor was applied.
10417 %result = atomicrmw xor ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw xor` of an i64 at %out + 4 elements, in a function
; returning void, with !amdgpu.no.remote.memory attached to the atomicrmw.
; Arguments use the default calling convention; the expected code reads the
; pointer from v[0:1] and the operand from v[2:3].
;
; Expected code, all three run lines (bonaire, tonga, gfx900):
;  - The 32-byte offset is added with a vector add / add-with-carry pair.
;  - The high dword of the address (v1) is compared per lane against a dword
;    loaded from address 0xe4 (bonaire/tonga) or the high half of
;    src_private_base (gfx900), and exec is split on the result.
;  - Lanes that differ run %atomicrmw.global: `flat_atomic_xor_x2` without glc
;    (no value is read back).
;  - Remaining lanes run %atomicrmw.private: a null address is replaced by -1
;    (v_cmp_ne_u64 / v_cndmask_b32), then the xor is done with buffer loads and
;    stores.
10421 define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
10422 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
10424 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10425 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
10426 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
10427 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
10428 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10429 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10430 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10431 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
10432 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10433 ; GCN1-NEXT: s_cbranch_execnz .LBB78_3
10434 ; GCN1-NEXT: ; %bb.1: ; %Flow
10435 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10436 ; GCN1-NEXT: s_cbranch_execnz .LBB78_4
10437 ; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi
10438 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10439 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10440 ; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global
10441 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10442 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10443 ; GCN1-NEXT: buffer_wbinvl1_vol
10444 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10445 ; GCN1-NEXT: ; implicit-def: $vgpr3
10446 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10447 ; GCN1-NEXT: s_cbranch_execz .LBB78_2
10448 ; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private
10449 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10450 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
10451 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
10452 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
10453 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
10454 ; GCN1-NEXT: s_waitcnt vmcnt(1)
10455 ; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
10456 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10457 ; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
10458 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10459 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
10460 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10461 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10462 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10464 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
10466 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10467 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
10468 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
10469 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
10470 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10471 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10472 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10473 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
10474 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10475 ; GCN2-NEXT: s_cbranch_execnz .LBB78_3
10476 ; GCN2-NEXT: ; %bb.1: ; %Flow
10477 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10478 ; GCN2-NEXT: s_cbranch_execnz .LBB78_4
10479 ; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi
10480 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10481 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10482 ; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global
10483 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10484 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10485 ; GCN2-NEXT: buffer_wbinvl1_vol
10486 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10487 ; GCN2-NEXT: ; implicit-def: $vgpr3
10488 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10489 ; GCN2-NEXT: s_cbranch_execz .LBB78_2
10490 ; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private
10491 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10492 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
10493 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
10494 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
10495 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
10496 ; GCN2-NEXT: s_waitcnt vmcnt(1)
10497 ; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
10498 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10499 ; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
10500 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10501 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
10502 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10503 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10504 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10506 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
10508 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10509 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
10510 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10511 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
10512 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
10513 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
10514 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10515 ; GCN3-NEXT: s_cbranch_execnz .LBB78_3
10516 ; GCN3-NEXT: ; %bb.1: ; %Flow
10517 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10518 ; GCN3-NEXT: s_cbranch_execnz .LBB78_4
10519 ; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi
10520 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10521 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10522 ; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global
10523 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
10524 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10525 ; GCN3-NEXT: buffer_wbinvl1_vol
10526 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
10527 ; GCN3-NEXT: ; implicit-def: $vgpr3
10528 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10529 ; GCN3-NEXT: s_cbranch_execz .LBB78_2
10530 ; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private
10531 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10532 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
10533 ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
10534 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
10535 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10536 ; GCN3-NEXT: v_xor_b32_e32 v1, v1, v3
10537 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10538 ; GCN3-NEXT: v_xor_b32_e32 v2, v4, v2
10539 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
10540 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
10541 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10542 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10543 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is a 32-byte displacement, matching the `32` added to v[0:1]
; in the expected output above.
10544 %gep = getelementptr i64, ptr %out, i64 4
; Operation under test. NOTE(review): metadata node !0 is defined outside this
; function - confirm its contents where it is declared.
10545 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: value-returning counterpart of the no-remote-memory xor test. A
; seq_cst `atomicrmw xor` of an i64 at %out + 4 elements, with
; !amdgpu.no.remote.memory attached, in a function returning i64.
;
; Expected code, all three run lines (bonaire, tonga, gfx900):
;  - The offset address is built in v[4:5] (input pointer v[0:1] plus 32), so
;    v[0:1] is free to receive the result.
;  - The high dword of the address (v5) is compared per lane against a dword
;    loaded from address 0xe4 (bonaire/tonga) or the high half of
;    src_private_base (gfx900), and exec is split on the result.
;  - Lanes that differ run %atomicrmw.global: `flat_atomic_xor_x2 ... glc`
;    writing v[0:1].
;  - Remaining lanes run %atomicrmw.private: a null address is replaced by -1,
;    the old value is loaded into v0/v1 with buffer_load_dword, and the xor
;    result is written back with buffer_store_dword.
10549 define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
10550 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
10552 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10553 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
10554 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
10555 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
10556 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10557 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10558 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
10559 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10560 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
10561 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10562 ; GCN1-NEXT: s_cbranch_execnz .LBB79_3
10563 ; GCN1-NEXT: ; %bb.1: ; %Flow
10564 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10565 ; GCN1-NEXT: s_cbranch_execnz .LBB79_4
10566 ; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi
10567 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10568 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10569 ; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global
10570 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
10571 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10572 ; GCN1-NEXT: buffer_wbinvl1_vol
10573 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
10574 ; GCN1-NEXT: ; implicit-def: $vgpr3
10575 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10576 ; GCN1-NEXT: s_cbranch_execz .LBB79_2
10577 ; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private
10578 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
10579 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
10580 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
10581 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10582 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10583 ; GCN1-NEXT: s_waitcnt vmcnt(1)
10584 ; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2
10585 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10586 ; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3
10587 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
10588 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
10589 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10590 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10591 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10593 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
10595 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10596 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
10597 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
10598 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
10599 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10600 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10601 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
10602 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10603 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
10604 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10605 ; GCN2-NEXT: s_cbranch_execnz .LBB79_3
10606 ; GCN2-NEXT: ; %bb.1: ; %Flow
10607 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10608 ; GCN2-NEXT: s_cbranch_execnz .LBB79_4
10609 ; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi
10610 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10611 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10612 ; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global
10613 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
10614 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10615 ; GCN2-NEXT: buffer_wbinvl1_vol
10616 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
10617 ; GCN2-NEXT: ; implicit-def: $vgpr3
10618 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10619 ; GCN2-NEXT: s_cbranch_execz .LBB79_2
10620 ; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private
10621 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
10622 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
10623 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
10624 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10625 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10626 ; GCN2-NEXT: s_waitcnt vmcnt(1)
10627 ; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2
10628 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10629 ; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3
10630 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
10631 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
10632 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10633 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10634 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10636 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
10638 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10639 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
10640 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
10641 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
10642 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
10643 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
10644 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
10645 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10646 ; GCN3-NEXT: s_cbranch_execnz .LBB79_3
10647 ; GCN3-NEXT: ; %bb.1: ; %Flow
10648 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10649 ; GCN3-NEXT: s_cbranch_execnz .LBB79_4
10650 ; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi
10651 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10652 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10653 ; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global
10654 ; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
10655 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10656 ; GCN3-NEXT: buffer_wbinvl1_vol
10657 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
10658 ; GCN3-NEXT: ; implicit-def: $vgpr3
10659 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10660 ; GCN3-NEXT: s_cbranch_execz .LBB79_2
10661 ; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private
10662 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
10663 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
10664 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
10665 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10666 ; GCN3-NEXT: s_waitcnt vmcnt(1)
10667 ; GCN3-NEXT: v_xor_b32_e32 v3, v1, v3
10668 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10669 ; GCN3-NEXT: v_xor_b32_e32 v2, v0, v2
10670 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
10671 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
10672 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10673 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10674 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of i64 is a 32-byte displacement, matching the `32` added into
; v[4:5] in the expected output above.
10675 %gep = getelementptr i64, ptr %out, i64 4
; Operation under test; atomicrmw yields the pre-xor memory value.
; NOTE(review): metadata node !0 is defined outside this function - confirm
; its contents where it is declared.
10676 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
10680 ; ---------------------------------------------------------------------
10682 ; ---------------------------------------------------------------------
; Test: seq_cst `atomicrmw max` (signed) of an i64 through a flat pointer, in
; a function returning void. The pointer arrives in v[0:1] and the operand in
; v[2:3] in the expected code.
;
; Expected code, all three run lines (bonaire, tonga, gfx900):
;  - The high dword of the pointer (v1) is compared per lane against a dword
;    loaded from address 0xe4 (bonaire/tonga) or the high half of
;    src_private_base (gfx900), and exec is split on the result.
;  - Lanes that differ run %atomicrmw.global. Unlike the xor tests, no native
;    max atomic is expected here: the current value is loaded (two
;    flat_load_dword on bonaire/tonga, one flat_load_dwordx2 on gfx900), then
;    the %atomicrmw.start loop computes the signed max with v_cmp_gt_i64 and
;    v_cndmask_b32 and retries `flat_atomic_cmpswap_x2 ... glc` until the
;    returned value equals the expected one in every active lane.
;  - Remaining lanes run %atomicrmw.private: a null pointer is replaced by -1,
;    both dwords are loaded with buffer_load_dword, the signed max is selected,
;    and the result is written back with buffer_store_dword.
10684 define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
10685 ; GCN1-LABEL: flat_atomic_max_i64_noret:
10687 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10688 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
10689 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
10690 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10691 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10692 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
10693 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10694 ; GCN1-NEXT: s_cbranch_execnz .LBB80_3
10695 ; GCN1-NEXT: ; %bb.1: ; %Flow3
10696 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10697 ; GCN1-NEXT: s_cbranch_execnz .LBB80_6
10698 ; GCN1-NEXT: .LBB80_2: ; %atomicrmw.phi
10699 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10700 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10701 ; GCN1-NEXT: .LBB80_3: ; %atomicrmw.global
10702 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
10703 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10704 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
10705 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
10706 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
10707 ; GCN1-NEXT: .LBB80_4: ; %atomicrmw.start
10708 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
10709 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10710 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
10711 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
10712 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
10713 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
10714 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10715 ; GCN1-NEXT: buffer_wbinvl1_vol
10716 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
10717 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
10718 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
10719 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
10720 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
10721 ; GCN1-NEXT: s_cbranch_execnz .LBB80_4
10722 ; GCN1-NEXT: ; %bb.5: ; %Flow
10723 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
10724 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10725 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
10726 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10727 ; GCN1-NEXT: s_cbranch_execz .LBB80_2
10728 ; GCN1-NEXT: .LBB80_6: ; %atomicrmw.private
10729 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10730 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
10731 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
10732 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10733 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10734 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10735 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
10736 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
10737 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
10738 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
10739 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
10740 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10741 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10742 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10744 ; GCN2-LABEL: flat_atomic_max_i64_noret:
10746 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10747 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
10748 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
10749 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10750 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10751 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
10752 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10753 ; GCN2-NEXT: s_cbranch_execnz .LBB80_3
10754 ; GCN2-NEXT: ; %bb.1: ; %Flow3
10755 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10756 ; GCN2-NEXT: s_cbranch_execnz .LBB80_6
10757 ; GCN2-NEXT: .LBB80_2: ; %atomicrmw.phi
10758 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10759 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10760 ; GCN2-NEXT: .LBB80_3: ; %atomicrmw.global
10761 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
10762 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10763 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
10764 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
10765 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
10766 ; GCN2-NEXT: .LBB80_4: ; %atomicrmw.start
10767 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
10768 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10769 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
10770 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
10771 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
10772 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
10773 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10774 ; GCN2-NEXT: buffer_wbinvl1_vol
10775 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
10776 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
10777 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
10778 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
10779 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
10780 ; GCN2-NEXT: s_cbranch_execnz .LBB80_4
10781 ; GCN2-NEXT: ; %bb.5: ; %Flow
10782 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
10783 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10784 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
10785 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10786 ; GCN2-NEXT: s_cbranch_execz .LBB80_2
10787 ; GCN2-NEXT: .LBB80_6: ; %atomicrmw.private
10788 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10789 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
10790 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
10791 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10792 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10793 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10794 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
10795 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
10796 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
10797 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
10798 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
10799 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10800 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10801 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10803 ; GCN3-LABEL: flat_atomic_max_i64_noret:
10805 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10806 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
10807 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
10808 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
10809 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10810 ; GCN3-NEXT: s_cbranch_execnz .LBB80_3
10811 ; GCN3-NEXT: ; %bb.1: ; %Flow3
10812 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10813 ; GCN3-NEXT: s_cbranch_execnz .LBB80_6
10814 ; GCN3-NEXT: .LBB80_2: ; %atomicrmw.phi
10815 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10816 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10817 ; GCN3-NEXT: .LBB80_3: ; %atomicrmw.global
10818 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
10819 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
10820 ; GCN3-NEXT: .LBB80_4: ; %atomicrmw.start
10821 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
10822 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10823 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
10824 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
10825 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
10826 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
10827 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10828 ; GCN3-NEXT: buffer_wbinvl1_vol
10829 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
10830 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
10831 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
10832 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
10833 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
10834 ; GCN3-NEXT: s_cbranch_execnz .LBB80_4
10835 ; GCN3-NEXT: ; %bb.5: ; %Flow
10836 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
10837 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
10838 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
10839 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10840 ; GCN3-NEXT: s_cbranch_execz .LBB80_2
10841 ; GCN3-NEXT: .LBB80_6: ; %atomicrmw.private
10842 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10843 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
10844 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10845 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
10846 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10847 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
10848 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
10849 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
10850 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
10851 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
10852 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10853 ; GCN3-NEXT: s_waitcnt vmcnt(0)
10854 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operation under test: signed 64-bit max directly on %ptr (no offset).
10855 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; Test: signed 64-bit 'atomicrmw max' (seq_cst) through a flat pointer whose
; address is %out advanced by 4 i64 elements (the expected code adds 32 to the
; low dword and carries into the high dword). The function is declared void,
; so the old value produced by the atomic is discarded.
;
; For every check prefix the expected code compares the high dword of the
; address against a scalar (loaded from address 0xe4 for the first two
; prefixes, taken from src_private_base for the third) and then splits:
;   * %atomicrmw.global  - load the current value, then retry with
;                          flat_atomic_cmpswap_x2 until the returned value
;                          equals the value that was compared against.
;   * %atomicrmw.private - buffer load both dwords, signed compare
;                          (v_cmp_gt_i64), select, buffer store both dwords.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
10859 define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
10860 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset:
10862 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10863 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
10864 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
10865 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
10866 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10867 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
10868 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10869 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
10870 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10871 ; GCN1-NEXT: s_cbranch_execnz .LBB81_3
10872 ; GCN1-NEXT: ; %bb.1: ; %Flow3
10873 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10874 ; GCN1-NEXT: s_cbranch_execnz .LBB81_6
10875 ; GCN1-NEXT: .LBB81_2: ; %atomicrmw.phi
10876 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10877 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10878 ; GCN1-NEXT: .LBB81_3: ; %atomicrmw.global
10879 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
10880 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10881 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
10882 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
10883 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
10884 ; GCN1-NEXT: .LBB81_4: ; %atomicrmw.start
10885 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
10886 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10887 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
10888 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
10889 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
10890 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
10891 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10892 ; GCN1-NEXT: buffer_wbinvl1_vol
10893 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
10894 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
10895 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
10896 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
10897 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
10898 ; GCN1-NEXT: s_cbranch_execnz .LBB81_4
10899 ; GCN1-NEXT: ; %bb.5: ; %Flow
10900 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
10901 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
10902 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
10903 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10904 ; GCN1-NEXT: s_cbranch_execz .LBB81_2
10905 ; GCN1-NEXT: .LBB81_6: ; %atomicrmw.private
10906 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10907 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
10908 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
10909 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10910 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10911 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10912 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
10913 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
10914 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
10915 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
10916 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
10917 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
10918 ; GCN1-NEXT: s_waitcnt vmcnt(0)
10919 ; GCN1-NEXT: s_setpc_b64 s[30:31]
10921 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset:
10923 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10924 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
10925 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
10926 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
10927 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
10928 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
10929 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
10930 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
10931 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10932 ; GCN2-NEXT: s_cbranch_execnz .LBB81_3
10933 ; GCN2-NEXT: ; %bb.1: ; %Flow3
10934 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10935 ; GCN2-NEXT: s_cbranch_execnz .LBB81_6
10936 ; GCN2-NEXT: .LBB81_2: ; %atomicrmw.phi
10937 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10938 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10939 ; GCN2-NEXT: .LBB81_3: ; %atomicrmw.global
10940 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
10941 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
10942 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
10943 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
10944 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
10945 ; GCN2-NEXT: .LBB81_4: ; %atomicrmw.start
10946 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
10947 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10948 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
10949 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
10950 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
10951 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
10952 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10953 ; GCN2-NEXT: buffer_wbinvl1_vol
10954 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
10955 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
10956 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
10957 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
10958 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
10959 ; GCN2-NEXT: s_cbranch_execnz .LBB81_4
10960 ; GCN2-NEXT: ; %bb.5: ; %Flow
10961 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
10962 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
10963 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
10964 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10965 ; GCN2-NEXT: s_cbranch_execz .LBB81_2
10966 ; GCN2-NEXT: .LBB81_6: ; %atomicrmw.private
10967 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
10968 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
10969 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
10970 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
10971 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
10972 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10973 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
10974 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
10975 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
10976 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
10977 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
10978 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
10979 ; GCN2-NEXT: s_waitcnt vmcnt(0)
10980 ; GCN2-NEXT: s_setpc_b64 s[30:31]
10982 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset:
10984 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10985 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
10986 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10987 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
10988 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
10989 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
10990 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
10991 ; GCN3-NEXT: s_cbranch_execnz .LBB81_3
10992 ; GCN3-NEXT: ; %bb.1: ; %Flow3
10993 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
10994 ; GCN3-NEXT: s_cbranch_execnz .LBB81_6
10995 ; GCN3-NEXT: .LBB81_2: ; %atomicrmw.phi
10996 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
10997 ; GCN3-NEXT: s_setpc_b64 s[30:31]
10998 ; GCN3-NEXT: .LBB81_3: ; %atomicrmw.global
10999 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
11000 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
11001 ; GCN3-NEXT: .LBB81_4: ; %atomicrmw.start
11002 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11003 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11004 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
11005 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
11006 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
11007 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
11008 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11009 ; GCN3-NEXT: buffer_wbinvl1_vol
11010 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
11011 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
11012 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11013 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
11014 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
11015 ; GCN3-NEXT: s_cbranch_execnz .LBB81_4
11016 ; GCN3-NEXT: ; %bb.5: ; %Flow
11017 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
11018 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
11019 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
11020 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11021 ; GCN3-NEXT: s_cbranch_execz .LBB81_2
11022 ; GCN3-NEXT: .LBB81_6: ; %atomicrmw.private
11023 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
11024 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
11025 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
11026 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
11027 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11028 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
11029 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
11030 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
11031 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
11032 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
11033 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
11034 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11035 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: form the address 4 i64 elements past %out, then perform the
; sequentially consistent signed max on it.
11036 %gep = getelementptr i64, ptr %out, i64 4
11037 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; Test: signed 64-bit 'atomicrmw max' (seq_cst) directly on the flat pointer
; %ptr, in a function declared to return i64. The expected code for every
; check prefix copies v[4:5] (the value read from memory before the update)
; into v[0:1] in %atomicrmw.phi right before returning.
; NOTE(review): the IR 'ret' is outside the visible lines; presumably it
; returns %result - confirm against the full file.
;
; For every check prefix the expected code compares the high dword of the
; address against a scalar (loaded from address 0xe4 for the first two
; prefixes, taken from src_private_base for the third) and then splits:
;   * %atomicrmw.global  - load the current value, then retry with
;                          flat_atomic_cmpswap_x2 until the returned value
;                          equals the value that was compared against.
;   * %atomicrmw.private - buffer load both dwords, signed compare
;                          (v_cmp_gt_i64), select, buffer store both dwords.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11041 define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
11042 ; GCN1-LABEL: flat_atomic_max_i64_ret:
11044 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11045 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
11046 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
11047 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
11048 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11049 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
11050 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
11051 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11052 ; GCN1-NEXT: s_cbranch_execz .LBB82_4
11053 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
11054 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
11055 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
11056 ; GCN1-NEXT: flat_load_dword v5, v[4:5]
11057 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
11058 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
11059 ; GCN1-NEXT: .LBB82_2: ; %atomicrmw.start
11060 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11061 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11062 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
11063 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
11064 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
11065 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
11066 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
11067 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
11068 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11069 ; GCN1-NEXT: buffer_wbinvl1_vol
11070 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
11071 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11072 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
11073 ; GCN1-NEXT: s_cbranch_execnz .LBB82_2
11074 ; GCN1-NEXT: ; %bb.3: ; %Flow
11075 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
11076 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
11077 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
11078 ; GCN1-NEXT: .LBB82_4: ; %Flow3
11079 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11080 ; GCN1-NEXT: s_cbranch_execz .LBB82_6
11081 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
11082 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
11083 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
11084 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
11085 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
11086 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
11087 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11088 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3]
11089 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
11090 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
11091 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
11092 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
11093 ; GCN1-NEXT: .LBB82_6: ; %atomicrmw.phi
11094 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
11095 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
11096 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
11097 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11098 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11100 ; GCN2-LABEL: flat_atomic_max_i64_ret:
11102 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11103 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
11104 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
11105 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
11106 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
11107 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
11108 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
11109 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11110 ; GCN2-NEXT: s_cbranch_execz .LBB82_4
11111 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
11112 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
11113 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
11114 ; GCN2-NEXT: flat_load_dword v5, v[4:5]
11115 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
11116 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
11117 ; GCN2-NEXT: .LBB82_2: ; %atomicrmw.start
11118 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
11119 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11120 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
11121 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
11122 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
11123 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
11124 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
11125 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
11126 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11127 ; GCN2-NEXT: buffer_wbinvl1_vol
11128 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
11129 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11130 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
11131 ; GCN2-NEXT: s_cbranch_execnz .LBB82_2
11132 ; GCN2-NEXT: ; %bb.3: ; %Flow
11133 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
11134 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
11135 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
11136 ; GCN2-NEXT: .LBB82_4: ; %Flow3
11137 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11138 ; GCN2-NEXT: s_cbranch_execz .LBB82_6
11139 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
11140 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
11141 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
11142 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
11143 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
11144 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
11145 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11146 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3]
11147 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
11148 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
11149 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
11150 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
11151 ; GCN2-NEXT: .LBB82_6: ; %atomicrmw.phi
11152 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
11153 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
11154 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
11155 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11156 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11158 ; GCN3-LABEL: flat_atomic_max_i64_ret:
11160 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11161 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
11162 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
11163 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
11164 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
11165 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11166 ; GCN3-NEXT: s_cbranch_execz .LBB82_4
11167 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
11168 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
11169 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
11170 ; GCN3-NEXT: .LBB82_2: ; %atomicrmw.start
11171 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11172 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11173 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
11174 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
11175 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
11176 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
11177 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
11178 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
11179 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11180 ; GCN3-NEXT: buffer_wbinvl1_vol
11181 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
11182 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11183 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
11184 ; GCN3-NEXT: s_cbranch_execnz .LBB82_2
11185 ; GCN3-NEXT: ; %bb.3: ; %Flow
11186 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
11187 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
11188 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
11189 ; GCN3-NEXT: .LBB82_4: ; %Flow3
11190 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11191 ; GCN3-NEXT: s_cbranch_execz .LBB82_6
11192 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
11193 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
11194 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
11195 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
11196 ; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
11197 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11198 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[2:3]
11199 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
11200 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
11201 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
11202 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
11203 ; GCN3-NEXT: .LBB82_6: ; %atomicrmw.phi
11204 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
11205 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
11206 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
11207 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11208 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: sequentially consistent signed max on %ptr; %result is the
; value the atomicrmw yields.
11209 %result = atomicrmw max ptr %ptr, i64 %in seq_cst
; Test: signed 64-bit 'atomicrmw max' (seq_cst) through a flat pointer whose
; address is %out advanced by 4 i64 elements (the expected code adds 32 to the
; low dword and carries into the high dword), in a function declared to
; return i64. In the expected code both paths leave the value read from
; memory before the update in v[0:1] when they reach s_setpc_b64.
; NOTE(review): the IR 'ret' is outside the visible lines; presumably it
; returns %result - confirm against the full file.
;
; For every check prefix the expected code compares the high dword of the
; offset address against a scalar (loaded from address 0xe4 for the first two
; prefixes, taken from src_private_base for the third) and then splits:
;   * %atomicrmw.global  - load the current value, then retry with
;                          flat_atomic_cmpswap_x2 until the returned value
;                          equals the value that was compared against.
;   * %atomicrmw.private - buffer load both dwords, signed compare
;                          (v_cmp_gt_i64), select, buffer store both dwords.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
11213 define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
11214 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset:
11216 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11217 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
11218 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
11219 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
11220 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
11221 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11222 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
11223 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
11224 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
11225 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11226 ; GCN1-NEXT: s_cbranch_execnz .LBB83_3
11227 ; GCN1-NEXT: ; %bb.1: ; %Flow3
11228 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11229 ; GCN1-NEXT: s_cbranch_execnz .LBB83_6
11230 ; GCN1-NEXT: .LBB83_2: ; %atomicrmw.phi
11231 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
11232 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11233 ; GCN1-NEXT: .LBB83_3: ; %atomicrmw.global
11234 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
11235 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
11236 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
11237 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
11238 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
11239 ; GCN1-NEXT: .LBB83_4: ; %atomicrmw.start
11240 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11241 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11242 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
11243 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
11244 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
11245 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
11246 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
11247 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
11248 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11249 ; GCN1-NEXT: buffer_wbinvl1_vol
11250 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11251 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11252 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
11253 ; GCN1-NEXT: s_cbranch_execnz .LBB83_4
11254 ; GCN1-NEXT: ; %bb.5: ; %Flow
11255 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
11256 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
11257 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
11258 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11259 ; GCN1-NEXT: s_cbranch_execz .LBB83_2
11260 ; GCN1-NEXT: .LBB83_6: ; %atomicrmw.private
11261 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
11262 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
11263 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
11264 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
11265 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
11266 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11267 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
11268 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
11269 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
11270 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
11271 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
11272 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
11273 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11274 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11276 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset:
11278 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11279 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
11280 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
11281 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
11282 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
11283 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
11284 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
11285 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
11286 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
11287 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11288 ; GCN2-NEXT: s_cbranch_execnz .LBB83_3
11289 ; GCN2-NEXT: ; %bb.1: ; %Flow3
11290 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11291 ; GCN2-NEXT: s_cbranch_execnz .LBB83_6
11292 ; GCN2-NEXT: .LBB83_2: ; %atomicrmw.phi
11293 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
11294 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11295 ; GCN2-NEXT: .LBB83_3: ; %atomicrmw.global
11296 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
11297 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
11298 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
11299 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
11300 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
11301 ; GCN2-NEXT: .LBB83_4: ; %atomicrmw.start
11302 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
11303 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11304 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
11305 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
11306 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
11307 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
11308 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
11309 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
11310 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11311 ; GCN2-NEXT: buffer_wbinvl1_vol
11312 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11313 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11314 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
11315 ; GCN2-NEXT: s_cbranch_execnz .LBB83_4
11316 ; GCN2-NEXT: ; %bb.5: ; %Flow
11317 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
11318 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
11319 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
11320 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11321 ; GCN2-NEXT: s_cbranch_execz .LBB83_2
11322 ; GCN2-NEXT: .LBB83_6: ; %atomicrmw.private
11323 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
11324 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
11325 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
11326 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
11327 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
11328 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11329 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
11330 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
11331 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
11332 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
11333 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
11334 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
11335 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11336 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11338 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset:
11340 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11341 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
11342 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
11343 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
11344 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
11345 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
11346 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
11347 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
11348 ; GCN3-NEXT: s_cbranch_execnz .LBB83_3
11349 ; GCN3-NEXT: ; %bb.1: ; %Flow3
11350 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11351 ; GCN3-NEXT: s_cbranch_execnz .LBB83_6
11352 ; GCN3-NEXT: .LBB83_2: ; %atomicrmw.phi
11353 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
11354 ; GCN3-NEXT: s_setpc_b64 s[30:31]
11355 ; GCN3-NEXT: .LBB83_3: ; %atomicrmw.global
11356 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
11357 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
11358 ; GCN3-NEXT: .LBB83_4: ; %atomicrmw.start
11359 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11360 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11361 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
11362 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
11363 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
11364 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
11365 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
11366 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
11367 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11368 ; GCN3-NEXT: buffer_wbinvl1_vol
11369 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11370 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
11371 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
11372 ; GCN3-NEXT: s_cbranch_execnz .LBB83_4
11373 ; GCN3-NEXT: ; %bb.5: ; %Flow
11374 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
11375 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
11376 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
11377 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
11378 ; GCN3-NEXT: s_cbranch_execz .LBB83_2
11379 ; GCN3-NEXT: .LBB83_6: ; %atomicrmw.private
11380 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
11381 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
11382 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
11383 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
11384 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11385 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
11386 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
11387 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
11388 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
11389 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
11390 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
11391 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11392 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: form the address 4 i64 elements past %out, then perform the
; sequentially consistent signed max on it; %result is the value the
; atomicrmw yields.
11393 %gep = getelementptr i64, ptr %out, i64 4
11394 %result = atomicrmw max ptr %gep, i64 %in seq_cst
11398 define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
11399 ; GCN1-LABEL: flat_atomic_max_i64_noret_scalar:
11401 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11402 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
11403 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
11404 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11405 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
11406 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
11407 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11408 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
11409 ; GCN1-NEXT: s_cbranch_vccnz .LBB84_3
11410 ; GCN1-NEXT: ; %bb.1: ; %Flow3
11411 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
11412 ; GCN1-NEXT: s_cbranch_vccnz .LBB84_6
11413 ; GCN1-NEXT: .LBB84_2: ; %atomicrmw.phi
11414 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11415 ; GCN1-NEXT: .LBB84_3: ; %atomicrmw.global
11416 ; GCN1-NEXT: s_add_u32 s34, s4, 4
11417 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
11418 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
11419 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
11420 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
11421 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
11422 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
11423 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
11424 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
11425 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
11426 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
11427 ; GCN1-NEXT: .LBB84_4: ; %atomicrmw.start
11428 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11429 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11430 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11431 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11432 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11433 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11434 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11435 ; GCN1-NEXT: buffer_wbinvl1_vol
11436 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11437 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
11438 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11439 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
11440 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
11441 ; GCN1-NEXT: s_cbranch_execnz .LBB84_4
11442 ; GCN1-NEXT: ; %bb.5: ; %Flow
11443 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
11444 ; GCN1-NEXT: s_branch .LBB84_2
11445 ; GCN1-NEXT: .LBB84_6: ; %atomicrmw.private
11446 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
11447 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
11448 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
11449 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
11450 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
11451 ; GCN1-NEXT: s_add_i32 s34, s34, 4
11452 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
11453 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11454 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11455 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
11456 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11457 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11458 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
11459 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
11460 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11461 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
11462 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11463 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11465 ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar:
11467 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11468 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
11469 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
11470 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
11471 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
11472 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
11473 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11474 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
11475 ; GCN2-NEXT: s_cbranch_vccnz .LBB84_3
11476 ; GCN2-NEXT: ; %bb.1: ; %Flow3
11477 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
11478 ; GCN2-NEXT: s_cbranch_vccnz .LBB84_6
11479 ; GCN2-NEXT: .LBB84_2: ; %atomicrmw.phi
11480 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11481 ; GCN2-NEXT: .LBB84_3: ; %atomicrmw.global
11482 ; GCN2-NEXT: s_add_u32 s34, s4, 4
11483 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
11484 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
11485 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
11486 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
11487 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
11488 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
11489 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
11490 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
11491 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
11492 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
11493 ; GCN2-NEXT: .LBB84_4: ; %atomicrmw.start
11494 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
11495 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11496 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11497 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11498 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11499 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11500 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11501 ; GCN2-NEXT: buffer_wbinvl1_vol
11502 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11503 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
11504 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11505 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
11506 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
11507 ; GCN2-NEXT: s_cbranch_execnz .LBB84_4
11508 ; GCN2-NEXT: ; %bb.5: ; %Flow
11509 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
11510 ; GCN2-NEXT: s_branch .LBB84_2
11511 ; GCN2-NEXT: .LBB84_6: ; %atomicrmw.private
11512 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
11513 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
11514 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
11515 ; GCN2-NEXT: s_add_i32 s34, s34, 4
11516 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
11517 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11518 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11519 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
11520 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
11521 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11522 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11523 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
11524 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
11525 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11526 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
11527 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11528 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11530 ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar:
11532 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11533 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
11534 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
11535 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
11536 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11537 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
11538 ; GCN3-NEXT: s_cbranch_vccnz .LBB84_3
11539 ; GCN3-NEXT: ; %bb.1: ; %Flow3
11540 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
11541 ; GCN3-NEXT: s_cbranch_vccnz .LBB84_6
11542 ; GCN3-NEXT: .LBB84_2: ; %atomicrmw.phi
11543 ; GCN3-NEXT: s_setpc_b64 s[30:31]
11544 ; GCN3-NEXT: .LBB84_3: ; %atomicrmw.global
11545 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
11546 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
11547 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
11548 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
11549 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
11550 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
11551 ; GCN3-NEXT: .LBB84_4: ; %atomicrmw.start
11552 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11553 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11554 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11555 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11556 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11557 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11558 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11559 ; GCN3-NEXT: buffer_wbinvl1_vol
11560 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11561 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
11562 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11563 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
11564 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
11565 ; GCN3-NEXT: s_cbranch_execnz .LBB84_4
11566 ; GCN3-NEXT: ; %bb.5: ; %Flow
11567 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
11568 ; GCN3-NEXT: s_branch .LBB84_2
11569 ; GCN3-NEXT: .LBB84_6: ; %atomicrmw.private
11570 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
11571 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
11572 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
11573 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11574 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
11575 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
11576 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
11577 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11578 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11579 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
11580 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
11581 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11582 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
11583 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11584 ; GCN3-NEXT: s_setpc_b64 s[30:31]
11585 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
; Test: 64-bit signed `atomicrmw max` (seq_cst) in a void amdgpu_gfx function.
; Both the pointer and the operand are `inreg` arguments; the expected code
; reads them from s[4:5] and s[6:7]. The pointer is advanced by 4 x i64
; (32 bytes) before the atomic, and %tmp0 has no visible use.
; Expected shape for all three RUN lines: the high half of the address is
; compared with a private-aperture value (src_private_base on gfx900, an
; s_load_dword result on bonaire/tonga). The code then either runs a
; flat_atomic_cmpswap_x2 retry loop (%atomicrmw.global) or a scratch
; buffer load / compare / select / store sequence (%atomicrmw.private).
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing.
11589 define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
11590 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset_scalar:
11592 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11593 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
11594 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
11595 ; GCN1-NEXT: s_add_u32 s34, s4, 32
11596 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
11597 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11598 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
11599 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
11600 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
11601 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
11602 ; GCN1-NEXT: s_cbranch_vccnz .LBB85_3
11603 ; GCN1-NEXT: ; %bb.1: ; %Flow3
11604 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
11605 ; GCN1-NEXT: s_cbranch_vccnz .LBB85_6
11606 ; GCN1-NEXT: .LBB85_2: ; %atomicrmw.phi
11607 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11608 ; GCN1-NEXT: .LBB85_3: ; %atomicrmw.global
11609 ; GCN1-NEXT: s_add_u32 s36, s34, 4
11610 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
11611 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
11612 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
11613 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
11614 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
11615 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
11616 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
11617 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
11618 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
11619 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
11620 ; GCN1-NEXT: .LBB85_4: ; %atomicrmw.start
11621 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11622 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11623 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11624 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11625 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11626 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11627 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11628 ; GCN1-NEXT: buffer_wbinvl1_vol
11629 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11630 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
11631 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
11632 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
11633 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
11634 ; GCN1-NEXT: s_cbranch_execnz .LBB85_4
11635 ; GCN1-NEXT: ; %bb.5: ; %Flow
11636 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
11637 ; GCN1-NEXT: s_branch .LBB85_2
11638 ; GCN1-NEXT: .LBB85_6: ; %atomicrmw.private
11639 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
11640 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
11641 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
11642 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
11643 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
11644 ; GCN1-NEXT: s_add_i32 s34, s34, 4
11645 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
11646 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11647 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11648 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
11649 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11650 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11651 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
11652 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
11653 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11654 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
11655 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11656 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11658 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar:
11660 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11661 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
11662 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
11663 ; GCN2-NEXT: s_add_u32 s34, s4, 32
11664 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
11665 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
11666 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
11667 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
11668 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
11669 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
11670 ; GCN2-NEXT: s_cbranch_vccnz .LBB85_3
11671 ; GCN2-NEXT: ; %bb.1: ; %Flow3
11672 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
11673 ; GCN2-NEXT: s_cbranch_vccnz .LBB85_6
11674 ; GCN2-NEXT: .LBB85_2: ; %atomicrmw.phi
11675 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11676 ; GCN2-NEXT: .LBB85_3: ; %atomicrmw.global
11677 ; GCN2-NEXT: s_add_u32 s36, s34, 4
11678 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
11679 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
11680 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
11681 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
11682 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
11683 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
11684 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
11685 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
11686 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
11687 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
11688 ; GCN2-NEXT: .LBB85_4: ; %atomicrmw.start
11689 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
11690 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11691 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11692 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11693 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11694 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11695 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11696 ; GCN2-NEXT: buffer_wbinvl1_vol
11697 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11698 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
11699 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
11700 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
11701 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
11702 ; GCN2-NEXT: s_cbranch_execnz .LBB85_4
11703 ; GCN2-NEXT: ; %bb.5: ; %Flow
11704 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
11705 ; GCN2-NEXT: s_branch .LBB85_2
11706 ; GCN2-NEXT: .LBB85_6: ; %atomicrmw.private
11707 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
11708 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
11709 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
11710 ; GCN2-NEXT: s_add_i32 s34, s34, 4
11711 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
11712 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11713 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11714 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
11715 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
11716 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11717 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11718 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
11719 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
11720 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11721 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
11722 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11723 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11725 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar:
11727 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11728 ; GCN3-NEXT: s_add_u32 s34, s4, 32
11729 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
11730 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
11731 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
11732 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
11733 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
11734 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
11735 ; GCN3-NEXT: s_cbranch_vccnz .LBB85_3
11736 ; GCN3-NEXT: ; %bb.1: ; %Flow3
11737 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
11738 ; GCN3-NEXT: s_cbranch_vccnz .LBB85_6
11739 ; GCN3-NEXT: .LBB85_2: ; %atomicrmw.phi
11740 ; GCN3-NEXT: s_setpc_b64 s[30:31]
11741 ; GCN3-NEXT: .LBB85_3: ; %atomicrmw.global
11742 ; GCN3-NEXT: v_mov_b32_e32 v4, s34
11743 ; GCN3-NEXT: v_mov_b32_e32 v5, s35
11744 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
11745 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
11746 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
11747 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
11748 ; GCN3-NEXT: .LBB85_4: ; %atomicrmw.start
11749 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11750 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11751 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
11752 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
11753 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
11754 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
11755 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11756 ; GCN3-NEXT: buffer_wbinvl1_vol
11757 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11758 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
11759 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
11760 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
11761 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
11762 ; GCN3-NEXT: s_cbranch_execnz .LBB85_4
11763 ; GCN3-NEXT: ; %bb.5: ; %Flow
11764 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
11765 ; GCN3-NEXT: s_branch .LBB85_2
11766 ; GCN3-NEXT: .LBB85_6: ; %atomicrmw.private
11767 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
11768 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
11769 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
11770 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11771 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
11772 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
11773 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
11774 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11775 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11776 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
11777 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
11778 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
11779 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
11780 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11781 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Element index 4 of i64 is the 32-byte offset added to s[4:5] in the expected code above.
11782 %gep = getelementptr i64, ptr %out, i64 4
11783 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; Test: 64-bit signed `atomicrmw max` (seq_cst) in an amdgpu_gfx function
; declared to return i64. Both the pointer and the operand are `inreg`
; arguments; the expected code reads them from s[4:5] and s[6:7]. The atomic
; is applied directly to %ptr with no offset, and its result is named %result.
; Expected shape for all three RUN lines: the high half of the address is
; compared with a private-aperture value (src_private_base on gfx900, an
; s_load_dword result on bonaire/tonga). The %atomicrmw.global path runs a
; flat_atomic_cmpswap_x2 retry loop that leaves the loaded value in v[0:1].
; The %atomicrmw.private path loads the old value into v[0:1] from scratch
; and stores the selected maximum from separate registers, so v[0:1] still
; holds the old value at %atomicrmw.phi.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing.
11787 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
11788 ; GCN1-LABEL: flat_atomic_max_i64_ret_scalar:
11790 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11791 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
11792 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
11793 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11794 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
11795 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
11796 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11797 ; GCN1-NEXT: s_cbranch_vccz .LBB86_4
11798 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
11799 ; GCN1-NEXT: s_add_u32 s34, s4, 4
11800 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
11801 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
11802 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
11803 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
11804 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
11805 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
11806 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
11807 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
11808 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
11809 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
11810 ; GCN1-NEXT: .LBB86_2: ; %atomicrmw.start
11811 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11812 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11813 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
11814 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
11815 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
11816 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
11817 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
11818 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
11819 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11820 ; GCN1-NEXT: buffer_wbinvl1_vol
11821 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11822 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11823 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
11824 ; GCN1-NEXT: s_cbranch_execnz .LBB86_2
11825 ; GCN1-NEXT: ; %bb.3: ; %Flow
11826 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
11827 ; GCN1-NEXT: s_branch .LBB86_6
11828 ; GCN1-NEXT: .LBB86_4:
11829 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
11830 ; GCN1-NEXT: s_cbranch_execz .LBB86_6
11831 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
11832 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
11833 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
11834 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
11835 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
11836 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
11837 ; GCN1-NEXT: s_add_i32 s34, s34, 4
11838 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
11839 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11840 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11841 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
11842 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11843 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11844 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
11845 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
11846 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
11847 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
11848 ; GCN1-NEXT: .LBB86_6: ; %atomicrmw.phi
11849 ; GCN1-NEXT: s_waitcnt vmcnt(0)
11850 ; GCN1-NEXT: s_setpc_b64 s[30:31]
11852 ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar:
11854 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11855 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
11856 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
11857 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
11858 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
11859 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
11860 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11861 ; GCN2-NEXT: s_cbranch_vccz .LBB86_4
11862 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
11863 ; GCN2-NEXT: s_add_u32 s34, s4, 4
11864 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
11865 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
11866 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
11867 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
11868 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
11869 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
11870 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
11871 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
11872 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
11873 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
11874 ; GCN2-NEXT: .LBB86_2: ; %atomicrmw.start
11875 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
11876 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11877 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
11878 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
11879 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
11880 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
11881 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
11882 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
11883 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11884 ; GCN2-NEXT: buffer_wbinvl1_vol
11885 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11886 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11887 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
11888 ; GCN2-NEXT: s_cbranch_execnz .LBB86_2
11889 ; GCN2-NEXT: ; %bb.3: ; %Flow
11890 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
11891 ; GCN2-NEXT: s_branch .LBB86_6
11892 ; GCN2-NEXT: .LBB86_4:
11893 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
11894 ; GCN2-NEXT: s_cbranch_execz .LBB86_6
11895 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
11896 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
11897 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
11898 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
11899 ; GCN2-NEXT: s_add_i32 s34, s34, 4
11900 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
11901 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11902 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
11903 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
11904 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
11905 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11906 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11907 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
11908 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
11909 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
11910 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
11911 ; GCN2-NEXT: .LBB86_6: ; %atomicrmw.phi
11912 ; GCN2-NEXT: s_waitcnt vmcnt(0)
11913 ; GCN2-NEXT: s_setpc_b64 s[30:31]
11915 ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar:
11917 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11918 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
11919 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
11920 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
11921 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
11922 ; GCN3-NEXT: s_cbranch_vccz .LBB86_4
11923 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
11924 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
11925 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
11926 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
11927 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
11928 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
11929 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
11930 ; GCN3-NEXT: .LBB86_2: ; %atomicrmw.start
11931 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
11932 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11933 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
11934 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
11935 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
11936 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
11937 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
11938 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
11939 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11940 ; GCN3-NEXT: buffer_wbinvl1_vol
11941 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
11942 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
11943 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
11944 ; GCN3-NEXT: s_cbranch_execnz .LBB86_2
11945 ; GCN3-NEXT: ; %bb.3: ; %Flow
11946 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
11947 ; GCN3-NEXT: s_branch .LBB86_6
11948 ; GCN3-NEXT: .LBB86_4:
11949 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
11950 ; GCN3-NEXT: s_cbranch_execz .LBB86_6
11951 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
11952 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
11953 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
11954 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
11955 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
11956 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
11957 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
11958 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
11959 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11960 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
11961 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
11962 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
11963 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
11964 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
11965 ; GCN3-NEXT: .LBB86_6: ; %atomicrmw.phi
11966 ; GCN3-NEXT: s_waitcnt vmcnt(0)
11967 ; GCN3-NEXT: s_setpc_b64 s[30:31]
11968 %result = atomicrmw max ptr %ptr, i64 %in seq_cst
; Test: 64-bit signed `atomicrmw max` (seq_cst) in an amdgpu_gfx function
; declared to return i64. Both the pointer and the operand are `inreg`
; arguments; the expected code reads them from s[4:5] and s[6:7]. The pointer
; is advanced by 4 x i64 (32 bytes) before the atomic, whose result is named
; %result.
; Expected shape for all three RUN lines: the offset address is formed in
; s[34:35], and its high half is compared with a private-aperture value
; (src_private_base on gfx900, an s_load_dword result on bonaire/tonga).
; The %atomicrmw.global path runs a flat_atomic_cmpswap_x2 retry loop; the
; %atomicrmw.private path does a scratch buffer load / compare / select /
; store. In both paths the loaded old value is left in v[0:1].
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing.
11972 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
11973 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset_scalar:
11975 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11976 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
11977 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
11978 ; GCN1-NEXT: s_add_u32 s34, s4, 32
11979 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
11980 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
11981 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
11982 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
11983 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
11984 ; GCN1-NEXT: s_cbranch_vccz .LBB87_4
11985 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
11986 ; GCN1-NEXT: s_add_u32 s36, s34, 4
11987 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
11988 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
11989 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
11990 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
11991 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
11992 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
11993 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
11994 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
11995 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
11996 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
11997 ; GCN1-NEXT: .LBB87_2: ; %atomicrmw.start
11998 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
11999 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12000 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
12001 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
12002 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
12003 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12004 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12005 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12006 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12007 ; GCN1-NEXT: buffer_wbinvl1_vol
12008 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12009 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
12010 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
12011 ; GCN1-NEXT: s_cbranch_execnz .LBB87_2
12012 ; GCN1-NEXT: ; %bb.3: ; %Flow
12013 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
12014 ; GCN1-NEXT: s_branch .LBB87_6
12015 ; GCN1-NEXT: .LBB87_4:
12016 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
12017 ; GCN1-NEXT: s_cbranch_execz .LBB87_6
12018 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
12019 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
12020 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
12021 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
12022 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
12023 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
12024 ; GCN1-NEXT: s_add_i32 s34, s34, 4
12025 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
12026 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
12027 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
12028 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
12029 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12030 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
12031 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12032 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12033 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
12034 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
12035 ; GCN1-NEXT: .LBB87_6: ; %atomicrmw.phi
12036 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12037 ; GCN1-NEXT: s_setpc_b64 s[30:31]
12039 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar:
12041 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12042 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
12043 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
12044 ; GCN2-NEXT: s_add_u32 s34, s4, 32
12045 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
12046 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
12047 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
12048 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
12049 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
12050 ; GCN2-NEXT: s_cbranch_vccz .LBB87_4
12051 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
12052 ; GCN2-NEXT: s_add_u32 s36, s34, 4
12053 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
12054 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
12055 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
12056 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
12057 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
12058 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
12059 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
12060 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
12061 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
12062 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
12063 ; GCN2-NEXT: .LBB87_2: ; %atomicrmw.start
12064 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
12065 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12066 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
12067 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
12068 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
12069 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12070 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12071 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12072 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12073 ; GCN2-NEXT: buffer_wbinvl1_vol
12074 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12075 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
12076 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
12077 ; GCN2-NEXT: s_cbranch_execnz .LBB87_2
12078 ; GCN2-NEXT: ; %bb.3: ; %Flow
12079 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
12080 ; GCN2-NEXT: s_branch .LBB87_6
12081 ; GCN2-NEXT: .LBB87_4:
12082 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
12083 ; GCN2-NEXT: s_cbranch_execz .LBB87_6
12084 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
12085 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
12086 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
12087 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
12088 ; GCN2-NEXT: s_add_i32 s34, s34, 4
12089 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
12090 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
12091 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
12092 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
12093 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
12094 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12095 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
12096 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12097 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12098 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
12099 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
12100 ; GCN2-NEXT: .LBB87_6: ; %atomicrmw.phi
12101 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12102 ; GCN2-NEXT: s_setpc_b64 s[30:31]
12104 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar:
12106 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12107 ; GCN3-NEXT: s_add_u32 s34, s4, 32
12108 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
12109 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
12110 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
12111 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
12112 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
12113 ; GCN3-NEXT: s_cbranch_vccz .LBB87_4
12114 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
12115 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
12116 ; GCN3-NEXT: v_mov_b32_e32 v3, s35
12117 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12118 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
12119 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
12120 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
12121 ; GCN3-NEXT: .LBB87_2: ; %atomicrmw.start
12122 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
12123 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12124 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
12125 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
12126 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
12127 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12128 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12129 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12130 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12131 ; GCN3-NEXT: buffer_wbinvl1_vol
12132 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12133 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
12134 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
12135 ; GCN3-NEXT: s_cbranch_execnz .LBB87_2
12136 ; GCN3-NEXT: ; %bb.3: ; %Flow
12137 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
12138 ; GCN3-NEXT: s_branch .LBB87_6
12139 ; GCN3-NEXT: .LBB87_4:
12140 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
12141 ; GCN3-NEXT: s_cbranch_execz .LBB87_6
12142 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
12143 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
12144 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
12145 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
12146 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
12147 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
12148 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
12149 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
12150 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12151 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
12152 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
12153 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
12154 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
12155 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
12156 ; GCN3-NEXT: .LBB87_6: ; %atomicrmw.phi
12157 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12158 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Element index 4 of i64 is the 32-byte offset added to s[4:5] in the expected code above.
12159 %gep = getelementptr i64, ptr %out, i64 4
12160 %result = atomicrmw max ptr %gep, i64 %in seq_cst
12164 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
12165 ; GCN1-LABEL: atomic_max_i64_addr64_offset:
12166 ; GCN1: ; %bb.0: ; %entry
12167 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
12168 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
12169 ; GCN1-NEXT: s_mov_b32 s14, -1
12170 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
12171 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
12172 ; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f
12173 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
12174 ; GCN1-NEXT: s_add_u32 s12, s12, s11
12175 ; GCN1-NEXT: s_addc_u32 s13, s13, 0
12176 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
12177 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
12178 ; GCN1-NEXT: s_add_u32 s0, s0, s4
12179 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
12180 ; GCN1-NEXT: s_add_u32 s0, s0, 32
12181 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
12182 ; GCN1-NEXT: s_cmp_eq_u32 s1, s8
12183 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
12184 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12185 ; GCN1-NEXT: s_mov_b64 s[4:5], -1
12186 ; GCN1-NEXT: s_cbranch_vccnz .LBB88_3
12187 ; GCN1-NEXT: ; %bb.1: ; %Flow6
12188 ; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5]
12189 ; GCN1-NEXT: s_cbranch_vccnz .LBB88_6
12190 ; GCN1-NEXT: .LBB88_2: ; %atomicrmw.phi
12191 ; GCN1-NEXT: s_endpgm
12192 ; GCN1-NEXT: .LBB88_3: ; %atomicrmw.global
12193 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
12194 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
12195 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12196 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
12197 ; GCN1-NEXT: v_mov_b32_e32 v6, s3
12198 ; GCN1-NEXT: v_mov_b32_e32 v7, s2
12199 ; GCN1-NEXT: .LBB88_4: ; %atomicrmw.start
12200 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
12201 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12202 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12203 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12204 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12205 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12206 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12207 ; GCN1-NEXT: buffer_wbinvl1_vol
12208 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12209 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
12210 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12211 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
12212 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
12213 ; GCN1-NEXT: s_cbranch_execnz .LBB88_4
12214 ; GCN1-NEXT: ; %bb.5: ; %Flow
12215 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
12216 ; GCN1-NEXT: s_branch .LBB88_2
12217 ; GCN1-NEXT: .LBB88_6: ; %atomicrmw.private
12218 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
12219 ; GCN1-NEXT: v_mov_b32_e32 v5, s2
12220 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
12221 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
12222 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12223 ; GCN1-NEXT: s_add_i32 s0, s0, 4
12224 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
12225 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
12226 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
12227 ; GCN1-NEXT: v_mov_b32_e32 v4, s3
12228 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12229 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12230 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
12231 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
12232 ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
12233 ; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
12234 ; GCN1-NEXT: s_endpgm
12236 ; GCN2-LABEL: atomic_max_i64_addr64_offset:
12237 ; GCN2: ; %bb.0: ; %entry
12238 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
12239 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
12240 ; GCN2-NEXT: s_mov_b32 s90, -1
12241 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
12242 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
12243 ; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc
12244 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
12245 ; GCN2-NEXT: s_add_u32 s88, s88, s11
12246 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
12247 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
12248 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
12249 ; GCN2-NEXT: s_add_u32 s0, s0, s4
12250 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
12251 ; GCN2-NEXT: s_add_u32 s0, s0, 32
12252 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
12253 ; GCN2-NEXT: s_cmp_eq_u32 s1, s8
12254 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
12255 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12256 ; GCN2-NEXT: s_mov_b64 s[4:5], -1
12257 ; GCN2-NEXT: s_cbranch_vccnz .LBB88_3
12258 ; GCN2-NEXT: ; %bb.1: ; %Flow6
12259 ; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5]
12260 ; GCN2-NEXT: s_cbranch_vccnz .LBB88_6
12261 ; GCN2-NEXT: .LBB88_2: ; %atomicrmw.phi
12262 ; GCN2-NEXT: s_endpgm
12263 ; GCN2-NEXT: .LBB88_3: ; %atomicrmw.global
12264 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
12265 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
12266 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12267 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
12268 ; GCN2-NEXT: v_mov_b32_e32 v6, s3
12269 ; GCN2-NEXT: v_mov_b32_e32 v7, s2
12270 ; GCN2-NEXT: .LBB88_4: ; %atomicrmw.start
12271 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
12272 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12273 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12274 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12275 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12276 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12277 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12278 ; GCN2-NEXT: buffer_wbinvl1_vol
12279 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12280 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
12281 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12282 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
12283 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
12284 ; GCN2-NEXT: s_cbranch_execnz .LBB88_4
12285 ; GCN2-NEXT: ; %bb.5: ; %Flow
12286 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
12287 ; GCN2-NEXT: s_branch .LBB88_2
12288 ; GCN2-NEXT: .LBB88_6: ; %atomicrmw.private
12289 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
12290 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
12291 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12292 ; GCN2-NEXT: s_add_i32 s0, s0, 4
12293 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
12294 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
12295 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
12296 ; GCN2-NEXT: v_mov_b32_e32 v5, s2
12297 ; GCN2-NEXT: v_mov_b32_e32 v4, s3
12298 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12299 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12300 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
12301 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
12302 ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
12303 ; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
12304 ; GCN2-NEXT: s_endpgm
12306 ; GCN3-LABEL: atomic_max_i64_addr64_offset:
12307 ; GCN3: ; %bb.0: ; %entry
12308 ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
12309 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
12310 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
12311 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
12312 ; GCN3-NEXT: s_mov_b32 s14, -1
12313 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000
12314 ; GCN3-NEXT: s_add_u32 s12, s12, s11
12315 ; GCN3-NEXT: s_addc_u32 s13, s13, 0
12316 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
12317 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
12318 ; GCN3-NEXT: s_add_u32 s0, s0, s6
12319 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
12320 ; GCN3-NEXT: s_add_u32 s0, s0, 32
12321 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
12322 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
12323 ; GCN3-NEXT: s_cmp_eq_u32 s1, s5
12324 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0
12325 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12326 ; GCN3-NEXT: s_mov_b64 s[4:5], -1
12327 ; GCN3-NEXT: s_cbranch_vccnz .LBB88_3
12328 ; GCN3-NEXT: ; %bb.1: ; %Flow6
12329 ; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5]
12330 ; GCN3-NEXT: s_cbranch_vccnz .LBB88_6
12331 ; GCN3-NEXT: .LBB88_2: ; %atomicrmw.phi
12332 ; GCN3-NEXT: s_endpgm
12333 ; GCN3-NEXT: .LBB88_3: ; %atomicrmw.global
12334 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
12335 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
12336 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12337 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
12338 ; GCN3-NEXT: v_mov_b32_e32 v6, s3
12339 ; GCN3-NEXT: v_mov_b32_e32 v7, s2
12340 ; GCN3-NEXT: .LBB88_4: ; %atomicrmw.start
12341 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
12342 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12343 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12344 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12345 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12346 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12347 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12348 ; GCN3-NEXT: buffer_wbinvl1_vol
12349 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12350 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
12351 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12352 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
12353 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
12354 ; GCN3-NEXT: s_cbranch_execnz .LBB88_4
12355 ; GCN3-NEXT: ; %bb.5: ; %Flow
12356 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
12357 ; GCN3-NEXT: s_branch .LBB88_2
12358 ; GCN3-NEXT: .LBB88_6: ; %atomicrmw.private
12359 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
12360 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
12361 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12362 ; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
12363 ; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
12364 ; GCN3-NEXT: v_mov_b32_e32 v4, s2
12365 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
12366 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12367 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12368 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
12369 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
12370 ; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
12371 ; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
12372 ; GCN3-NEXT: s_endpgm
12374 %ptr = getelementptr i64, ptr %out, i64 %index
12375 %gep = getelementptr i64, ptr %ptr, i64 4
12376 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
; Test: seq_cst signed 64-bit `atomicrmw max` on a flat pointer in a kernel,
; with the returned (old) value stored through %out2.
;   %out   - base flat pointer. The accessed address is %out plus %index i64
;            elements plus 4 more i64 elements; the checks show this as a
;            shift-left by 3 followed by an add of 32.
;   %out2  - flat pointer that receives the value returned by the atomicrmw.
;   %in    - operand of the max.
;   %index - i64 element index applied to %out.
; For all three check prefixes the expected code compares the high dword of
; the address against a second value (a dword loaded via s[4:5] on GCN1/GCN2,
; presumably the private aperture base - confirm; src_private_base on GCN3)
; and then runs one of two paths:
;   %atomicrmw.global  - flat load followed by a flat_atomic_cmpswap_x2 loop
;                        that repeats until the returned value equals the
;                        value that was compared against.
;   %atomicrmw.private - buffer_load / v_cmp_lt_i64 / v_cndmask / buffer_store
;                        through the scratch resource descriptor; an all-zero
;                        address is first replaced by offset -1.
; Both paths meet at %atomicrmw.phi, where v[0:1] is written with
; flat_store_dwordx2 to the address taken from s[10:11].
; The assertions below are autogenerated (see the note on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
12380 define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
12381 ; GCN1-LABEL: atomic_max_i64_ret_addr64_offset:
12382 ; GCN1: ; %bb.0: ; %entry
12383 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
12384 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
12385 ; GCN1-NEXT: s_mov_b32 s18, -1
12386 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
12387 ; GCN1-NEXT: s_add_u32 s16, s16, s11
12388 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
12389 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
12390 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
12391 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
12392 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12393 ; GCN1-NEXT: s_add_u32 s0, s8, s0
12394 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
12395 ; GCN1-NEXT: s_add_u32 s0, s0, 32
12396 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
12397 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
12398 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
12399 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12400 ; GCN1-NEXT: s_cbranch_vccz .LBB89_4
12401 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
12402 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
12403 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12404 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12405 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
12406 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
12407 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
12408 ; GCN1-NEXT: .LBB89_2: ; %atomicrmw.start
12409 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
12410 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12411 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
12412 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
12413 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12414 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12415 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12416 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12417 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12418 ; GCN1-NEXT: buffer_wbinvl1_vol
12419 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12420 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12421 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
12422 ; GCN1-NEXT: s_cbranch_execnz .LBB89_2
12423 ; GCN1-NEXT: ; %bb.3: ; %Flow
12424 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
12425 ; GCN1-NEXT: s_branch .LBB89_6
12426 ; GCN1-NEXT: .LBB89_4:
12427 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
12428 ; GCN1-NEXT: s_cbranch_execz .LBB89_6
12429 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
12430 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
12431 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
12432 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
12433 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
12434 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12435 ; GCN1-NEXT: s_add_i32 s0, s0, 4
12436 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
12437 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
12438 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
12439 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
12440 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12441 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
12442 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12443 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12444 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
12445 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
12446 ; GCN1-NEXT: .LBB89_6: ; %atomicrmw.phi
12447 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
12448 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
12449 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
12450 ; GCN1-NEXT: s_endpgm
12452 ; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
12453 ; GCN2: ; %bb.0: ; %entry
12454 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
12455 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
12456 ; GCN2-NEXT: s_mov_b32 s90, -1
12457 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
12458 ; GCN2-NEXT: s_add_u32 s88, s88, s11
12459 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
12460 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
12461 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
12462 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
12463 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12464 ; GCN2-NEXT: s_add_u32 s0, s8, s0
12465 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
12466 ; GCN2-NEXT: s_add_u32 s0, s0, 32
12467 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
12468 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
12469 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
12470 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12471 ; GCN2-NEXT: s_cbranch_vccz .LBB89_4
12472 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
12473 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
12474 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12475 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12476 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
12477 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
12478 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
12479 ; GCN2-NEXT: .LBB89_2: ; %atomicrmw.start
12480 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
12481 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12482 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
12483 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
12484 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12485 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12486 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12487 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12488 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12489 ; GCN2-NEXT: buffer_wbinvl1_vol
12490 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12491 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12492 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
12493 ; GCN2-NEXT: s_cbranch_execnz .LBB89_2
12494 ; GCN2-NEXT: ; %bb.3: ; %Flow
12495 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
12496 ; GCN2-NEXT: s_branch .LBB89_6
12497 ; GCN2-NEXT: .LBB89_4:
12498 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
12499 ; GCN2-NEXT: s_cbranch_execz .LBB89_6
12500 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
12501 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
12502 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
12503 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12504 ; GCN2-NEXT: s_add_i32 s0, s0, 4
12505 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
12506 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
12507 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
12508 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
12509 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
12510 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12511 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
12512 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12513 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12514 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
12515 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
12516 ; GCN2-NEXT: .LBB89_6: ; %atomicrmw.phi
12517 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
12518 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
12519 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
12520 ; GCN2-NEXT: s_endpgm
12522 ; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
12523 ; GCN3: ; %bb.0: ; %entry
12524 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
12525 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
12526 ; GCN3-NEXT: s_mov_b32 s18, -1
12527 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
12528 ; GCN3-NEXT: s_add_u32 s16, s16, s11
12529 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
12530 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
12531 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
12532 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
12533 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12534 ; GCN3-NEXT: s_add_u32 s0, s8, s0
12535 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
12536 ; GCN3-NEXT: s_add_u32 s0, s0, 32
12537 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
12538 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
12539 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
12540 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12541 ; GCN3-NEXT: s_cbranch_vccz .LBB89_4
12542 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
12543 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
12544 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12545 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12546 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
12547 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
12548 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
12549 ; GCN3-NEXT: .LBB89_2: ; %atomicrmw.start
12550 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
12551 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12552 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
12553 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
12554 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12555 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12556 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12557 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12558 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12559 ; GCN3-NEXT: buffer_wbinvl1_vol
12560 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12561 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12562 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
12563 ; GCN3-NEXT: s_cbranch_execnz .LBB89_2
12564 ; GCN3-NEXT: ; %bb.3: ; %Flow
12565 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
12566 ; GCN3-NEXT: s_branch .LBB89_6
12567 ; GCN3-NEXT: .LBB89_4:
12568 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
12569 ; GCN3-NEXT: s_cbranch_execz .LBB89_6
12570 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
12571 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
12572 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
12573 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12574 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
12575 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
12576 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
12577 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
12578 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12579 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
12580 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
12581 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
12582 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
12583 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
12584 ; GCN3-NEXT: .LBB89_6: ; %atomicrmw.phi
12585 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
12586 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
12587 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
12588 ; GCN3-NEXT: s_endpgm
12590 %ptr = getelementptr i64, ptr %out, i64 %index
12591 %gep = getelementptr i64, ptr %ptr, i64 4
12592 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst
12593 store i64 %tmp0, ptr %out2
; Test: seq_cst signed 64-bit `atomicrmw max` on a flat pointer in a kernel,
; where the returned value is not used.
;   %out   - base flat pointer. The accessed address is %out plus %index i64
;            elements, with no additional constant offset; the checks show
;            only the shift-left by 3 and the 64-bit add.
;   %in    - operand of the max.
;   %index - i64 element index applied to %out.
; For all three check prefixes the expected code compares the high dword of
; the address against a second value (a dword loaded via s[4:5] on GCN1/GCN2,
; presumably the private aperture base - confirm; src_private_base on GCN3)
; and then runs one of two paths:
;   %atomicrmw.global  - flat load followed by a flat_atomic_cmpswap_x2 loop
;                        that repeats until the returned value equals the
;                        value that was compared against.
;   %atomicrmw.private - buffer_load / v_cmp_lt_i64 / v_cndmask / buffer_store
;                        through the scratch resource descriptor; an all-zero
;                        address is first replaced by offset -1.
; Each path ends at an s_endpgm; nothing is stored besides the atomic update.
; The assertions below are autogenerated (see the note on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
12597 define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
12598 ; GCN1-LABEL: atomic_max_i64_addr64:
12599 ; GCN1: ; %bb.0: ; %entry
12600 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
12601 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
12602 ; GCN1-NEXT: s_mov_b32 s14, -1
12603 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
12604 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
12605 ; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f
12606 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
12607 ; GCN1-NEXT: s_add_u32 s12, s12, s11
12608 ; GCN1-NEXT: s_addc_u32 s13, s13, 0
12609 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
12610 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
12611 ; GCN1-NEXT: s_add_u32 s0, s0, s4
12612 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
12613 ; GCN1-NEXT: s_cmp_eq_u32 s1, s8
12614 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
12615 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12616 ; GCN1-NEXT: s_mov_b64 s[4:5], -1
12617 ; GCN1-NEXT: s_cbranch_vccnz .LBB90_3
12618 ; GCN1-NEXT: ; %bb.1: ; %Flow6
12619 ; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5]
12620 ; GCN1-NEXT: s_cbranch_vccnz .LBB90_6
12621 ; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
12622 ; GCN1-NEXT: s_endpgm
12623 ; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
12624 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
12625 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
12626 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12627 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
12628 ; GCN1-NEXT: v_mov_b32_e32 v6, s3
12629 ; GCN1-NEXT: v_mov_b32_e32 v7, s2
12630 ; GCN1-NEXT: .LBB90_4: ; %atomicrmw.start
12631 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
12632 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12633 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12634 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12635 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12636 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12637 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12638 ; GCN1-NEXT: buffer_wbinvl1_vol
12639 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12640 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
12641 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12642 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
12643 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
12644 ; GCN1-NEXT: s_cbranch_execnz .LBB90_4
12645 ; GCN1-NEXT: ; %bb.5: ; %Flow
12646 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
12647 ; GCN1-NEXT: s_branch .LBB90_2
12648 ; GCN1-NEXT: .LBB90_6: ; %atomicrmw.private
12649 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
12650 ; GCN1-NEXT: v_mov_b32_e32 v5, s2
12651 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
12652 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
12653 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12654 ; GCN1-NEXT: s_add_i32 s0, s0, 4
12655 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
12656 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
12657 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
12658 ; GCN1-NEXT: v_mov_b32_e32 v4, s3
12659 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12660 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12661 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
12662 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
12663 ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
12664 ; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
12665 ; GCN1-NEXT: s_endpgm
12667 ; GCN2-LABEL: atomic_max_i64_addr64:
12668 ; GCN2: ; %bb.0: ; %entry
12669 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
12670 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
12671 ; GCN2-NEXT: s_mov_b32 s90, -1
12672 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
12673 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
12674 ; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc
12675 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
12676 ; GCN2-NEXT: s_add_u32 s88, s88, s11
12677 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
12678 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
12679 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
12680 ; GCN2-NEXT: s_add_u32 s0, s0, s4
12681 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
12682 ; GCN2-NEXT: s_cmp_eq_u32 s1, s8
12683 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
12684 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12685 ; GCN2-NEXT: s_mov_b64 s[4:5], -1
12686 ; GCN2-NEXT: s_cbranch_vccnz .LBB90_3
12687 ; GCN2-NEXT: ; %bb.1: ; %Flow6
12688 ; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5]
12689 ; GCN2-NEXT: s_cbranch_vccnz .LBB90_6
12690 ; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
12691 ; GCN2-NEXT: s_endpgm
12692 ; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
12693 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
12694 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
12695 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12696 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
12697 ; GCN2-NEXT: v_mov_b32_e32 v6, s3
12698 ; GCN2-NEXT: v_mov_b32_e32 v7, s2
12699 ; GCN2-NEXT: .LBB90_4: ; %atomicrmw.start
12700 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
12701 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12702 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12703 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12704 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12705 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12706 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12707 ; GCN2-NEXT: buffer_wbinvl1_vol
12708 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12709 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
12710 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12711 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
12712 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
12713 ; GCN2-NEXT: s_cbranch_execnz .LBB90_4
12714 ; GCN2-NEXT: ; %bb.5: ; %Flow
12715 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
12716 ; GCN2-NEXT: s_branch .LBB90_2
12717 ; GCN2-NEXT: .LBB90_6: ; %atomicrmw.private
12718 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
12719 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
12720 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12721 ; GCN2-NEXT: s_add_i32 s0, s0, 4
12722 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
12723 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
12724 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
12725 ; GCN2-NEXT: v_mov_b32_e32 v5, s2
12726 ; GCN2-NEXT: v_mov_b32_e32 v4, s3
12727 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12728 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12729 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
12730 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
12731 ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
12732 ; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
12733 ; GCN2-NEXT: s_endpgm
12735 ; GCN3-LABEL: atomic_max_i64_addr64:
12736 ; GCN3: ; %bb.0: ; %entry
12737 ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
12738 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
12739 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
12740 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
12741 ; GCN3-NEXT: s_mov_b32 s14, -1
12742 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000
12743 ; GCN3-NEXT: s_add_u32 s12, s12, s11
12744 ; GCN3-NEXT: s_addc_u32 s13, s13, 0
12745 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
12746 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
12747 ; GCN3-NEXT: s_add_u32 s0, s0, s6
12748 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
12749 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
12750 ; GCN3-NEXT: s_cmp_eq_u32 s1, s5
12751 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0
12752 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5]
12753 ; GCN3-NEXT: s_mov_b64 s[4:5], -1
12754 ; GCN3-NEXT: s_cbranch_vccnz .LBB90_3
12755 ; GCN3-NEXT: ; %bb.1: ; %Flow6
12756 ; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5]
12757 ; GCN3-NEXT: s_cbranch_vccnz .LBB90_6
12758 ; GCN3-NEXT: .LBB90_2: ; %atomicrmw.phi
12759 ; GCN3-NEXT: s_endpgm
12760 ; GCN3-NEXT: .LBB90_3: ; %atomicrmw.global
12761 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
12762 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
12763 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
12764 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
12765 ; GCN3-NEXT: v_mov_b32_e32 v6, s3
12766 ; GCN3-NEXT: v_mov_b32_e32 v7, s2
12767 ; GCN3-NEXT: .LBB90_4: ; %atomicrmw.start
12768 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
12769 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12770 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
12771 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
12772 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
12773 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
12774 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12775 ; GCN3-NEXT: buffer_wbinvl1_vol
12776 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
12777 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
12778 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
12779 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
12780 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
12781 ; GCN3-NEXT: s_cbranch_execnz .LBB90_4
12782 ; GCN3-NEXT: ; %bb.5: ; %Flow
12783 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
12784 ; GCN3-NEXT: s_branch .LBB90_2
12785 ; GCN3-NEXT: .LBB90_6: ; %atomicrmw.private
12786 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
12787 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
12788 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12789 ; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
12790 ; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
12791 ; GCN3-NEXT: v_mov_b32_e32 v4, s2
12792 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
12793 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12794 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
12795 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
12796 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
12797 ; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
12798 ; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
12799 ; GCN3-NEXT: s_endpgm
12801 %ptr = getelementptr i64, ptr %out, i64 %index
12802 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
12806 define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
12807 ; GCN1-LABEL: atomic_max_i64_ret_addr64:
12808 ; GCN1: ; %bb.0: ; %entry
12809 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
12810 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
12811 ; GCN1-NEXT: s_mov_b32 s18, -1
12812 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
12813 ; GCN1-NEXT: s_add_u32 s16, s16, s11
12814 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
12815 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
12816 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
12817 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
12818 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12819 ; GCN1-NEXT: s_add_u32 s0, s8, s0
12820 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
12821 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
12822 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
12823 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12824 ; GCN1-NEXT: s_cbranch_vccz .LBB91_4
12825 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
12826 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
12827 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12828 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12829 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
12830 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
12831 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
12832 ; GCN1-NEXT: .LBB91_2: ; %atomicrmw.start
12833 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
12834 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12835 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
12836 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
12837 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12838 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12839 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12840 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12841 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12842 ; GCN1-NEXT: buffer_wbinvl1_vol
12843 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12844 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12845 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
12846 ; GCN1-NEXT: s_cbranch_execnz .LBB91_2
12847 ; GCN1-NEXT: ; %bb.3: ; %Flow
12848 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
12849 ; GCN1-NEXT: s_branch .LBB91_6
12850 ; GCN1-NEXT: .LBB91_4:
12851 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
12852 ; GCN1-NEXT: s_cbranch_execz .LBB91_6
12853 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
12854 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
12855 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
12856 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
12857 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
12858 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
12859 ; GCN1-NEXT: s_add_i32 s0, s0, 4
12860 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
12861 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
12862 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
12863 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
12864 ; GCN1-NEXT: s_waitcnt vmcnt(0)
12865 ; GCN1-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
12866 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12867 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12868 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
12869 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
12870 ; GCN1-NEXT: .LBB91_6: ; %atomicrmw.phi
12871 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
12872 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
12873 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
12874 ; GCN1-NEXT: s_endpgm
12876 ; GCN2-LABEL: atomic_max_i64_ret_addr64:
12877 ; GCN2: ; %bb.0: ; %entry
12878 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
12879 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
12880 ; GCN2-NEXT: s_mov_b32 s90, -1
12881 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
12882 ; GCN2-NEXT: s_add_u32 s88, s88, s11
12883 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
12884 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
12885 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
12886 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
12887 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12888 ; GCN2-NEXT: s_add_u32 s0, s8, s0
12889 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
12890 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
12891 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
12892 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12893 ; GCN2-NEXT: s_cbranch_vccz .LBB91_4
12894 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
12895 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
12896 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12897 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12898 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
12899 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
12900 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
12901 ; GCN2-NEXT: .LBB91_2: ; %atomicrmw.start
12902 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
12903 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12904 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
12905 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
12906 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12907 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12908 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12909 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12910 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12911 ; GCN2-NEXT: buffer_wbinvl1_vol
12912 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12913 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12914 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
12915 ; GCN2-NEXT: s_cbranch_execnz .LBB91_2
12916 ; GCN2-NEXT: ; %bb.3: ; %Flow
12917 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
12918 ; GCN2-NEXT: s_branch .LBB91_6
12919 ; GCN2-NEXT: .LBB91_4:
12920 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
12921 ; GCN2-NEXT: s_cbranch_execz .LBB91_6
12922 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
12923 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
12924 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
12925 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
12926 ; GCN2-NEXT: s_add_i32 s0, s0, 4
12927 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
12928 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
12929 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
12930 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
12931 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
12932 ; GCN2-NEXT: s_waitcnt vmcnt(0)
12933 ; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
12934 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
12935 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
12936 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
12937 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
12938 ; GCN2-NEXT: .LBB91_6: ; %atomicrmw.phi
12939 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
12940 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
12941 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
12942 ; GCN2-NEXT: s_endpgm
12944 ; GCN3-LABEL: atomic_max_i64_ret_addr64:
12945 ; GCN3: ; %bb.0: ; %entry
12946 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
12947 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
12948 ; GCN3-NEXT: s_mov_b32 s18, -1
12949 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
12950 ; GCN3-NEXT: s_add_u32 s16, s16, s11
12951 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
12952 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
12953 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
12954 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
12955 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
12956 ; GCN3-NEXT: s_add_u32 s0, s8, s0
12957 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
12958 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
12959 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
12960 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
12961 ; GCN3-NEXT: s_cbranch_vccz .LBB91_4
12962 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
12963 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
12964 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12965 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
12966 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
12967 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
12968 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
12969 ; GCN3-NEXT: .LBB91_2: ; %atomicrmw.start
12970 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
12971 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12972 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
12973 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
12974 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
12975 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
12976 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
12977 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
12978 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12979 ; GCN3-NEXT: buffer_wbinvl1_vol
12980 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
12981 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
12982 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
12983 ; GCN3-NEXT: s_cbranch_execnz .LBB91_2
12984 ; GCN3-NEXT: ; %bb.3: ; %Flow
12985 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
12986 ; GCN3-NEXT: s_branch .LBB91_6
12987 ; GCN3-NEXT: .LBB91_4:
12988 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
12989 ; GCN3-NEXT: s_cbranch_execz .LBB91_6
12990 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
12991 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
12992 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
12993 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
12994 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
12995 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
12996 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
12997 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
12998 ; GCN3-NEXT: s_waitcnt vmcnt(0)
12999 ; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
13000 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
13001 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
13002 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
13003 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
13004 ; GCN3-NEXT: .LBB91_6: ; %atomicrmw.phi
13005 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
13006 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
13007 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
13008 ; GCN3-NEXT: s_endpgm
13010 %ptr = getelementptr i64, ptr %out, i64 %index
13011 %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst
13012 store i64 %tmp0, ptr %out2
; Signed 64-bit `atomicrmw max` (seq_cst) through a flat pointer, result unused.
; The address is %out advanced by 4 i64 elements (the checks add 32 to the
; pointer), and the operation carries !amdgpu.no.remote.memory metadata.
; The expected code on all three targets compares the high address dword and
; then takes one of two paths:
;   %atomicrmw.global  - load the old value, then loop on
;                        flat_atomic_cmpswap_x2 until the returned value
;                        equals the expected one;
;   %atomicrmw.private - buffer load, v_cmp_gt_i64 plus v_cndmask select,
;                        buffer store.
; The bonaire and tonga runs fetch the compared value with s_load_dword from
; 0xe4; the gfx900 run reads src_private_base instead.
13016 define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
13017 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
13019 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13020 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13021 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13022 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
13023 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
13024 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13025 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13026 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13027 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13028 ; GCN1-NEXT: s_cbranch_execnz .LBB92_3
13029 ; GCN1-NEXT: ; %bb.1: ; %Flow3
13030 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13031 ; GCN1-NEXT: s_cbranch_execnz .LBB92_6
13032 ; GCN1-NEXT: .LBB92_2: ; %atomicrmw.phi
13033 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13034 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13035 ; GCN1-NEXT: .LBB92_3: ; %atomicrmw.global
13036 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
13037 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13038 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
13039 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
13040 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13041 ; GCN1-NEXT: .LBB92_4: ; %atomicrmw.start
13042 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13043 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13044 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
13045 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13046 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13047 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13048 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13049 ; GCN1-NEXT: buffer_wbinvl1_vol
13050 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13051 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
13052 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13053 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
13054 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13055 ; GCN1-NEXT: s_cbranch_execnz .LBB92_4
13056 ; GCN1-NEXT: ; %bb.5: ; %Flow
13057 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13058 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13059 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13060 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13061 ; GCN1-NEXT: s_cbranch_execz .LBB92_2
13062 ; GCN1-NEXT: .LBB92_6: ; %atomicrmw.private
13063 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13064 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13065 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
13066 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13067 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13068 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13069 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13070 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13071 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13072 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13073 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13074 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13075 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13076 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13078 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
13080 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13081 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13082 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13083 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
13084 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
13085 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13086 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13087 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13088 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13089 ; GCN2-NEXT: s_cbranch_execnz .LBB92_3
13090 ; GCN2-NEXT: ; %bb.1: ; %Flow3
13091 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13092 ; GCN2-NEXT: s_cbranch_execnz .LBB92_6
13093 ; GCN2-NEXT: .LBB92_2: ; %atomicrmw.phi
13094 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13095 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13096 ; GCN2-NEXT: .LBB92_3: ; %atomicrmw.global
13097 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
13098 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13099 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
13100 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
13101 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
13102 ; GCN2-NEXT: .LBB92_4: ; %atomicrmw.start
13103 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
13104 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13105 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
13106 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13107 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13108 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13109 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13110 ; GCN2-NEXT: buffer_wbinvl1_vol
13111 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13112 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
13113 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13114 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
13115 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
13116 ; GCN2-NEXT: s_cbranch_execnz .LBB92_4
13117 ; GCN2-NEXT: ; %bb.5: ; %Flow
13118 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
13119 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13120 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
13121 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13122 ; GCN2-NEXT: s_cbranch_execz .LBB92_2
13123 ; GCN2-NEXT: .LBB92_6: ; %atomicrmw.private
13124 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13125 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13126 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
13127 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13128 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13129 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13130 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13131 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13132 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13133 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13134 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13135 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13136 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13137 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13139 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
13141 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13142 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
13143 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13144 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
13145 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
13146 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
13147 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13148 ; GCN3-NEXT: s_cbranch_execnz .LBB92_3
13149 ; GCN3-NEXT: ; %bb.1: ; %Flow3
13150 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13151 ; GCN3-NEXT: s_cbranch_execnz .LBB92_6
13152 ; GCN3-NEXT: .LBB92_2: ; %atomicrmw.phi
13153 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13154 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13155 ; GCN3-NEXT: .LBB92_3: ; %atomicrmw.global
13156 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
13157 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
13158 ; GCN3-NEXT: .LBB92_4: ; %atomicrmw.start
13159 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
13160 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13161 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
13162 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13163 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13164 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13165 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13166 ; GCN3-NEXT: buffer_wbinvl1_vol
13167 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13168 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
13169 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13170 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
13171 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
13172 ; GCN3-NEXT: s_cbranch_execnz .LBB92_4
13173 ; GCN3-NEXT: ; %bb.5: ; %Flow
13174 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
13175 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
13176 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
13177 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13178 ; GCN3-NEXT: s_cbranch_execz .LBB92_2
13179 ; GCN3-NEXT: .LBB92_6: ; %atomicrmw.private
13180 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13181 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13182 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13183 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
13184 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13185 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13186 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13187 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13188 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13189 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
13190 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13191 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13192 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13193 %gep = getelementptr i64, ptr %out, i64 4
13194 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Signed 64-bit `atomicrmw max` (seq_cst) through a flat pointer at %out plus
; 4 i64 elements (the checks add 32 to the pointer), carrying
; !amdgpu.no.remote.memory metadata, in a function declared to return i64.
; Presumably %result is what gets returned (the ret instruction is not shown
; here) - TODO confirm.
; The checks keep the address in v[4:5] and leave the previously stored value
; in v[0:1] on both paths:
;   %atomicrmw.global  - flat_atomic_cmpswap_x2 loop whose returned data lands
;                        in v[0:1];
;   %atomicrmw.private - buffer loads into v0/v1, then the maximum selected
;                        into v2/v3 is stored back.
; The bonaire and tonga runs fetch the compared value with s_load_dword from
; 0xe4; the gfx900 run reads src_private_base instead.
13198 define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
13199 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
13201 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13202 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13203 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13204 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
13205 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13206 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13207 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
13208 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13209 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13210 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13211 ; GCN1-NEXT: s_cbranch_execnz .LBB93_3
13212 ; GCN1-NEXT: ; %bb.1: ; %Flow3
13213 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13214 ; GCN1-NEXT: s_cbranch_execnz .LBB93_6
13215 ; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi
13216 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13217 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13218 ; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global
13219 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
13220 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
13221 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
13222 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
13223 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13224 ; GCN1-NEXT: .LBB93_4: ; %atomicrmw.start
13225 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13226 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13227 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
13228 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
13229 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
13230 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
13231 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
13232 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
13233 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13234 ; GCN1-NEXT: buffer_wbinvl1_vol
13235 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
13236 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13237 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13238 ; GCN1-NEXT: s_cbranch_execnz .LBB93_4
13239 ; GCN1-NEXT: ; %bb.5: ; %Flow
13240 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13241 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
13242 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13243 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13244 ; GCN1-NEXT: s_cbranch_execz .LBB93_2
13245 ; GCN1-NEXT: .LBB93_6: ; %atomicrmw.private
13246 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
13247 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
13248 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
13249 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13250 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13251 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13252 ; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13253 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
13254 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
13255 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
13256 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
13257 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13258 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13259 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13261 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
13263 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13264 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13265 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13266 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
13267 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13268 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13269 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
13270 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13271 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13272 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13273 ; GCN2-NEXT: s_cbranch_execnz .LBB93_3
13274 ; GCN2-NEXT: ; %bb.1: ; %Flow3
13275 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13276 ; GCN2-NEXT: s_cbranch_execnz .LBB93_6
13277 ; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi
13278 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13279 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13280 ; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global
13281 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
13282 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
13283 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
13284 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
13285 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
13286 ; GCN2-NEXT: .LBB93_4: ; %atomicrmw.start
13287 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
13288 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13289 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
13290 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
13291 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
13292 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
13293 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
13294 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
13295 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13296 ; GCN2-NEXT: buffer_wbinvl1_vol
13297 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
13298 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13299 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
13300 ; GCN2-NEXT: s_cbranch_execnz .LBB93_4
13301 ; GCN2-NEXT: ; %bb.5: ; %Flow
13302 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
13303 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
13304 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
13305 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13306 ; GCN2-NEXT: s_cbranch_execz .LBB93_2
13307 ; GCN2-NEXT: .LBB93_6: ; %atomicrmw.private
13308 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
13309 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
13310 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
13311 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13312 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13313 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13314 ; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13315 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
13316 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
13317 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
13318 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
13319 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13320 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13321 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13323 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
13325 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13326 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
13327 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
13328 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
13329 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
13330 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
13331 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
13332 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13333 ; GCN3-NEXT: s_cbranch_execnz .LBB93_3
13334 ; GCN3-NEXT: ; %bb.1: ; %Flow3
13335 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13336 ; GCN3-NEXT: s_cbranch_execnz .LBB93_6
13337 ; GCN3-NEXT: .LBB93_2: ; %atomicrmw.phi
13338 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13339 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13340 ; GCN3-NEXT: .LBB93_3: ; %atomicrmw.global
13341 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
13342 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
13343 ; GCN3-NEXT: .LBB93_4: ; %atomicrmw.start
13344 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
13345 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13346 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
13347 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
13348 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
13349 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
13350 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
13351 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
13352 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13353 ; GCN3-NEXT: buffer_wbinvl1_vol
13354 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
13355 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13356 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
13357 ; GCN3-NEXT: s_cbranch_execnz .LBB93_4
13358 ; GCN3-NEXT: ; %bb.5: ; %Flow
13359 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
13360 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
13361 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
13362 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13363 ; GCN3-NEXT: s_cbranch_execz .LBB93_2
13364 ; GCN3-NEXT: .LBB93_6: ; %atomicrmw.private
13365 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
13366 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
13367 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13368 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
13369 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13370 ; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
13371 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
13372 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
13373 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
13374 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
13375 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13376 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13377 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13378 %gep = getelementptr i64, ptr %out, i64 4
13379 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
13383 ; ---------------------------------------------------------------------
13385 ; ---------------------------------------------------------------------
; Unsigned 64-bit `atomicrmw umax` (seq_cst) applied directly to the flat
; pointer %ptr (no getelementptr offset and no metadata on the instruction),
; result unused.
; The expected code on all three targets compares the high address dword and
; then takes one of two paths:
;   %atomicrmw.global  - load the old value, then loop on
;                        flat_atomic_cmpswap_x2 until the returned value
;                        equals the expected one;
;   %atomicrmw.private - buffer load, v_cmp_gt_u64 plus v_cndmask select,
;                        buffer store.
; The comparisons are the unsigned v_cmp_gt_u64 form on every target.
; The bonaire and tonga runs fetch the compared value with s_load_dword from
; 0xe4; the gfx900 run reads src_private_base instead.
13387 define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
13388 ; GCN1-LABEL: flat_atomic_umax_i64_noret:
13390 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13391 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13392 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13393 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13394 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13395 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13396 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13397 ; GCN1-NEXT: s_cbranch_execnz .LBB94_3
13398 ; GCN1-NEXT: ; %bb.1: ; %Flow3
13399 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13400 ; GCN1-NEXT: s_cbranch_execnz .LBB94_6
13401 ; GCN1-NEXT: .LBB94_2: ; %atomicrmw.phi
13402 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13403 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13404 ; GCN1-NEXT: .LBB94_3: ; %atomicrmw.global
13405 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
13406 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13407 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
13408 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
13409 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13410 ; GCN1-NEXT: .LBB94_4: ; %atomicrmw.start
13411 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13412 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13413 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13414 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13415 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13416 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13417 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13418 ; GCN1-NEXT: buffer_wbinvl1_vol
13419 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13420 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
13421 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13422 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
13423 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13424 ; GCN1-NEXT: s_cbranch_execnz .LBB94_4
13425 ; GCN1-NEXT: ; %bb.5: ; %Flow
13426 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13427 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13428 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13429 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13430 ; GCN1-NEXT: s_cbranch_execz .LBB94_2
13431 ; GCN1-NEXT: .LBB94_6: ; %atomicrmw.private
13432 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13433 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13434 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
13435 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13436 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13437 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13438 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13439 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13440 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13441 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13442 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13443 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13444 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13445 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13447 ; GCN2-LABEL: flat_atomic_umax_i64_noret:
13449 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13450 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13451 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13452 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13453 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13454 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13455 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13456 ; GCN2-NEXT: s_cbranch_execnz .LBB94_3
13457 ; GCN2-NEXT: ; %bb.1: ; %Flow3
13458 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13459 ; GCN2-NEXT: s_cbranch_execnz .LBB94_6
13460 ; GCN2-NEXT: .LBB94_2: ; %atomicrmw.phi
13461 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13462 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13463 ; GCN2-NEXT: .LBB94_3: ; %atomicrmw.global
13464 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
13465 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13466 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
13467 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
13468 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
13469 ; GCN2-NEXT: .LBB94_4: ; %atomicrmw.start
13470 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
13471 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13472 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13473 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13474 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13475 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13476 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13477 ; GCN2-NEXT: buffer_wbinvl1_vol
13478 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13479 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
13480 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13481 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
13482 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
13483 ; GCN2-NEXT: s_cbranch_execnz .LBB94_4
13484 ; GCN2-NEXT: ; %bb.5: ; %Flow
13485 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
13486 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13487 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
13488 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13489 ; GCN2-NEXT: s_cbranch_execz .LBB94_2
13490 ; GCN2-NEXT: .LBB94_6: ; %atomicrmw.private
13491 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13492 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13493 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
13494 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13495 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13496 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13497 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13498 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13499 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13500 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13501 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13502 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13503 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13504 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13506 ; GCN3-LABEL: flat_atomic_umax_i64_noret:
13508 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13509 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
13510 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
13511 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
13512 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13513 ; GCN3-NEXT: s_cbranch_execnz .LBB94_3
13514 ; GCN3-NEXT: ; %bb.1: ; %Flow3
13515 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13516 ; GCN3-NEXT: s_cbranch_execnz .LBB94_6
13517 ; GCN3-NEXT: .LBB94_2: ; %atomicrmw.phi
13518 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13519 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13520 ; GCN3-NEXT: .LBB94_3: ; %atomicrmw.global
13521 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
13522 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
13523 ; GCN3-NEXT: .LBB94_4: ; %atomicrmw.start
13524 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
13525 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13526 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13527 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13528 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13529 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13530 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13531 ; GCN3-NEXT: buffer_wbinvl1_vol
13532 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13533 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
13534 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13535 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
13536 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
13537 ; GCN3-NEXT: s_cbranch_execnz .LBB94_4
13538 ; GCN3-NEXT: ; %bb.5: ; %Flow
13539 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
13540 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
13541 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
13542 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13543 ; GCN3-NEXT: s_cbranch_execz .LBB94_2
13544 ; GCN3-NEXT: .LBB94_6: ; %atomicrmw.private
13545 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13546 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13547 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13548 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
13549 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13550 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13551 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13552 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13553 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13554 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
13555 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13556 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13557 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13558 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
13562 define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
13563 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset:
13565 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13566 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13567 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13568 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
13569 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
13570 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13571 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13572 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13573 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13574 ; GCN1-NEXT: s_cbranch_execnz .LBB95_3
13575 ; GCN1-NEXT: ; %bb.1: ; %Flow3
13576 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13577 ; GCN1-NEXT: s_cbranch_execnz .LBB95_6
13578 ; GCN1-NEXT: .LBB95_2: ; %atomicrmw.phi
13579 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13580 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13581 ; GCN1-NEXT: .LBB95_3: ; %atomicrmw.global
13582 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
13583 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13584 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
13585 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
13586 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13587 ; GCN1-NEXT: .LBB95_4: ; %atomicrmw.start
13588 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13589 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13590 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13591 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13592 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13593 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13594 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13595 ; GCN1-NEXT: buffer_wbinvl1_vol
13596 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13597 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
13598 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13599 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
13600 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13601 ; GCN1-NEXT: s_cbranch_execnz .LBB95_4
13602 ; GCN1-NEXT: ; %bb.5: ; %Flow
13603 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13604 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13605 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13606 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13607 ; GCN1-NEXT: s_cbranch_execz .LBB95_2
13608 ; GCN1-NEXT: .LBB95_6: ; %atomicrmw.private
13609 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13610 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13611 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
13612 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13613 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13614 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13615 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13616 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13617 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13618 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13619 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13620 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13621 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13622 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13624 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset:
13626 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13627 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13628 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13629 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
13630 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
13631 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13632 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13633 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13634 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13635 ; GCN2-NEXT: s_cbranch_execnz .LBB95_3
13636 ; GCN2-NEXT: ; %bb.1: ; %Flow3
13637 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13638 ; GCN2-NEXT: s_cbranch_execnz .LBB95_6
13639 ; GCN2-NEXT: .LBB95_2: ; %atomicrmw.phi
13640 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13641 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13642 ; GCN2-NEXT: .LBB95_3: ; %atomicrmw.global
13643 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
13644 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13645 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
13646 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
13647 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
13648 ; GCN2-NEXT: .LBB95_4: ; %atomicrmw.start
13649 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
13650 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13651 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13652 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13653 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13654 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13655 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13656 ; GCN2-NEXT: buffer_wbinvl1_vol
13657 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13658 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
13659 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13660 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
13661 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
13662 ; GCN2-NEXT: s_cbranch_execnz .LBB95_4
13663 ; GCN2-NEXT: ; %bb.5: ; %Flow
13664 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
13665 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13666 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
13667 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13668 ; GCN2-NEXT: s_cbranch_execz .LBB95_2
13669 ; GCN2-NEXT: .LBB95_6: ; %atomicrmw.private
13670 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13671 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13672 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
13673 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13674 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13675 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13676 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13677 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13678 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13679 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13680 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
13681 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13682 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13683 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13685 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset:
13687 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13688 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
13689 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13690 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
13691 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
13692 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
13693 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13694 ; GCN3-NEXT: s_cbranch_execnz .LBB95_3
13695 ; GCN3-NEXT: ; %bb.1: ; %Flow3
13696 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13697 ; GCN3-NEXT: s_cbranch_execnz .LBB95_6
13698 ; GCN3-NEXT: .LBB95_2: ; %atomicrmw.phi
13699 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13700 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13701 ; GCN3-NEXT: .LBB95_3: ; %atomicrmw.global
13702 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
13703 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
13704 ; GCN3-NEXT: .LBB95_4: ; %atomicrmw.start
13705 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
13706 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13707 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13708 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13709 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13710 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13711 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13712 ; GCN3-NEXT: buffer_wbinvl1_vol
13713 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13714 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
13715 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13716 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
13717 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
13718 ; GCN3-NEXT: s_cbranch_execnz .LBB95_4
13719 ; GCN3-NEXT: ; %bb.5: ; %Flow
13720 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
13721 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
13722 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
13723 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13724 ; GCN3-NEXT: s_cbranch_execz .LBB95_2
13725 ; GCN3-NEXT: .LBB95_6: ; %atomicrmw.private
13726 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13727 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
13728 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13729 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
13730 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13731 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13732 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
13733 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
13734 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
13735 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
13736 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13737 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13738 ; GCN3-NEXT: s_setpc_b64 s[30:31]
13739 %gep = getelementptr i64, ptr %out, i64 4
13740 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: 64-bit seq_cst `atomicrmw umax` on the flat pointer %ptr, in a function
; declared to return i64.
; For every run line the expected code compares the high dword of the address
; (v1) against a value read from 0xe4 (bonaire, tonga) or taken from
; src_private_base (gfx900) and then executes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start)
;   %atomicrmw.private - buffer_load_dword / v_cndmask / buffer_store_dword
; Both paths meet at %atomicrmw.phi, where v[4:5] is copied into v[0:1].
; The assertions below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them instead of editing by hand.
13744 define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
13745 ; GCN1-LABEL: flat_atomic_umax_i64_ret:
13747 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13748 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13749 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13750 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
13751 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13752 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13753 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13754 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13755 ; GCN1-NEXT: s_cbranch_execz .LBB96_4
13756 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
13757 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
13758 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13759 ; GCN1-NEXT: flat_load_dword v5, v[4:5]
13760 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
13761 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13762 ; GCN1-NEXT: .LBB96_2: ; %atomicrmw.start
13763 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13764 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13765 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
13766 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
13767 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13768 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13769 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13770 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13771 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13772 ; GCN1-NEXT: buffer_wbinvl1_vol
13773 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13774 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13775 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13776 ; GCN1-NEXT: s_cbranch_execnz .LBB96_2
13777 ; GCN1-NEXT: ; %bb.3: ; %Flow
13778 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13779 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13780 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13781 ; GCN1-NEXT: .LBB96_4: ; %Flow3
13782 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13783 ; GCN1-NEXT: s_cbranch_execz .LBB96_6
13784 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
13785 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13786 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
13787 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
13788 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
13789 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
13790 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13791 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3]
13792 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
13793 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
13794 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
13795 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
13796 ; GCN1-NEXT: .LBB96_6: ; %atomicrmw.phi
13797 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13798 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
13799 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
13800 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13801 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13803 ; GCN2-LABEL: flat_atomic_umax_i64_ret:
13805 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13806 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13807 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13808 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
13809 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13810 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
13811 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13812 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13813 ; GCN2-NEXT: s_cbranch_execz .LBB96_4
13814 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
13815 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
13816 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13817 ; GCN2-NEXT: flat_load_dword v5, v[4:5]
13818 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
13819 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
13820 ; GCN2-NEXT: .LBB96_2: ; %atomicrmw.start
13821 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
13822 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13823 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
13824 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
13825 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13826 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13827 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13828 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13829 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13830 ; GCN2-NEXT: buffer_wbinvl1_vol
13831 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13832 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13833 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
13834 ; GCN2-NEXT: s_cbranch_execnz .LBB96_2
13835 ; GCN2-NEXT: ; %bb.3: ; %Flow
13836 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
13837 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13838 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
13839 ; GCN2-NEXT: .LBB96_4: ; %Flow3
13840 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13841 ; GCN2-NEXT: s_cbranch_execz .LBB96_6
13842 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
13843 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13844 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
13845 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
13846 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
13847 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
13848 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13849 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3]
13850 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
13851 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
13852 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
13853 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
13854 ; GCN2-NEXT: .LBB96_6: ; %atomicrmw.phi
13855 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13856 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
13857 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
13858 ; GCN2-NEXT: s_waitcnt vmcnt(0)
13859 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13861 ; GCN3-LABEL: flat_atomic_umax_i64_ret:
13863 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13864 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
13865 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
13866 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
13867 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
13868 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13869 ; GCN3-NEXT: s_cbranch_execz .LBB96_4
13870 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
13871 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
13872 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
13873 ; GCN3-NEXT: .LBB96_2: ; %atomicrmw.start
13874 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
13875 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13876 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
13877 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
13878 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
13879 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
13880 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
13881 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
13882 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13883 ; GCN3-NEXT: buffer_wbinvl1_vol
13884 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
13885 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13886 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
13887 ; GCN3-NEXT: s_cbranch_execnz .LBB96_2
13888 ; GCN3-NEXT: ; %bb.3: ; %Flow
13889 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
13890 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
13891 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
13892 ; GCN3-NEXT: .LBB96_4: ; %Flow3
13893 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13894 ; GCN3-NEXT: s_cbranch_execz .LBB96_6
13895 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
13896 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
13897 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
13898 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
13899 ; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
13900 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13901 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[2:3]
13902 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
13903 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
13904 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
13905 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
13906 ; GCN3-NEXT: .LBB96_6: ; %atomicrmw.phi
13907 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
13908 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
13909 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
13910 ; GCN3-NEXT: s_waitcnt vmcnt(0)
13911 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single atomicrmw applied directly to %ptr (no offset).
13912 %result = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Test: 64-bit seq_cst `atomicrmw umax` on %out advanced by four i64 elements
; (getelementptr index 4), in a function declared to return i64.
; For every run line the expected code first adds 32 to the incoming address,
; placing the sum in v[4:5], and compares its high dword (v5) against a value
; read from 0xe4 (bonaire, tonga) or taken from src_private_base (gfx900).
; It then executes one of two paths:
;   %atomicrmw.global  - a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start)
;                        whose result register pair is v[0:1]
;   %atomicrmw.private - buffer_load_dword into v0/v1, v_cndmask into v2/v3,
;                        then buffer_store_dword of v2/v3
; The assertions below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them instead of editing by hand.
13916 define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
13917 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset:
13919 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13920 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
13921 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
13922 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
13923 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13924 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
13925 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
13926 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
13927 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
13928 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13929 ; GCN1-NEXT: s_cbranch_execnz .LBB97_3
13930 ; GCN1-NEXT: ; %bb.1: ; %Flow3
13931 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13932 ; GCN1-NEXT: s_cbranch_execnz .LBB97_6
13933 ; GCN1-NEXT: .LBB97_2: ; %atomicrmw.phi
13934 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13935 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13936 ; GCN1-NEXT: .LBB97_3: ; %atomicrmw.global
13937 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
13938 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
13939 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
13940 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
13941 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
13942 ; GCN1-NEXT: .LBB97_4: ; %atomicrmw.start
13943 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
13944 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13945 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
13946 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
13947 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
13948 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
13949 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
13950 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
13951 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13952 ; GCN1-NEXT: buffer_wbinvl1_vol
13953 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
13954 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
13955 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
13956 ; GCN1-NEXT: s_cbranch_execnz .LBB97_4
13957 ; GCN1-NEXT: ; %bb.5: ; %Flow
13958 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
13959 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
13960 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
13961 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13962 ; GCN1-NEXT: s_cbranch_execz .LBB97_2
13963 ; GCN1-NEXT: .LBB97_6: ; %atomicrmw.private
13964 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
13965 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
13966 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
13967 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
13968 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
13969 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13970 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
13971 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
13972 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
13973 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
13974 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
13975 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
13976 ; GCN1-NEXT: s_waitcnt vmcnt(0)
13977 ; GCN1-NEXT: s_setpc_b64 s[30:31]
13979 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset:
13981 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13982 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
13983 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
13984 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
13985 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
13986 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
13987 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
13988 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
13989 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
13990 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
13991 ; GCN2-NEXT: s_cbranch_execnz .LBB97_3
13992 ; GCN2-NEXT: ; %bb.1: ; %Flow3
13993 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
13994 ; GCN2-NEXT: s_cbranch_execnz .LBB97_6
13995 ; GCN2-NEXT: .LBB97_2: ; %atomicrmw.phi
13996 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
13997 ; GCN2-NEXT: s_setpc_b64 s[30:31]
13998 ; GCN2-NEXT: .LBB97_3: ; %atomicrmw.global
13999 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
14000 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
14001 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
14002 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
14003 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
14004 ; GCN2-NEXT: .LBB97_4: ; %atomicrmw.start
14005 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14006 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14007 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
14008 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
14009 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
14010 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
14011 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
14012 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
14013 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14014 ; GCN2-NEXT: buffer_wbinvl1_vol
14015 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14016 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
14017 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
14018 ; GCN2-NEXT: s_cbranch_execnz .LBB97_4
14019 ; GCN2-NEXT: ; %bb.5: ; %Flow
14020 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
14021 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
14022 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
14023 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
14024 ; GCN2-NEXT: s_cbranch_execz .LBB97_2
14025 ; GCN2-NEXT: .LBB97_6: ; %atomicrmw.private
14026 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
14027 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
14028 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
14029 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
14030 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
14031 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14032 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
14033 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
14034 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
14035 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
14036 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
14037 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
14038 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14039 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14041 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset:
14043 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14044 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
14045 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
14046 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
14047 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
14048 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
14049 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
14050 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
14051 ; GCN3-NEXT: s_cbranch_execnz .LBB97_3
14052 ; GCN3-NEXT: ; %bb.1: ; %Flow3
14053 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
14054 ; GCN3-NEXT: s_cbranch_execnz .LBB97_6
14055 ; GCN3-NEXT: .LBB97_2: ; %atomicrmw.phi
14056 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
14057 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14058 ; GCN3-NEXT: .LBB97_3: ; %atomicrmw.global
14059 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
14060 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
14061 ; GCN3-NEXT: .LBB97_4: ; %atomicrmw.start
14062 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
14063 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14064 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
14065 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
14066 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
14067 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
14068 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
14069 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
14070 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14071 ; GCN3-NEXT: buffer_wbinvl1_vol
14072 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14073 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
14074 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
14075 ; GCN3-NEXT: s_cbranch_execnz .LBB97_4
14076 ; GCN3-NEXT: ; %bb.5: ; %Flow
14077 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
14078 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
14079 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
14080 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
14081 ; GCN3-NEXT: s_cbranch_execz .LBB97_2
14082 ; GCN3-NEXT: .LBB97_6: ; %atomicrmw.private
14083 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
14084 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
14085 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
14086 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
14087 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14088 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
14089 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
14090 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
14091 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
14092 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
14093 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
14094 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14095 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of i64 elements corresponds to the +32 adjustment
; seen in the expected code above.
14096 %gep = getelementptr i64, ptr %out, i64 4
14097 %result = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: 64-bit seq_cst `atomicrmw umax` in an amdgpu_gfx function returning
; void, with both the flat pointer and the operand marked `inreg`.
; In the expected code the address is taken from s[4:5] and the operand from
; s[6:7]. The high dword of the address (s5) is compared with s_cmp_eq_u32
; against a value read from 0xe4 (bonaire, tonga) or taken from
; src_private_base (gfx900), and the choice between the two paths is made with
; s_cbranch_vccnz:
;   %atomicrmw.global  - copies the address to v[4:5] and runs a
;                        flat_atomic_cmpswap_x2 retry loop (%atomicrmw.start)
;   %atomicrmw.private - buffer_load_dword / v_cndmask / buffer_store_dword
; The assertions below are autogenerated by utils/update_llc_test_checks.py
; (see the file header); regenerate them instead of editing by hand.
14101 define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
14102 ; GCN1-LABEL: flat_atomic_umax_i64_noret_scalar:
14104 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14105 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
14106 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
14107 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
14108 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
14109 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
14110 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14111 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
14112 ; GCN1-NEXT: s_cbranch_vccnz .LBB98_3
14113 ; GCN1-NEXT: ; %bb.1: ; %Flow3
14114 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
14115 ; GCN1-NEXT: s_cbranch_vccnz .LBB98_6
14116 ; GCN1-NEXT: .LBB98_2: ; %atomicrmw.phi
14117 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14118 ; GCN1-NEXT: .LBB98_3: ; %atomicrmw.global
14119 ; GCN1-NEXT: s_add_u32 s34, s4, 4
14120 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
14121 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
14122 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
14123 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
14124 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
14125 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
14126 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
14127 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
14128 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
14129 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
14130 ; GCN1-NEXT: .LBB98_4: ; %atomicrmw.start
14131 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
14132 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14133 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14134 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14135 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14136 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14137 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14138 ; GCN1-NEXT: buffer_wbinvl1_vol
14139 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14140 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
14141 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14142 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
14143 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
14144 ; GCN1-NEXT: s_cbranch_execnz .LBB98_4
14145 ; GCN1-NEXT: ; %bb.5: ; %Flow
14146 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
14147 ; GCN1-NEXT: s_branch .LBB98_2
14148 ; GCN1-NEXT: .LBB98_6: ; %atomicrmw.private
14149 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
14150 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14151 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
14152 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
14153 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
14154 ; GCN1-NEXT: s_add_i32 s34, s34, 4
14155 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
14156 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14157 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14158 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14159 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14160 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14161 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
14162 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
14163 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14164 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
14165 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14166 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14168 ; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar:
14170 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14171 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
14172 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
14173 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
14174 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
14175 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
14176 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14177 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
14178 ; GCN2-NEXT: s_cbranch_vccnz .LBB98_3
14179 ; GCN2-NEXT: ; %bb.1: ; %Flow3
14180 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
14181 ; GCN2-NEXT: s_cbranch_vccnz .LBB98_6
14182 ; GCN2-NEXT: .LBB98_2: ; %atomicrmw.phi
14183 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14184 ; GCN2-NEXT: .LBB98_3: ; %atomicrmw.global
14185 ; GCN2-NEXT: s_add_u32 s34, s4, 4
14186 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
14187 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
14188 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
14189 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
14190 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
14191 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
14192 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
14193 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
14194 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
14195 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
14196 ; GCN2-NEXT: .LBB98_4: ; %atomicrmw.start
14197 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14198 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14199 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14200 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14201 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14202 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14203 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14204 ; GCN2-NEXT: buffer_wbinvl1_vol
14205 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14206 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
14207 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14208 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
14209 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
14210 ; GCN2-NEXT: s_cbranch_execnz .LBB98_4
14211 ; GCN2-NEXT: ; %bb.5: ; %Flow
14212 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
14213 ; GCN2-NEXT: s_branch .LBB98_2
14214 ; GCN2-NEXT: .LBB98_6: ; %atomicrmw.private
14215 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
14216 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
14217 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
14218 ; GCN2-NEXT: s_add_i32 s34, s34, 4
14219 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
14220 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14221 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14222 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14223 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14224 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14225 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14226 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
14227 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
14228 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14229 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
14230 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14231 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14233 ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar:
14235 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14236 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
14237 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
14238 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
14239 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14240 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
14241 ; GCN3-NEXT: s_cbranch_vccnz .LBB98_3
14242 ; GCN3-NEXT: ; %bb.1: ; %Flow3
14243 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
14244 ; GCN3-NEXT: s_cbranch_vccnz .LBB98_6
14245 ; GCN3-NEXT: .LBB98_2: ; %atomicrmw.phi
14246 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14247 ; GCN3-NEXT: .LBB98_3: ; %atomicrmw.global
14248 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
14249 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
14250 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
14251 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
14252 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
14253 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
14254 ; GCN3-NEXT: .LBB98_4: ; %atomicrmw.start
14255 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
14256 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14257 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14258 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14259 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14260 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14261 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14262 ; GCN3-NEXT: buffer_wbinvl1_vol
14263 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14264 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
14265 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14266 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
14267 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
14268 ; GCN3-NEXT: s_cbranch_execnz .LBB98_4
14269 ; GCN3-NEXT: ; %bb.5: ; %Flow
14270 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
14271 ; GCN3-NEXT: s_branch .LBB98_2
14272 ; GCN3-NEXT: .LBB98_6: ; %atomicrmw.private
14273 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
14274 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
14275 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
14276 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14277 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
14278 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
14279 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
14280 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14281 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14282 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
14283 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
14284 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14285 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
14286 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14287 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: a single atomicrmw applied directly to %ptr (no offset).
14288 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 through a flat pointer, with the
; result unused (function returns void). Pointer and operand are both passed
; `inreg` under the amdgpu_gfx calling convention, and the access is made at
; i64 element index 4 of %out (the checks add 32 to the incoming address).
;
; What the expected code below shows for all three targets:
;   * the high dword of the address is compared against a reference value
;     (fetched with s_load_dword on the bonaire/tonga runs, taken from
;     src_private_base on the gfx900 run) and the result selects between the
;     %atomicrmw.global and %atomicrmw.private blocks;
;   * %atomicrmw.global is a compare-and-swap loop (%atomicrmw.start) built
;     from v_cmp_lt_u64 + v_cndmask + flat_atomic_cmpswap_x2;
;   * %atomicrmw.private is a plain buffer load / compare-select / buffer
;     store of the two dwords.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14292 define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
14293 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
14295 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14296 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
14297 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
14298 ; GCN1-NEXT: s_add_u32 s34, s4, 32
14299 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
14300 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
14301 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
14302 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
14303 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14304 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
14305 ; GCN1-NEXT: s_cbranch_vccnz .LBB99_3
14306 ; GCN1-NEXT: ; %bb.1: ; %Flow3
14307 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
14308 ; GCN1-NEXT: s_cbranch_vccnz .LBB99_6
14309 ; GCN1-NEXT: .LBB99_2: ; %atomicrmw.phi
14310 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14311 ; GCN1-NEXT: .LBB99_3: ; %atomicrmw.global
14312 ; GCN1-NEXT: s_add_u32 s36, s34, 4
14313 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
14314 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
14315 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
14316 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
14317 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
14318 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
14319 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
14320 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
14321 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
14322 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
14323 ; GCN1-NEXT: .LBB99_4: ; %atomicrmw.start
14324 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
14325 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14326 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14327 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14328 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14329 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14330 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14331 ; GCN1-NEXT: buffer_wbinvl1_vol
14332 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14333 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
14334 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14335 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
14336 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
14337 ; GCN1-NEXT: s_cbranch_execnz .LBB99_4
14338 ; GCN1-NEXT: ; %bb.5: ; %Flow
14339 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
14340 ; GCN1-NEXT: s_branch .LBB99_2
14341 ; GCN1-NEXT: .LBB99_6: ; %atomicrmw.private
14342 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
14343 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14344 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
14345 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
14346 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
14347 ; GCN1-NEXT: s_add_i32 s34, s34, 4
14348 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
14349 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14350 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14351 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14352 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14353 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14354 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
14355 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
14356 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14357 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
14358 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14359 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14361 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
14363 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14364 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
14365 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
14366 ; GCN2-NEXT: s_add_u32 s34, s4, 32
14367 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
14368 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
14369 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
14370 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
14371 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14372 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
14373 ; GCN2-NEXT: s_cbranch_vccnz .LBB99_3
14374 ; GCN2-NEXT: ; %bb.1: ; %Flow3
14375 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
14376 ; GCN2-NEXT: s_cbranch_vccnz .LBB99_6
14377 ; GCN2-NEXT: .LBB99_2: ; %atomicrmw.phi
14378 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14379 ; GCN2-NEXT: .LBB99_3: ; %atomicrmw.global
14380 ; GCN2-NEXT: s_add_u32 s36, s34, 4
14381 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
14382 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
14383 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
14384 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
14385 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
14386 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
14387 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
14388 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
14389 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
14390 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
14391 ; GCN2-NEXT: .LBB99_4: ; %atomicrmw.start
14392 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14393 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14394 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14395 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14396 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14397 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14398 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14399 ; GCN2-NEXT: buffer_wbinvl1_vol
14400 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14401 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
14402 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14403 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
14404 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
14405 ; GCN2-NEXT: s_cbranch_execnz .LBB99_4
14406 ; GCN2-NEXT: ; %bb.5: ; %Flow
14407 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
14408 ; GCN2-NEXT: s_branch .LBB99_2
14409 ; GCN2-NEXT: .LBB99_6: ; %atomicrmw.private
14410 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
14411 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
14412 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
14413 ; GCN2-NEXT: s_add_i32 s34, s34, 4
14414 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
14415 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14416 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14417 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14418 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14419 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14420 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14421 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
14422 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
14423 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14424 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
14425 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14426 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14428 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
14430 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14431 ; GCN3-NEXT: s_add_u32 s34, s4, 32
14432 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
14433 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
14434 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
14435 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
14436 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14437 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
14438 ; GCN3-NEXT: s_cbranch_vccnz .LBB99_3
14439 ; GCN3-NEXT: ; %bb.1: ; %Flow3
14440 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
14441 ; GCN3-NEXT: s_cbranch_vccnz .LBB99_6
14442 ; GCN3-NEXT: .LBB99_2: ; %atomicrmw.phi
14443 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14444 ; GCN3-NEXT: .LBB99_3: ; %atomicrmw.global
14445 ; GCN3-NEXT: v_mov_b32_e32 v4, s34
14446 ; GCN3-NEXT: v_mov_b32_e32 v5, s35
14447 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
14448 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
14449 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
14450 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
14451 ; GCN3-NEXT: .LBB99_4: ; %atomicrmw.start
14452 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
14453 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14454 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
14455 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14456 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14457 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14458 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14459 ; GCN3-NEXT: buffer_wbinvl1_vol
14460 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14461 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
14462 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14463 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
14464 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
14465 ; GCN3-NEXT: s_cbranch_execnz .LBB99_4
14466 ; GCN3-NEXT: ; %bb.5: ; %Flow
14467 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
14468 ; GCN3-NEXT: s_branch .LBB99_2
14469 ; GCN3-NEXT: .LBB99_6: ; %atomicrmw.private
14470 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
14471 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
14472 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
14473 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14474 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
14475 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
14476 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
14477 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14478 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14479 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
14480 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
14481 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
14482 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
14483 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14484 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14485 %gep = getelementptr i64, ptr %out, i64 4
14486 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 through a flat pointer, in a
; function declared to return i64; the atomic's result is bound to %result.
; Pointer and operand are both passed `inreg` under the amdgpu_gfx calling
; convention, and the access is made directly at %ptr (no offset).
;
; What the expected code below shows for all three targets:
;   * the high dword of the address (s5) is compared against a reference
;     value (fetched with s_load_dword on the bonaire/tonga runs, taken from
;     src_private_base on the gfx900 run) to pick the %atomicrmw.global or
;     the %atomicrmw.private block;
;   * %atomicrmw.global is a compare-and-swap loop (%atomicrmw.start) built
;     from v_cmp_lt_u64 + v_cndmask + flat_atomic_cmpswap_x2, with the
;     cmpswap destination in v[0:1];
;   * %atomicrmw.private loads the two dwords into v0/v1, selects into
;     separate registers, and stores those back, leaving v0/v1 as loaded.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14490 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
14491 ; GCN1-LABEL: flat_atomic_umax_i64_ret_scalar:
14493 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14494 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
14495 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
14496 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
14497 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
14498 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
14499 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14500 ; GCN1-NEXT: s_cbranch_vccz .LBB100_4
14501 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
14502 ; GCN1-NEXT: s_add_u32 s34, s4, 4
14503 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
14504 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
14505 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
14506 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
14507 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
14508 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
14509 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
14510 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
14511 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14512 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14513 ; GCN1-NEXT: .LBB100_2: ; %atomicrmw.start
14514 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
14515 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14516 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
14517 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
14518 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14519 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14520 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14521 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14522 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14523 ; GCN1-NEXT: buffer_wbinvl1_vol
14524 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14525 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14526 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
14527 ; GCN1-NEXT: s_cbranch_execnz .LBB100_2
14528 ; GCN1-NEXT: ; %bb.3: ; %Flow
14529 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
14530 ; GCN1-NEXT: s_branch .LBB100_6
14531 ; GCN1-NEXT: .LBB100_4:
14532 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
14533 ; GCN1-NEXT: s_cbranch_execz .LBB100_6
14534 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
14535 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
14536 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14537 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
14538 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
14539 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
14540 ; GCN1-NEXT: s_add_i32 s34, s34, 4
14541 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
14542 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14543 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14544 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14545 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14546 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14547 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
14548 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
14549 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
14550 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
14551 ; GCN1-NEXT: .LBB100_6: ; %atomicrmw.phi
14552 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14553 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14555 ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar:
14557 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14558 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
14559 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
14560 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
14561 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
14562 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
14563 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14564 ; GCN2-NEXT: s_cbranch_vccz .LBB100_4
14565 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
14566 ; GCN2-NEXT: s_add_u32 s34, s4, 4
14567 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
14568 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
14569 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
14570 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
14571 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
14572 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
14573 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
14574 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
14575 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14576 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14577 ; GCN2-NEXT: .LBB100_2: ; %atomicrmw.start
14578 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14579 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14580 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
14581 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
14582 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14583 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14584 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14585 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14586 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14587 ; GCN2-NEXT: buffer_wbinvl1_vol
14588 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14589 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14590 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
14591 ; GCN2-NEXT: s_cbranch_execnz .LBB100_2
14592 ; GCN2-NEXT: ; %bb.3: ; %Flow
14593 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
14594 ; GCN2-NEXT: s_branch .LBB100_6
14595 ; GCN2-NEXT: .LBB100_4:
14596 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
14597 ; GCN2-NEXT: s_cbranch_execz .LBB100_6
14598 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
14599 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
14600 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
14601 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
14602 ; GCN2-NEXT: s_add_i32 s34, s34, 4
14603 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
14604 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14605 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14606 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14607 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14608 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14609 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14610 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
14611 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
14612 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
14613 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
14614 ; GCN2-NEXT: .LBB100_6: ; %atomicrmw.phi
14615 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14616 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14618 ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar:
14620 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14621 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
14622 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
14623 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
14624 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
14625 ; GCN3-NEXT: s_cbranch_vccz .LBB100_4
14626 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
14627 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
14628 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
14629 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
14630 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
14631 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
14632 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
14633 ; GCN3-NEXT: .LBB100_2: ; %atomicrmw.start
14634 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
14635 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14636 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
14637 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
14638 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14639 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14640 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14641 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14642 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14643 ; GCN3-NEXT: buffer_wbinvl1_vol
14644 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14645 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
14646 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
14647 ; GCN3-NEXT: s_cbranch_execnz .LBB100_2
14648 ; GCN3-NEXT: ; %bb.3: ; %Flow
14649 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
14650 ; GCN3-NEXT: s_branch .LBB100_6
14651 ; GCN3-NEXT: .LBB100_4:
14652 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
14653 ; GCN3-NEXT: s_cbranch_execz .LBB100_6
14654 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
14655 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
14656 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
14657 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
14658 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14659 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
14660 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
14661 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
14662 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14663 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14664 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
14665 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
14666 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
14667 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
14668 ; GCN3-NEXT: .LBB100_6: ; %atomicrmw.phi
14669 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14670 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14671 %result = atomicrmw umax ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw umax` on an i64 through a flat pointer, in a
; function declared to return i64; the atomic's result is bound to %result.
; Pointer and operand are both passed `inreg` under the amdgpu_gfx calling
; convention, and the access is made at i64 element index 4 of %out (the
; checks add 32 to the incoming address).
;
; What the expected code below shows for all three targets:
;   * the high dword of the offset address (s35) is compared against a
;     reference value (fetched with s_load_dword on the bonaire/tonga runs,
;     taken from src_private_base on the gfx900 run) to pick the
;     %atomicrmw.global or the %atomicrmw.private block;
;   * %atomicrmw.global is a compare-and-swap loop (%atomicrmw.start) built
;     from v_cmp_lt_u64 + v_cndmask + flat_atomic_cmpswap_x2, with the
;     cmpswap destination in v[0:1];
;   * %atomicrmw.private loads the two dwords into v0/v1, selects into
;     separate registers, and stores those back, leaving v0/v1 as loaded.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
14675 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
14676 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
14678 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14679 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
14680 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
14681 ; GCN1-NEXT: s_add_u32 s34, s4, 32
14682 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
14683 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
14684 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
14685 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
14686 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14687 ; GCN1-NEXT: s_cbranch_vccz .LBB101_4
14688 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
14689 ; GCN1-NEXT: s_add_u32 s36, s34, 4
14690 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
14691 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
14692 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
14693 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
14694 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
14695 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
14696 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
14697 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
14698 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14699 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14700 ; GCN1-NEXT: .LBB101_2: ; %atomicrmw.start
14701 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
14702 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14703 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
14704 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
14705 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14706 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14707 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14708 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14709 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14710 ; GCN1-NEXT: buffer_wbinvl1_vol
14711 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14712 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14713 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
14714 ; GCN1-NEXT: s_cbranch_execnz .LBB101_2
14715 ; GCN1-NEXT: ; %bb.3: ; %Flow
14716 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
14717 ; GCN1-NEXT: s_branch .LBB101_6
14718 ; GCN1-NEXT: .LBB101_4:
14719 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
14720 ; GCN1-NEXT: s_cbranch_execz .LBB101_6
14721 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
14722 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
14723 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
14724 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
14725 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
14726 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
14727 ; GCN1-NEXT: s_add_i32 s34, s34, 4
14728 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
14729 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14730 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14731 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
14732 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14733 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14734 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
14735 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
14736 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
14737 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
14738 ; GCN1-NEXT: .LBB101_6: ; %atomicrmw.phi
14739 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14740 ; GCN1-NEXT: s_setpc_b64 s[30:31]
14742 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
14744 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14745 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
14746 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
14747 ; GCN2-NEXT: s_add_u32 s34, s4, 32
14748 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
14749 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
14750 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
14751 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
14752 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14753 ; GCN2-NEXT: s_cbranch_vccz .LBB101_4
14754 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
14755 ; GCN2-NEXT: s_add_u32 s36, s34, 4
14756 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
14757 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
14758 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
14759 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
14760 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
14761 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
14762 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
14763 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
14764 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14765 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14766 ; GCN2-NEXT: .LBB101_2: ; %atomicrmw.start
14767 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14768 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14769 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
14770 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
14771 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14772 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14773 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14774 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14775 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14776 ; GCN2-NEXT: buffer_wbinvl1_vol
14777 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14778 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14779 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
14780 ; GCN2-NEXT: s_cbranch_execnz .LBB101_2
14781 ; GCN2-NEXT: ; %bb.3: ; %Flow
14782 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
14783 ; GCN2-NEXT: s_branch .LBB101_6
14784 ; GCN2-NEXT: .LBB101_4:
14785 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
14786 ; GCN2-NEXT: s_cbranch_execz .LBB101_6
14787 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
14788 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
14789 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
14790 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
14791 ; GCN2-NEXT: s_add_i32 s34, s34, 4
14792 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
14793 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14794 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
14795 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
14796 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
14797 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14798 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14799 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
14800 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
14801 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
14802 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
14803 ; GCN2-NEXT: .LBB101_6: ; %atomicrmw.phi
14804 ; GCN2-NEXT: s_waitcnt vmcnt(0)
14805 ; GCN2-NEXT: s_setpc_b64 s[30:31]
14807 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
14809 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14810 ; GCN3-NEXT: s_add_u32 s34, s4, 32
14811 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
14812 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
14813 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
14814 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
14815 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
14816 ; GCN3-NEXT: s_cbranch_vccz .LBB101_4
14817 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
14818 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
14819 ; GCN3-NEXT: v_mov_b32_e32 v3, s35
14820 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
14821 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
14822 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
14823 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
14824 ; GCN3-NEXT: .LBB101_2: ; %atomicrmw.start
14825 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
14826 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14827 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
14828 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
14829 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
14830 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
14831 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
14832 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
14833 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14834 ; GCN3-NEXT: buffer_wbinvl1_vol
14835 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
14836 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
14837 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
14838 ; GCN3-NEXT: s_cbranch_execnz .LBB101_2
14839 ; GCN3-NEXT: ; %bb.3: ; %Flow
14840 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
14841 ; GCN3-NEXT: s_branch .LBB101_6
14842 ; GCN3-NEXT: .LBB101_4:
14843 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
14844 ; GCN3-NEXT: s_cbranch_execz .LBB101_6
14845 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
14846 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
14847 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
14848 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
14849 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
14850 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
14851 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
14852 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
14853 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14854 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
14855 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
14856 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
14857 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
14858 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
14859 ; GCN3-NEXT: .LBB101_6: ; %atomicrmw.phi
14860 ; GCN3-NEXT: s_waitcnt vmcnt(0)
14861 ; GCN3-NEXT: s_setpc_b64 s[30:31]
14862 %gep = getelementptr i64, ptr %out, i64 4
14863 %result = atomicrmw umax ptr %gep, i64 %in seq_cst
14867 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
14868 ; GCN1-LABEL: atomic_umax_i64_addr64_offset:
14869 ; GCN1: ; %bb.0: ; %entry
14870 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
14871 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14872 ; GCN1-NEXT: s_mov_b32 s14, -1
14873 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
14874 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
14875 ; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f
14876 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
14877 ; GCN1-NEXT: s_add_u32 s12, s12, s11
14878 ; GCN1-NEXT: s_addc_u32 s13, s13, 0
14879 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
14880 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
14881 ; GCN1-NEXT: s_add_u32 s0, s0, s4
14882 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
14883 ; GCN1-NEXT: s_add_u32 s0, s0, 32
14884 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
14885 ; GCN1-NEXT: s_cmp_eq_u32 s1, s8
14886 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
14887 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
14888 ; GCN1-NEXT: s_mov_b64 s[4:5], -1
14889 ; GCN1-NEXT: s_cbranch_vccnz .LBB102_3
14890 ; GCN1-NEXT: ; %bb.1: ; %Flow6
14891 ; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5]
14892 ; GCN1-NEXT: s_cbranch_vccnz .LBB102_6
14893 ; GCN1-NEXT: .LBB102_2: ; %atomicrmw.phi
14894 ; GCN1-NEXT: s_endpgm
14895 ; GCN1-NEXT: .LBB102_3: ; %atomicrmw.global
14896 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
14897 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
14898 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
14899 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
14900 ; GCN1-NEXT: v_mov_b32_e32 v6, s3
14901 ; GCN1-NEXT: v_mov_b32_e32 v7, s2
14902 ; GCN1-NEXT: .LBB102_4: ; %atomicrmw.start
14903 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
14904 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14905 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
14906 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14907 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14908 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14909 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14910 ; GCN1-NEXT: buffer_wbinvl1_vol
14911 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14912 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
14913 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14914 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
14915 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
14916 ; GCN1-NEXT: s_cbranch_execnz .LBB102_4
14917 ; GCN1-NEXT: ; %bb.5: ; %Flow
14918 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
14919 ; GCN1-NEXT: s_branch .LBB102_2
14920 ; GCN1-NEXT: .LBB102_6: ; %atomicrmw.private
14921 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
14922 ; GCN1-NEXT: v_mov_b32_e32 v5, s2
14923 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
14924 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
14925 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
14926 ; GCN1-NEXT: s_add_i32 s0, s0, 4
14927 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
14928 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
14929 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
14930 ; GCN1-NEXT: v_mov_b32_e32 v4, s3
14931 ; GCN1-NEXT: s_waitcnt vmcnt(0)
14932 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
14933 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
14934 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
14935 ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
14936 ; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
14937 ; GCN1-NEXT: s_endpgm
14939 ; GCN2-LABEL: atomic_umax_i64_addr64_offset:
14940 ; GCN2: ; %bb.0: ; %entry
14941 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
14942 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
14943 ; GCN2-NEXT: s_mov_b32 s90, -1
14944 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
14945 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
14946 ; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc
14947 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
14948 ; GCN2-NEXT: s_add_u32 s88, s88, s11
14949 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
14950 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
14951 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
14952 ; GCN2-NEXT: s_add_u32 s0, s0, s4
14953 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
14954 ; GCN2-NEXT: s_add_u32 s0, s0, 32
14955 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
14956 ; GCN2-NEXT: s_cmp_eq_u32 s1, s8
14957 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
14958 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
14959 ; GCN2-NEXT: s_mov_b64 s[4:5], -1
14960 ; GCN2-NEXT: s_cbranch_vccnz .LBB102_3
14961 ; GCN2-NEXT: ; %bb.1: ; %Flow6
14962 ; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5]
14963 ; GCN2-NEXT: s_cbranch_vccnz .LBB102_6
14964 ; GCN2-NEXT: .LBB102_2: ; %atomicrmw.phi
14965 ; GCN2-NEXT: s_endpgm
14966 ; GCN2-NEXT: .LBB102_3: ; %atomicrmw.global
14967 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
14968 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
14969 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
14970 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
14971 ; GCN2-NEXT: v_mov_b32_e32 v6, s3
14972 ; GCN2-NEXT: v_mov_b32_e32 v7, s2
14973 ; GCN2-NEXT: .LBB102_4: ; %atomicrmw.start
14974 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
14975 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14976 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
14977 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
14978 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
14979 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
14980 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14981 ; GCN2-NEXT: buffer_wbinvl1_vol
14982 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
14983 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
14984 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14985 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
14986 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
14987 ; GCN2-NEXT: s_cbranch_execnz .LBB102_4
14988 ; GCN2-NEXT: ; %bb.5: ; %Flow
14989 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
14990 ; GCN2-NEXT: s_branch .LBB102_2
14991 ; GCN2-NEXT: .LBB102_6: ; %atomicrmw.private
14992 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
14993 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
14994 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
14995 ; GCN2-NEXT: s_add_i32 s0, s0, 4
14996 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
14997 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
14998 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
14999 ; GCN2-NEXT: v_mov_b32_e32 v5, s2
15000 ; GCN2-NEXT: v_mov_b32_e32 v4, s3
15001 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15002 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
15003 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
15004 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
15005 ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
15006 ; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
15007 ; GCN2-NEXT: s_endpgm
15009 ; GCN3-LABEL: atomic_umax_i64_addr64_offset:
15010 ; GCN3: ; %bb.0: ; %entry
15011 ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
15012 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
15013 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
15014 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
15015 ; GCN3-NEXT: s_mov_b32 s14, -1
15016 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000
15017 ; GCN3-NEXT: s_add_u32 s12, s12, s11
15018 ; GCN3-NEXT: s_addc_u32 s13, s13, 0
15019 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
15020 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
15021 ; GCN3-NEXT: s_add_u32 s0, s0, s6
15022 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
15023 ; GCN3-NEXT: s_add_u32 s0, s0, 32
15024 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
15025 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
15026 ; GCN3-NEXT: s_cmp_eq_u32 s1, s5
15027 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0
15028 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5]
15029 ; GCN3-NEXT: s_mov_b64 s[4:5], -1
15030 ; GCN3-NEXT: s_cbranch_vccnz .LBB102_3
15031 ; GCN3-NEXT: ; %bb.1: ; %Flow6
15032 ; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5]
15033 ; GCN3-NEXT: s_cbranch_vccnz .LBB102_6
15034 ; GCN3-NEXT: .LBB102_2: ; %atomicrmw.phi
15035 ; GCN3-NEXT: s_endpgm
15036 ; GCN3-NEXT: .LBB102_3: ; %atomicrmw.global
15037 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
15038 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
15039 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
15040 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
15041 ; GCN3-NEXT: v_mov_b32_e32 v6, s3
15042 ; GCN3-NEXT: v_mov_b32_e32 v7, s2
15043 ; GCN3-NEXT: .LBB102_4: ; %atomicrmw.start
15044 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
15045 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15046 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
15047 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
15048 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
15049 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
15050 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15051 ; GCN3-NEXT: buffer_wbinvl1_vol
15052 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
15053 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
15054 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
15055 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
15056 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
15057 ; GCN3-NEXT: s_cbranch_execnz .LBB102_4
15058 ; GCN3-NEXT: ; %bb.5: ; %Flow
15059 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
15060 ; GCN3-NEXT: s_branch .LBB102_2
15061 ; GCN3-NEXT: .LBB102_6: ; %atomicrmw.private
15062 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
15063 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
15064 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
15065 ; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
15066 ; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
15067 ; GCN3-NEXT: v_mov_b32_e32 v4, s2
15068 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
15069 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15070 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
15071 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
15072 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15073 ; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
15074 ; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
15075 ; GCN3-NEXT: s_endpgm
15077 %ptr = getelementptr i64, ptr %out, i64 %index
15078 %gep = getelementptr i64, ptr %ptr, i64 4
15079 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
; Kernel test: seq_cst `atomicrmw umax` on a flat i64 pointer formed as
; %out indexed by %index and then advanced by 4 further i64 elements (the
; checks below add 32 to the address), with the value returned by the atomic
; stored to %out2.
; For all three check prefixes the expected code compares the high half of the
; address against a private-aperture value (src_private_base in the gfx900
; checks; a dword loaded via s[4:5] in the bonaire/tonga checks, presumably the
; same aperture - TODO confirm). When they are equal the scratch
; load / compare / select / store sequence (%atomicrmw.private) runs; otherwise
; a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.global) runs.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
15083 define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
15084 ; GCN1-LABEL: atomic_umax_i64_ret_addr64_offset:
15085 ; GCN1: ; %bb.0: ; %entry
15086 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
15087 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
15088 ; GCN1-NEXT: s_mov_b32 s18, -1
15089 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
15090 ; GCN1-NEXT: s_add_u32 s16, s16, s11
15091 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
15092 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
15093 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
15094 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
15095 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15096 ; GCN1-NEXT: s_add_u32 s0, s8, s0
15097 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
15098 ; GCN1-NEXT: s_add_u32 s0, s0, 32
15099 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
15100 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
15101 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
15102 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15103 ; GCN1-NEXT: s_cbranch_vccz .LBB103_4
15104 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
15105 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
15106 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
15107 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15108 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
15109 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
15110 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
15111 ; GCN1-NEXT: .LBB103_2: ; %atomicrmw.start
15112 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
15113 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15114 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
15115 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
15116 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15117 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15118 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15119 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15120 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15121 ; GCN1-NEXT: buffer_wbinvl1_vol
15122 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15123 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15124 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
15125 ; GCN1-NEXT: s_cbranch_execnz .LBB103_2
15126 ; GCN1-NEXT: ; %bb.3: ; %Flow
15127 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
15128 ; GCN1-NEXT: s_branch .LBB103_6
15129 ; GCN1-NEXT: .LBB103_4:
15130 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
15131 ; GCN1-NEXT: s_cbranch_execz .LBB103_6
15132 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
15133 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
15134 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
15135 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
15136 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
15137 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
15138 ; GCN1-NEXT: s_add_i32 s0, s0, 4
15139 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
15140 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
15141 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
15142 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
15143 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15144 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15145 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
15146 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
15147 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
15148 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
15149 ; GCN1-NEXT: .LBB103_6: ; %atomicrmw.phi
15150 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
15151 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
15152 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15153 ; GCN1-NEXT: s_endpgm
15155 ; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
15156 ; GCN2: ; %bb.0: ; %entry
15157 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
15158 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
15159 ; GCN2-NEXT: s_mov_b32 s90, -1
15160 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
15161 ; GCN2-NEXT: s_add_u32 s88, s88, s11
15162 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
15163 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
15164 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
15165 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
15166 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15167 ; GCN2-NEXT: s_add_u32 s0, s8, s0
15168 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
15169 ; GCN2-NEXT: s_add_u32 s0, s0, 32
15170 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
15171 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
15172 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
15173 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15174 ; GCN2-NEXT: s_cbranch_vccz .LBB103_4
15175 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
15176 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
15177 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
15178 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15179 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
15180 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
15181 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
15182 ; GCN2-NEXT: .LBB103_2: ; %atomicrmw.start
15183 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
15184 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15185 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
15186 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
15187 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15188 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15189 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15190 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15191 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15192 ; GCN2-NEXT: buffer_wbinvl1_vol
15193 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15194 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15195 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
15196 ; GCN2-NEXT: s_cbranch_execnz .LBB103_2
15197 ; GCN2-NEXT: ; %bb.3: ; %Flow
15198 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
15199 ; GCN2-NEXT: s_branch .LBB103_6
15200 ; GCN2-NEXT: .LBB103_4:
15201 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
15202 ; GCN2-NEXT: s_cbranch_execz .LBB103_6
15203 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
15204 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
15205 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
15206 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
15207 ; GCN2-NEXT: s_add_i32 s0, s0, 4
15208 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
15209 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
15210 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
15211 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
15212 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
15213 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15214 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15215 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
15216 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
15217 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
15218 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
15219 ; GCN2-NEXT: .LBB103_6: ; %atomicrmw.phi
15220 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
15221 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
15222 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15223 ; GCN2-NEXT: s_endpgm
15225 ; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
15226 ; GCN3: ; %bb.0: ; %entry
15227 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
15228 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
15229 ; GCN3-NEXT: s_mov_b32 s18, -1
15230 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
15231 ; GCN3-NEXT: s_add_u32 s16, s16, s11
15232 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
15233 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
15234 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
15235 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
15236 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15237 ; GCN3-NEXT: s_add_u32 s0, s8, s0
15238 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
15239 ; GCN3-NEXT: s_add_u32 s0, s0, 32
15240 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
15241 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
15242 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
15243 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15244 ; GCN3-NEXT: s_cbranch_vccz .LBB103_4
15245 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
15246 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
15247 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
15248 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15249 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
15250 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
15251 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
15252 ; GCN3-NEXT: .LBB103_2: ; %atomicrmw.start
15253 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
15254 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15255 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
15256 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
15257 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15258 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15259 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15260 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15261 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15262 ; GCN3-NEXT: buffer_wbinvl1_vol
15263 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15264 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15265 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
15266 ; GCN3-NEXT: s_cbranch_execnz .LBB103_2
15267 ; GCN3-NEXT: ; %bb.3: ; %Flow
15268 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
15269 ; GCN3-NEXT: s_branch .LBB103_6
15270 ; GCN3-NEXT: .LBB103_4:
15271 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
15272 ; GCN3-NEXT: s_cbranch_execz .LBB103_6
15273 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
15274 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
15275 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
15276 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
15277 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
15278 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
15279 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
15280 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
15281 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15282 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15283 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
15284 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
15285 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
15286 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
15287 ; GCN3-NEXT: .LBB103_6: ; %atomicrmw.phi
15288 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
15289 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
15290 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15291 ; GCN3-NEXT: s_endpgm
; IR under test: index %out by %index, step 4 more i64 elements, apply the
; atomic umax with operand %in, then store the value it returned to %out2.
15293 %ptr = getelementptr i64, ptr %out, i64 %index
15294 %gep = getelementptr i64, ptr %ptr, i64 4
15295 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst
15296 store i64 %tmp0, ptr %out2
; Kernel test: seq_cst `atomicrmw umax` on the flat i64 pointer %out indexed
; by %index (no additional constant offset; the checks below have no "add 32"
; step), with the value returned by the atomic stored to %out2.
; For all three check prefixes the expected code compares the high half of the
; address against a private-aperture value (src_private_base in the gfx900
; checks; a dword loaded via s[4:5] in the bonaire/tonga checks, presumably the
; same aperture - TODO confirm). When they are equal the scratch
; load / compare / select / store sequence (%atomicrmw.private) runs; otherwise
; a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.global) runs.
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
15300 define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
15301 ; GCN1-LABEL: atomic_umax_i64_ret_addr64:
15302 ; GCN1: ; %bb.0: ; %entry
15303 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
15304 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
15305 ; GCN1-NEXT: s_mov_b32 s18, -1
15306 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
15307 ; GCN1-NEXT: s_add_u32 s16, s16, s11
15308 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
15309 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
15310 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
15311 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
15312 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15313 ; GCN1-NEXT: s_add_u32 s0, s8, s0
15314 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
15315 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
15316 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
15317 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15318 ; GCN1-NEXT: s_cbranch_vccz .LBB104_4
15319 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
15320 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
15321 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
15322 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15323 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
15324 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
15325 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
15326 ; GCN1-NEXT: .LBB104_2: ; %atomicrmw.start
15327 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
15328 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15329 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
15330 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
15331 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15332 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15333 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15334 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15335 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15336 ; GCN1-NEXT: buffer_wbinvl1_vol
15337 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15338 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15339 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
15340 ; GCN1-NEXT: s_cbranch_execnz .LBB104_2
15341 ; GCN1-NEXT: ; %bb.3: ; %Flow
15342 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
15343 ; GCN1-NEXT: s_branch .LBB104_6
15344 ; GCN1-NEXT: .LBB104_4:
15345 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
15346 ; GCN1-NEXT: s_cbranch_execz .LBB104_6
15347 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
15348 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
15349 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
15350 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
15351 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
15352 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
15353 ; GCN1-NEXT: s_add_i32 s0, s0, 4
15354 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
15355 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
15356 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
15357 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
15358 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15359 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15360 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
15361 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
15362 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
15363 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
15364 ; GCN1-NEXT: .LBB104_6: ; %atomicrmw.phi
15365 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
15366 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
15367 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15368 ; GCN1-NEXT: s_endpgm
15370 ; GCN2-LABEL: atomic_umax_i64_ret_addr64:
15371 ; GCN2: ; %bb.0: ; %entry
15372 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
15373 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
15374 ; GCN2-NEXT: s_mov_b32 s90, -1
15375 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
15376 ; GCN2-NEXT: s_add_u32 s88, s88, s11
15377 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
15378 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
15379 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
15380 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
15381 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15382 ; GCN2-NEXT: s_add_u32 s0, s8, s0
15383 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
15384 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
15385 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
15386 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15387 ; GCN2-NEXT: s_cbranch_vccz .LBB104_4
15388 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
15389 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
15390 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
15391 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15392 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
15393 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
15394 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
15395 ; GCN2-NEXT: .LBB104_2: ; %atomicrmw.start
15396 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
15397 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15398 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
15399 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
15400 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15401 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15402 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15403 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15404 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15405 ; GCN2-NEXT: buffer_wbinvl1_vol
15406 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15407 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15408 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
15409 ; GCN2-NEXT: s_cbranch_execnz .LBB104_2
15410 ; GCN2-NEXT: ; %bb.3: ; %Flow
15411 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
15412 ; GCN2-NEXT: s_branch .LBB104_6
15413 ; GCN2-NEXT: .LBB104_4:
15414 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
15415 ; GCN2-NEXT: s_cbranch_execz .LBB104_6
15416 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
15417 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
15418 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
15419 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
15420 ; GCN2-NEXT: s_add_i32 s0, s0, 4
15421 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
15422 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
15423 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
15424 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
15425 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
15426 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15427 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15428 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
15429 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
15430 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
15431 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
15432 ; GCN2-NEXT: .LBB104_6: ; %atomicrmw.phi
15433 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
15434 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
15435 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15436 ; GCN2-NEXT: s_endpgm
15438 ; GCN3-LABEL: atomic_umax_i64_ret_addr64:
15439 ; GCN3: ; %bb.0: ; %entry
15440 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
15441 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
15442 ; GCN3-NEXT: s_mov_b32 s18, -1
15443 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
15444 ; GCN3-NEXT: s_add_u32 s16, s16, s11
15445 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
15446 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
15447 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
15448 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
15449 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
15450 ; GCN3-NEXT: s_add_u32 s0, s8, s0
15451 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
15452 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
15453 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
15454 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
15455 ; GCN3-NEXT: s_cbranch_vccz .LBB104_4
15456 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
15457 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
15458 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
15459 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
15460 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
15461 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
15462 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
15463 ; GCN3-NEXT: .LBB104_2: ; %atomicrmw.start
15464 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
15465 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15466 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
15467 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
15468 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
15469 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
15470 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
15471 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
15472 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15473 ; GCN3-NEXT: buffer_wbinvl1_vol
15474 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15475 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15476 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
15477 ; GCN3-NEXT: s_cbranch_execnz .LBB104_2
15478 ; GCN3-NEXT: ; %bb.3: ; %Flow
15479 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
15480 ; GCN3-NEXT: s_branch .LBB104_6
15481 ; GCN3-NEXT: .LBB104_4:
15482 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
15483 ; GCN3-NEXT: s_cbranch_execz .LBB104_6
15484 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
15485 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
15486 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
15487 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
15488 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
15489 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
15490 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
15491 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
15492 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15493 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
15494 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
15495 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
15496 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
15497 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
15498 ; GCN3-NEXT: .LBB104_6: ; %atomicrmw.phi
15499 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
15500 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
15501 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
15502 ; GCN3-NEXT: s_endpgm
; IR under test: index %out by %index, apply the atomic umax with operand %in
; directly at that address, then store the value it returned to %out2.
15504 %ptr = getelementptr i64, ptr %out, i64 %index
15505 %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst
15506 store i64 %tmp0, ptr %out2
; Non-kernel test: seq_cst `atomicrmw umax` on a flat i64 pointer four i64
; elements past %out (the checks below add 32 to the address), tagged with
; !amdgpu.no.remote.memory; the value returned by the atomic is not used.
; All three check prefixes expect a per-lane split on the high half of the
; address: it is compared against src_private_base in the gfx900 checks and
; against a dword loaded from address 0xe4 in the bonaire/tonga checks
; (presumably the private aperture - TODO confirm). Lanes where it differs run
; a flat_atomic_cmpswap_x2 retry loop (%atomicrmw.global); the remaining lanes
; do a buffer load / compare / select / store through s[0:3]
; (%atomicrmw.private).
; The check lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
15510 define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
15511 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
15513 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15514 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
15515 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
15516 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
15517 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
15518 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
15519 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
15520 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
15521 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15522 ; GCN1-NEXT: s_cbranch_execnz .LBB105_3
15523 ; GCN1-NEXT: ; %bb.1: ; %Flow3
15524 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15525 ; GCN1-NEXT: s_cbranch_execnz .LBB105_6
15526 ; GCN1-NEXT: .LBB105_2: ; %atomicrmw.phi
15527 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15528 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15529 ; GCN1-NEXT: .LBB105_3: ; %atomicrmw.global
15530 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
15531 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15532 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
15533 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
15534 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
15535 ; GCN1-NEXT: .LBB105_4: ; %atomicrmw.start
15536 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
15537 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15538 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
15539 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
15540 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
15541 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
15542 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15543 ; GCN1-NEXT: buffer_wbinvl1_vol
15544 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
15545 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
15546 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15547 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
15548 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
15549 ; GCN1-NEXT: s_cbranch_execnz .LBB105_4
15550 ; GCN1-NEXT: ; %bb.5: ; %Flow
15551 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
15552 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
15553 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
15554 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15555 ; GCN1-NEXT: s_cbranch_execz .LBB105_2
15556 ; GCN1-NEXT: .LBB105_6: ; %atomicrmw.private
15557 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
15558 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
15559 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
15560 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15561 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15562 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15563 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15564 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
15565 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15566 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
15567 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
15568 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15569 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15570 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15572 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
15574 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15575 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
15576 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
15577 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
15578 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
15579 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
15580 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
15581 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
15582 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15583 ; GCN2-NEXT: s_cbranch_execnz .LBB105_3
15584 ; GCN2-NEXT: ; %bb.1: ; %Flow3
15585 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15586 ; GCN2-NEXT: s_cbranch_execnz .LBB105_6
15587 ; GCN2-NEXT: .LBB105_2: ; %atomicrmw.phi
15588 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15589 ; GCN2-NEXT: s_setpc_b64 s[30:31]
15590 ; GCN2-NEXT: .LBB105_3: ; %atomicrmw.global
15591 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
15592 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15593 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
15594 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
15595 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
15596 ; GCN2-NEXT: .LBB105_4: ; %atomicrmw.start
15597 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
15598 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15599 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
15600 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
15601 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
15602 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
15603 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15604 ; GCN2-NEXT: buffer_wbinvl1_vol
15605 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
15606 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
15607 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15608 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
15609 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
15610 ; GCN2-NEXT: s_cbranch_execnz .LBB105_4
15611 ; GCN2-NEXT: ; %bb.5: ; %Flow
15612 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
15613 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
15614 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
15615 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15616 ; GCN2-NEXT: s_cbranch_execz .LBB105_2
15617 ; GCN2-NEXT: .LBB105_6: ; %atomicrmw.private
15618 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
15619 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
15620 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
15621 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15622 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15623 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15624 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15625 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
15626 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15627 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
15628 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
15629 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15630 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15631 ; GCN2-NEXT: s_setpc_b64 s[30:31]
15633 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
15635 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15636 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
15637 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
15638 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
15639 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
15640 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
15641 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15642 ; GCN3-NEXT: s_cbranch_execnz .LBB105_3
15643 ; GCN3-NEXT: ; %bb.1: ; %Flow3
15644 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15645 ; GCN3-NEXT: s_cbranch_execnz .LBB105_6
15646 ; GCN3-NEXT: .LBB105_2: ; %atomicrmw.phi
15647 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
15648 ; GCN3-NEXT: s_setpc_b64 s[30:31]
15649 ; GCN3-NEXT: .LBB105_3: ; %atomicrmw.global
15650 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
15651 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
15652 ; GCN3-NEXT: .LBB105_4: ; %atomicrmw.start
15653 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
15654 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15655 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
15656 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
15657 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
15658 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
15659 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15660 ; GCN3-NEXT: buffer_wbinvl1_vol
15661 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
15662 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
15663 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15664 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
15665 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
15666 ; GCN3-NEXT: s_cbranch_execnz .LBB105_4
15667 ; GCN3-NEXT: ; %bb.5: ; %Flow
15668 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
15669 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
15670 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
15671 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15672 ; GCN3-NEXT: s_cbranch_execz .LBB105_2
15673 ; GCN3-NEXT: .LBB105_6: ; %atomicrmw.private
15674 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
15675 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
15676 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15677 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
15678 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15679 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15680 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
15681 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15682 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
15683 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
15684 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
15685 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15686 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: step 4 i64 elements past %out and apply the atomic umax with
; operand %in there; %tmp0 (the value returned by the atomic) is left unused.
15687 %gep = getelementptr i64, ptr %out, i64 4
15688 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning 64-bit unsigned-max atomicrmw (seq_cst) through a generic
; pointer, addressed at i64 element 4 of %out (the expected code adds 32 to the
; pointer) and tagged with !amdgpu.no.remote.memory.
; For each check prefix (GCN1 is the bonaire run, GCN2 the tonga run, GCN3 the
; gfx900 run) the expected code has two paths. The %atomicrmw.global path loops
; on flat_atomic_cmpswap_x2 until the compare-and-swap succeeds, and the
; %atomicrmw.private path does a buffer load, compare, select and store.
15692 define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
15693 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
15695 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15696 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
15697 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
15698 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
15699 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15700 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
15701 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
15702 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
15703 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
15704 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15705 ; GCN1-NEXT: s_cbranch_execnz .LBB106_3
15706 ; GCN1-NEXT: ; %bb.1: ; %Flow3
15707 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15708 ; GCN1-NEXT: s_cbranch_execnz .LBB106_6
15709 ; GCN1-NEXT: .LBB106_2: ; %atomicrmw.phi
15710 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15711 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15712 ; GCN1-NEXT: .LBB106_3: ; %atomicrmw.global
15713 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
15714 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
15715 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
15716 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
15717 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
15718 ; GCN1-NEXT: .LBB106_4: ; %atomicrmw.start
15719 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
15720 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15721 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
15722 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
15723 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
15724 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
15725 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
15726 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
15727 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15728 ; GCN1-NEXT: buffer_wbinvl1_vol
15729 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15730 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15731 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
15732 ; GCN1-NEXT: s_cbranch_execnz .LBB106_4
15733 ; GCN1-NEXT: ; %bb.5: ; %Flow
15734 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
15735 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
15736 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
15737 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15738 ; GCN1-NEXT: s_cbranch_execz .LBB106_2
15739 ; GCN1-NEXT: .LBB106_6: ; %atomicrmw.private
15740 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
15741 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
15742 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
15743 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15744 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15745 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15746 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15747 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
15748 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
15749 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
15750 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
15751 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15752 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15753 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15755 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
15757 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15758 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
15759 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
15760 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
15761 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15762 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
15763 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
15764 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
15765 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
15766 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15767 ; GCN2-NEXT: s_cbranch_execnz .LBB106_3
15768 ; GCN2-NEXT: ; %bb.1: ; %Flow3
15769 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15770 ; GCN2-NEXT: s_cbranch_execnz .LBB106_6
15771 ; GCN2-NEXT: .LBB106_2: ; %atomicrmw.phi
15772 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15773 ; GCN2-NEXT: s_setpc_b64 s[30:31]
15774 ; GCN2-NEXT: .LBB106_3: ; %atomicrmw.global
15775 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
15776 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
15777 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
15778 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
15779 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
15780 ; GCN2-NEXT: .LBB106_4: ; %atomicrmw.start
15781 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
15782 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15783 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
15784 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
15785 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
15786 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
15787 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
15788 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
15789 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15790 ; GCN2-NEXT: buffer_wbinvl1_vol
15791 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15792 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15793 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
15794 ; GCN2-NEXT: s_cbranch_execnz .LBB106_4
15795 ; GCN2-NEXT: ; %bb.5: ; %Flow
15796 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
15797 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
15798 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
15799 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15800 ; GCN2-NEXT: s_cbranch_execz .LBB106_2
15801 ; GCN2-NEXT: .LBB106_6: ; %atomicrmw.private
15802 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
15803 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
15804 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
15805 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15806 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15807 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15808 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15809 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
15810 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
15811 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
15812 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
15813 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15814 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15815 ; GCN2-NEXT: s_setpc_b64 s[30:31]
15817 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
15819 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15820 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
15821 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
15822 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
15823 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
15824 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
15825 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
15826 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15827 ; GCN3-NEXT: s_cbranch_execnz .LBB106_3
15828 ; GCN3-NEXT: ; %bb.1: ; %Flow3
15829 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15830 ; GCN3-NEXT: s_cbranch_execnz .LBB106_6
15831 ; GCN3-NEXT: .LBB106_2: ; %atomicrmw.phi
15832 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
15833 ; GCN3-NEXT: s_setpc_b64 s[30:31]
15834 ; GCN3-NEXT: .LBB106_3: ; %atomicrmw.global
15835 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
15836 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
15837 ; GCN3-NEXT: .LBB106_4: ; %atomicrmw.start
15838 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
15839 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15840 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
15841 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
15842 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
15843 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
15844 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
15845 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
15846 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15847 ; GCN3-NEXT: buffer_wbinvl1_vol
15848 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
15849 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15850 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
15851 ; GCN3-NEXT: s_cbranch_execnz .LBB106_4
15852 ; GCN3-NEXT: ; %bb.5: ; %Flow
15853 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
15854 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
15855 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
15856 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15857 ; GCN3-NEXT: s_cbranch_execz .LBB106_2
15858 ; GCN3-NEXT: .LBB106_6: ; %atomicrmw.private
15859 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
15860 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
15861 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15862 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
15863 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15864 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
15865 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
15866 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
15867 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
15868 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
15869 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
15870 ; GCN3-NEXT: s_waitcnt vmcnt(0)
15871 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operand address is i64 element 4 past %out.
15872 %gep = getelementptr i64, ptr %out, i64 4
15873 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
15877 ; ---------------------------------------------------------------------
15879 ; ---------------------------------------------------------------------
; 64-bit unsigned-min atomicrmw (seq_cst) directly on the generic pointer %ptr,
; in a void function so the old value is not returned.
; For each check prefix (GCN1 is the bonaire run, GCN2 the tonga run, GCN3 the
; gfx900 run) the expected code has two paths. The %atomicrmw.global path loops
; on flat_atomic_cmpswap_x2 until the compare-and-swap succeeds, and the
; %atomicrmw.private path does a buffer load, compare, select and store.
15881 define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
15882 ; GCN1-LABEL: flat_atomic_umin_i64_noret:
15884 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15885 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
15886 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
15887 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
15888 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
15889 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
15890 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15891 ; GCN1-NEXT: s_cbranch_execnz .LBB107_3
15892 ; GCN1-NEXT: ; %bb.1: ; %Flow3
15893 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15894 ; GCN1-NEXT: s_cbranch_execnz .LBB107_6
15895 ; GCN1-NEXT: .LBB107_2: ; %atomicrmw.phi
15896 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15897 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15898 ; GCN1-NEXT: .LBB107_3: ; %atomicrmw.global
15899 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
15900 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15901 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
15902 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
15903 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
15904 ; GCN1-NEXT: .LBB107_4: ; %atomicrmw.start
15905 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
15906 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15907 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
15908 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
15909 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
15910 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
15911 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15912 ; GCN1-NEXT: buffer_wbinvl1_vol
15913 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
15914 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
15915 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15916 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
15917 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
15918 ; GCN1-NEXT: s_cbranch_execnz .LBB107_4
15919 ; GCN1-NEXT: ; %bb.5: ; %Flow
15920 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
15921 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
15922 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
15923 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15924 ; GCN1-NEXT: s_cbranch_execz .LBB107_2
15925 ; GCN1-NEXT: .LBB107_6: ; %atomicrmw.private
15926 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
15927 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
15928 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
15929 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15930 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15931 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15932 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
15933 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
15934 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15935 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
15936 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
15937 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
15938 ; GCN1-NEXT: s_waitcnt vmcnt(0)
15939 ; GCN1-NEXT: s_setpc_b64 s[30:31]
15941 ; GCN2-LABEL: flat_atomic_umin_i64_noret:
15943 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15944 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
15945 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
15946 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
15947 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
15948 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
15949 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
15950 ; GCN2-NEXT: s_cbranch_execnz .LBB107_3
15951 ; GCN2-NEXT: ; %bb.1: ; %Flow3
15952 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15953 ; GCN2-NEXT: s_cbranch_execnz .LBB107_6
15954 ; GCN2-NEXT: .LBB107_2: ; %atomicrmw.phi
15955 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15956 ; GCN2-NEXT: s_setpc_b64 s[30:31]
15957 ; GCN2-NEXT: .LBB107_3: ; %atomicrmw.global
15958 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
15959 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
15960 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
15961 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
15962 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
15963 ; GCN2-NEXT: .LBB107_4: ; %atomicrmw.start
15964 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
15965 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15966 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
15967 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
15968 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
15969 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
15970 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
15971 ; GCN2-NEXT: buffer_wbinvl1_vol
15972 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
15973 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
15974 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
15975 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
15976 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
15977 ; GCN2-NEXT: s_cbranch_execnz .LBB107_4
15978 ; GCN2-NEXT: ; %bb.5: ; %Flow
15979 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
15980 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
15981 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
15982 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
15983 ; GCN2-NEXT: s_cbranch_execz .LBB107_2
15984 ; GCN2-NEXT: .LBB107_6: ; %atomicrmw.private
15985 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
15986 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
15987 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
15988 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
15989 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
15990 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15991 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
15992 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
15993 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
15994 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
15995 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
15996 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
15997 ; GCN2-NEXT: s_waitcnt vmcnt(0)
15998 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16000 ; GCN3-LABEL: flat_atomic_umin_i64_noret:
16002 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16003 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
16004 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
16005 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
16006 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16007 ; GCN3-NEXT: s_cbranch_execnz .LBB107_3
16008 ; GCN3-NEXT: ; %bb.1: ; %Flow3
16009 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16010 ; GCN3-NEXT: s_cbranch_execnz .LBB107_6
16011 ; GCN3-NEXT: .LBB107_2: ; %atomicrmw.phi
16012 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16013 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16014 ; GCN3-NEXT: .LBB107_3: ; %atomicrmw.global
16015 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
16016 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
16017 ; GCN3-NEXT: .LBB107_4: ; %atomicrmw.start
16018 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16019 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16020 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16021 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16022 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16023 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16024 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16025 ; GCN3-NEXT: buffer_wbinvl1_vol
16026 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16027 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
16028 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16029 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
16030 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
16031 ; GCN3-NEXT: s_cbranch_execnz .LBB107_4
16032 ; GCN3-NEXT: ; %bb.5: ; %Flow
16033 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
16034 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
16035 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
16036 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16037 ; GCN3-NEXT: s_cbranch_execz .LBB107_2
16038 ; GCN3-NEXT: .LBB107_6: ; %atomicrmw.private
16039 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16040 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
16041 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16042 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
16043 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16044 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16045 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
16046 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16047 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
16048 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
16049 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16050 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16051 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16052 %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
; 64-bit unsigned-min atomicrmw (seq_cst) on a generic pointer, addressed at
; i64 element 4 of %out (the expected code adds 32 to the pointer), in a void
; function so the old value is not returned.
; For each check prefix (GCN1 is the bonaire run, GCN2 the tonga run, GCN3 the
; gfx900 run) the expected code has two paths. The %atomicrmw.global path loops
; on flat_atomic_cmpswap_x2 until the compare-and-swap succeeds, and the
; %atomicrmw.private path does a buffer load, compare, select and store.
16056 define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
16057 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset:
16059 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16060 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
16061 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
16062 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
16063 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
16064 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16065 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
16066 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
16067 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16068 ; GCN1-NEXT: s_cbranch_execnz .LBB108_3
16069 ; GCN1-NEXT: ; %bb.1: ; %Flow3
16070 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16071 ; GCN1-NEXT: s_cbranch_execnz .LBB108_6
16072 ; GCN1-NEXT: .LBB108_2: ; %atomicrmw.phi
16073 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
16074 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16075 ; GCN1-NEXT: .LBB108_3: ; %atomicrmw.global
16076 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
16077 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16078 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
16079 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
16080 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
16081 ; GCN1-NEXT: .LBB108_4: ; %atomicrmw.start
16082 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
16083 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16084 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16085 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16086 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16087 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16088 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16089 ; GCN1-NEXT: buffer_wbinvl1_vol
16090 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16091 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
16092 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16093 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
16094 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
16095 ; GCN1-NEXT: s_cbranch_execnz .LBB108_4
16096 ; GCN1-NEXT: ; %bb.5: ; %Flow
16097 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
16098 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
16099 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
16100 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16101 ; GCN1-NEXT: s_cbranch_execz .LBB108_2
16102 ; GCN1-NEXT: .LBB108_6: ; %atomicrmw.private
16103 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16104 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
16105 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
16106 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16107 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
16108 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16109 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16110 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
16111 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16112 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
16113 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
16114 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
16115 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16116 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16118 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset:
16120 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16121 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
16122 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
16123 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
16124 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
16125 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
16126 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
16127 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
16128 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16129 ; GCN2-NEXT: s_cbranch_execnz .LBB108_3
16130 ; GCN2-NEXT: ; %bb.1: ; %Flow3
16131 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16132 ; GCN2-NEXT: s_cbranch_execnz .LBB108_6
16133 ; GCN2-NEXT: .LBB108_2: ; %atomicrmw.phi
16134 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
16135 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16136 ; GCN2-NEXT: .LBB108_3: ; %atomicrmw.global
16137 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
16138 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16139 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
16140 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
16141 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
16142 ; GCN2-NEXT: .LBB108_4: ; %atomicrmw.start
16143 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
16144 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16145 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16146 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16147 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16148 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16149 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16150 ; GCN2-NEXT: buffer_wbinvl1_vol
16151 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16152 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
16153 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16154 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
16155 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
16156 ; GCN2-NEXT: s_cbranch_execnz .LBB108_4
16157 ; GCN2-NEXT: ; %bb.5: ; %Flow
16158 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
16159 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
16160 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
16161 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16162 ; GCN2-NEXT: s_cbranch_execz .LBB108_2
16163 ; GCN2-NEXT: .LBB108_6: ; %atomicrmw.private
16164 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16165 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
16166 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
16167 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16168 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
16169 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16170 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16171 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
16172 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16173 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
16174 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
16175 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
16176 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16177 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16179 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset:
16181 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16182 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
16183 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
16184 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
16185 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
16186 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
16187 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16188 ; GCN3-NEXT: s_cbranch_execnz .LBB108_3
16189 ; GCN3-NEXT: ; %bb.1: ; %Flow3
16190 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16191 ; GCN3-NEXT: s_cbranch_execnz .LBB108_6
16192 ; GCN3-NEXT: .LBB108_2: ; %atomicrmw.phi
16193 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16194 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16195 ; GCN3-NEXT: .LBB108_3: ; %atomicrmw.global
16196 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
16197 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
16198 ; GCN3-NEXT: .LBB108_4: ; %atomicrmw.start
16199 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16200 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16201 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16202 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16203 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16204 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16205 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16206 ; GCN3-NEXT: buffer_wbinvl1_vol
16207 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16208 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
16209 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16210 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
16211 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
16212 ; GCN3-NEXT: s_cbranch_execnz .LBB108_4
16213 ; GCN3-NEXT: ; %bb.5: ; %Flow
16214 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
16215 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
16216 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
16217 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16218 ; GCN3-NEXT: s_cbranch_execz .LBB108_2
16219 ; GCN3-NEXT: .LBB108_6: ; %atomicrmw.private
16220 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16221 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
16222 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16223 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
16224 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16225 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16226 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
16227 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16228 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
16229 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
16230 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16231 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16232 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Operand address is i64 element 4 past %out.
16233 %gep = getelementptr i64, ptr %out, i64 4
16234 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
; Value-returning 64-bit unsigned-min atomicrmw (seq_cst) directly on the
; generic pointer %ptr.
; For each check prefix (GCN1 is the bonaire run, GCN2 the tonga run, GCN3 the
; gfx900 run) the expected code has two paths. The %atomicrmw.global path loops
; on flat_atomic_cmpswap_x2 until the compare-and-swap succeeds, and the
; %atomicrmw.private path does a buffer load, compare, select and store.
; Both paths leave the loaded value in v[4:5], which the %atomicrmw.phi block
; copies into v0 and v1 before returning.
16238 define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
16239 ; GCN1-LABEL: flat_atomic_umin_i64_ret:
16241 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16242 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
16243 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
16244 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
16245 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16246 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
16247 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
16248 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16249 ; GCN1-NEXT: s_cbranch_execz .LBB109_4
16250 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
16251 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
16252 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16253 ; GCN1-NEXT: flat_load_dword v5, v[4:5]
16254 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
16255 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
16256 ; GCN1-NEXT: .LBB109_2: ; %atomicrmw.start
16257 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
16258 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16259 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
16260 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
16261 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16262 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16263 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16264 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16265 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16266 ; GCN1-NEXT: buffer_wbinvl1_vol
16267 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16268 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16269 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
16270 ; GCN1-NEXT: s_cbranch_execnz .LBB109_2
16271 ; GCN1-NEXT: ; %bb.3: ; %Flow
16272 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
16273 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
16274 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
16275 ; GCN1-NEXT: .LBB109_4: ; %Flow3
16276 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16277 ; GCN1-NEXT: s_cbranch_execz .LBB109_6
16278 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
16279 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16280 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
16281 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
16282 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
16283 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
16284 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16285 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3]
16286 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
16287 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
16288 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
16289 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
16290 ; GCN1-NEXT: .LBB109_6: ; %atomicrmw.phi
16291 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
16292 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
16293 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
16294 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16295 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16297 ; GCN2-LABEL: flat_atomic_umin_i64_ret:
16299 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16300 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
16301 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
16302 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
16303 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
16304 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
16305 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
16306 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16307 ; GCN2-NEXT: s_cbranch_execz .LBB109_4
16308 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
16309 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
16310 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16311 ; GCN2-NEXT: flat_load_dword v5, v[4:5]
16312 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
16313 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
16314 ; GCN2-NEXT: .LBB109_2: ; %atomicrmw.start
16315 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
16316 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16317 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
16318 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
16319 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16320 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16321 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16322 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16323 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16324 ; GCN2-NEXT: buffer_wbinvl1_vol
16325 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16326 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16327 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
16328 ; GCN2-NEXT: s_cbranch_execnz .LBB109_2
16329 ; GCN2-NEXT: ; %bb.3: ; %Flow
16330 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
16331 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
16332 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
16333 ; GCN2-NEXT: .LBB109_4: ; %Flow3
16334 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16335 ; GCN2-NEXT: s_cbranch_execz .LBB109_6
16336 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
16337 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16338 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
16339 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
16340 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
16341 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
16342 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16343 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3]
16344 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
16345 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
16346 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
16347 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
16348 ; GCN2-NEXT: .LBB109_6: ; %atomicrmw.phi
16349 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
16350 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
16351 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
16352 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16353 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16355 ; GCN3-LABEL: flat_atomic_umin_i64_ret:
16357 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16358 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
16359 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
16360 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
16361 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
16362 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16363 ; GCN3-NEXT: s_cbranch_execz .LBB109_4
16364 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
16365 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
16366 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
16367 ; GCN3-NEXT: .LBB109_2: ; %atomicrmw.start
16368 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16369 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16370 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
16371 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
16372 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
16373 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
16374 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
16375 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
16376 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16377 ; GCN3-NEXT: buffer_wbinvl1_vol
16378 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
16379 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16380 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
16381 ; GCN3-NEXT: s_cbranch_execnz .LBB109_2
16382 ; GCN3-NEXT: ; %bb.3: ; %Flow
16383 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
16384 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
16385 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
16386 ; GCN3-NEXT: .LBB109_4: ; %Flow3
16387 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16388 ; GCN3-NEXT: s_cbranch_execz .LBB109_6
16389 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
16390 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
16391 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
16392 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
16393 ; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
16394 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16395 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[4:5], v[2:3]
16396 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
16397 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
16398 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
16399 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
16400 ; GCN3-NEXT: .LBB109_6: ; %atomicrmw.phi
16401 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16402 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
16403 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
16404 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16405 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16406 %result = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst atomicrmw umin on an i64 reached through a flat pointer plus a
; constant offset, in a function declared to return i64.
;
; What the expected output below shows for every checked target:
;   * the address is formed by adding 32 to the incoming pointer in v[0:1];
;   * the high half of that address is compared against an aperture value
;     (loaded through the 0xe4 constant on the two older targets, read from
;     src_private_base on the newest one) to pick one of two paths;
;   * %atomicrmw.global: a retry loop around flat_atomic_cmpswap_x2 that
;     repeats until the value returned by the swap equals the value compared;
;   * %atomicrmw.private: two buffer loads, a compare/select pair, and two
;     buffer stores of the selected value;
;   * both paths leave the previously loaded value in v[0:1].
16410 define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
16411 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset:
16413 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16414 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
16415 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
16416 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
16417 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16418 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16419 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
16420 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
16421 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
16422 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16423 ; GCN1-NEXT: s_cbranch_execnz .LBB110_3
16424 ; GCN1-NEXT: ; %bb.1: ; %Flow3
16425 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16426 ; GCN1-NEXT: s_cbranch_execnz .LBB110_6
16427 ; GCN1-NEXT: .LBB110_2: ; %atomicrmw.phi
16428 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
16429 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16430 ; GCN1-NEXT: .LBB110_3: ; %atomicrmw.global
16431 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
16432 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
16433 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
16434 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
16435 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
16436 ; GCN1-NEXT: .LBB110_4: ; %atomicrmw.start
16437 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
16438 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16439 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
16440 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
16441 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
16442 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
16443 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
16444 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
16445 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16446 ; GCN1-NEXT: buffer_wbinvl1_vol
16447 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
16448 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16449 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
16450 ; GCN1-NEXT: s_cbranch_execnz .LBB110_4
16451 ; GCN1-NEXT: ; %bb.5: ; %Flow
16452 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
16453 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
16454 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
16455 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16456 ; GCN1-NEXT: s_cbranch_execz .LBB110_2
16457 ; GCN1-NEXT: .LBB110_6: ; %atomicrmw.private
16458 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
16459 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
16460 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
16461 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16462 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
16463 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16464 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16465 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
16466 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
16467 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
16468 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
16469 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
16470 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16471 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16473 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset:
16475 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16476 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
16477 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
16478 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
16479 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
16480 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
16481 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
16482 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
16483 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
16484 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16485 ; GCN2-NEXT: s_cbranch_execnz .LBB110_3
16486 ; GCN2-NEXT: ; %bb.1: ; %Flow3
16487 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16488 ; GCN2-NEXT: s_cbranch_execnz .LBB110_6
16489 ; GCN2-NEXT: .LBB110_2: ; %atomicrmw.phi
16490 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
16491 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16492 ; GCN2-NEXT: .LBB110_3: ; %atomicrmw.global
16493 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
16494 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
16495 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
16496 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
16497 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
16498 ; GCN2-NEXT: .LBB110_4: ; %atomicrmw.start
16499 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
16500 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16501 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
16502 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
16503 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
16504 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
16505 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
16506 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
16507 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16508 ; GCN2-NEXT: buffer_wbinvl1_vol
16509 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
16510 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16511 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
16512 ; GCN2-NEXT: s_cbranch_execnz .LBB110_4
16513 ; GCN2-NEXT: ; %bb.5: ; %Flow
16514 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
16515 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
16516 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
16517 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16518 ; GCN2-NEXT: s_cbranch_execz .LBB110_2
16519 ; GCN2-NEXT: .LBB110_6: ; %atomicrmw.private
16520 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
16521 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
16522 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
16523 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16524 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
16525 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16526 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16527 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
16528 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
16529 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
16530 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
16531 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
16532 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16533 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16535 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset:
16537 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16538 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
16539 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
16540 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
16541 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
16542 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
16543 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
16544 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
16545 ; GCN3-NEXT: s_cbranch_execnz .LBB110_3
16546 ; GCN3-NEXT: ; %bb.1: ; %Flow3
16547 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16548 ; GCN3-NEXT: s_cbranch_execnz .LBB110_6
16549 ; GCN3-NEXT: .LBB110_2: ; %atomicrmw.phi
16550 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16551 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16552 ; GCN3-NEXT: .LBB110_3: ; %atomicrmw.global
16553 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
16554 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
16555 ; GCN3-NEXT: .LBB110_4: ; %atomicrmw.start
16556 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16557 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16558 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
16559 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
16560 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
16561 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
16562 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
16563 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
16564 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16565 ; GCN3-NEXT: buffer_wbinvl1_vol
16566 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
16567 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
16568 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
16569 ; GCN3-NEXT: s_cbranch_execnz .LBB110_4
16570 ; GCN3-NEXT: ; %bb.5: ; %Flow
16571 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
16572 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
16573 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
16574 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
16575 ; GCN3-NEXT: s_cbranch_execz .LBB110_2
16576 ; GCN3-NEXT: .LBB110_6: ; %atomicrmw.private
16577 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
16578 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
16579 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
16580 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
16581 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16582 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
16583 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
16584 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
16585 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
16586 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
16587 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
16588 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16589 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 in units of i64 is the byte offset 32 seen in the
; address computation of the expected output above.
16590 %gep = getelementptr i64, ptr %out, i64 4
16591 %result = atomicrmw umin ptr %gep, i64 %in seq_cst
; Test: seq_cst atomicrmw umin on an i64 through a flat pointer, in an
; amdgpu_gfx function declared void whose two arguments are both marked inreg.
;
; What the expected output below shows for every checked target:
;   * the pointer is read from s[4:5] and the operand from s[6:7];
;   * the path choice is a scalar compare (s_cmp_eq_u32 on s5 against the
;     aperture value) followed by s_cbranch_vcc* branches, not exec masking;
;   * %atomicrmw.global: the scalar address is copied into v[4:5] and a
;     flat_atomic_cmpswap_x2 retry loop runs until the swap returns the value
;     that was compared;
;   * %atomicrmw.private: the low address word (or -1 when the pointer is
;     null) is used as a buffer offset for a load, compare/select, store
;     sequence.
16595 define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
16596 ; GCN1-LABEL: flat_atomic_umin_i64_noret_scalar:
16598 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16599 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
16600 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
16601 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16602 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
16603 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
16604 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
16605 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
16606 ; GCN1-NEXT: s_cbranch_vccnz .LBB111_3
16607 ; GCN1-NEXT: ; %bb.1: ; %Flow3
16608 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
16609 ; GCN1-NEXT: s_cbranch_vccnz .LBB111_6
16610 ; GCN1-NEXT: .LBB111_2: ; %atomicrmw.phi
16611 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16612 ; GCN1-NEXT: .LBB111_3: ; %atomicrmw.global
16613 ; GCN1-NEXT: s_add_u32 s34, s4, 4
16614 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
16615 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
16616 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
16617 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
16618 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
16619 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
16620 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
16621 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
16622 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
16623 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
16624 ; GCN1-NEXT: .LBB111_4: ; %atomicrmw.start
16625 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
16626 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16627 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16628 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16629 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16630 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16631 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16632 ; GCN1-NEXT: buffer_wbinvl1_vol
16633 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16634 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
16635 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
16636 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
16637 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
16638 ; GCN1-NEXT: s_cbranch_execnz .LBB111_4
16639 ; GCN1-NEXT: ; %bb.5: ; %Flow
16640 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
16641 ; GCN1-NEXT: s_branch .LBB111_2
16642 ; GCN1-NEXT: .LBB111_6: ; %atomicrmw.private
16643 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
16644 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
16645 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
16646 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
16647 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
16648 ; GCN1-NEXT: s_add_i32 s34, s34, 4
16649 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
16650 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16651 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
16652 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
16653 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16654 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16655 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
16656 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
16657 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16658 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
16659 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16660 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16662 ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar:
16664 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16665 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
16666 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
16667 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
16668 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
16669 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
16670 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
16671 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
16672 ; GCN2-NEXT: s_cbranch_vccnz .LBB111_3
16673 ; GCN2-NEXT: ; %bb.1: ; %Flow3
16674 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
16675 ; GCN2-NEXT: s_cbranch_vccnz .LBB111_6
16676 ; GCN2-NEXT: .LBB111_2: ; %atomicrmw.phi
16677 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16678 ; GCN2-NEXT: .LBB111_3: ; %atomicrmw.global
16679 ; GCN2-NEXT: s_add_u32 s34, s4, 4
16680 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
16681 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
16682 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
16683 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
16684 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
16685 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
16686 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
16687 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
16688 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
16689 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
16690 ; GCN2-NEXT: .LBB111_4: ; %atomicrmw.start
16691 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
16692 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16693 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16694 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16695 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16696 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16697 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16698 ; GCN2-NEXT: buffer_wbinvl1_vol
16699 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16700 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
16701 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
16702 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
16703 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
16704 ; GCN2-NEXT: s_cbranch_execnz .LBB111_4
16705 ; GCN2-NEXT: ; %bb.5: ; %Flow
16706 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
16707 ; GCN2-NEXT: s_branch .LBB111_2
16708 ; GCN2-NEXT: .LBB111_6: ; %atomicrmw.private
16709 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
16710 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
16711 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
16712 ; GCN2-NEXT: s_add_i32 s34, s34, 4
16713 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
16714 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16715 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
16716 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
16717 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
16718 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16719 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16720 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
16721 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
16722 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16723 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
16724 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16725 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16727 ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar:
16729 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16730 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
16731 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
16732 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
16733 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
16734 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
16735 ; GCN3-NEXT: s_cbranch_vccnz .LBB111_3
16736 ; GCN3-NEXT: ; %bb.1: ; %Flow3
16737 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
16738 ; GCN3-NEXT: s_cbranch_vccnz .LBB111_6
16739 ; GCN3-NEXT: .LBB111_2: ; %atomicrmw.phi
16740 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16741 ; GCN3-NEXT: .LBB111_3: ; %atomicrmw.global
16742 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
16743 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
16744 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
16745 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
16746 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
16747 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
16748 ; GCN3-NEXT: .LBB111_4: ; %atomicrmw.start
16749 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16750 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16751 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16752 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16753 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16754 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16755 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16756 ; GCN3-NEXT: buffer_wbinvl1_vol
16757 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16758 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
16759 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
16760 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
16761 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
16762 ; GCN3-NEXT: s_cbranch_execnz .LBB111_4
16763 ; GCN3-NEXT: ; %bb.5: ; %Flow
16764 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
16765 ; GCN3-NEXT: s_branch .LBB111_2
16766 ; GCN3-NEXT: .LBB111_6: ; %atomicrmw.private
16767 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
16768 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
16769 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
16770 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16771 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
16772 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
16773 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
16774 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16775 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16776 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
16777 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16778 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16779 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
16780 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16781 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomicrmw operates directly on %ptr with no offset.
16782 %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Test: seq_cst atomicrmw umin on an i64 through a flat pointer plus a
; constant offset, in an amdgpu_gfx function declared void whose two
; arguments are both marked inreg.
;
; What the expected output below shows for every checked target:
;   * the address is formed with scalar adds (s[4:5] + 32 into s[34:35]) and
;     the operand is read from s[6:7];
;   * the path choice is a scalar compare of the high address word (s35)
;     against the aperture value, followed by s_cbranch_vcc* branches;
;   * %atomicrmw.global: the address is copied into v[4:5] and a
;     flat_atomic_cmpswap_x2 retry loop runs until the swap returns the value
;     that was compared;
;   * %atomicrmw.private: the low address word (or -1 when the address is
;     null) is used as a buffer offset for a load, compare/select, store
;     sequence.
16786 define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
16787 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
16789 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16790 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
16791 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
16792 ; GCN1-NEXT: s_add_u32 s34, s4, 32
16793 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
16794 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16795 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
16796 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
16797 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
16798 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
16799 ; GCN1-NEXT: s_cbranch_vccnz .LBB112_3
16800 ; GCN1-NEXT: ; %bb.1: ; %Flow3
16801 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
16802 ; GCN1-NEXT: s_cbranch_vccnz .LBB112_6
16803 ; GCN1-NEXT: .LBB112_2: ; %atomicrmw.phi
16804 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16805 ; GCN1-NEXT: .LBB112_3: ; %atomicrmw.global
16806 ; GCN1-NEXT: s_add_u32 s36, s34, 4
16807 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
16808 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
16809 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
16810 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
16811 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
16812 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
16813 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
16814 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
16815 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
16816 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
16817 ; GCN1-NEXT: .LBB112_4: ; %atomicrmw.start
16818 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
16819 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16820 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16821 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16822 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16823 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16824 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16825 ; GCN1-NEXT: buffer_wbinvl1_vol
16826 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16827 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
16828 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
16829 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
16830 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
16831 ; GCN1-NEXT: s_cbranch_execnz .LBB112_4
16832 ; GCN1-NEXT: ; %bb.5: ; %Flow
16833 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
16834 ; GCN1-NEXT: s_branch .LBB112_2
16835 ; GCN1-NEXT: .LBB112_6: ; %atomicrmw.private
16836 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
16837 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
16838 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
16839 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
16840 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
16841 ; GCN1-NEXT: s_add_i32 s34, s34, 4
16842 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
16843 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16844 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
16845 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
16846 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16847 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16848 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
16849 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
16850 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16851 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
16852 ; GCN1-NEXT: s_waitcnt vmcnt(0)
16853 ; GCN1-NEXT: s_setpc_b64 s[30:31]
16855 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
16857 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16858 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
16859 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
16860 ; GCN2-NEXT: s_add_u32 s34, s4, 32
16861 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
16862 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
16863 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
16864 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
16865 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
16866 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
16867 ; GCN2-NEXT: s_cbranch_vccnz .LBB112_3
16868 ; GCN2-NEXT: ; %bb.1: ; %Flow3
16869 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
16870 ; GCN2-NEXT: s_cbranch_vccnz .LBB112_6
16871 ; GCN2-NEXT: .LBB112_2: ; %atomicrmw.phi
16872 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16873 ; GCN2-NEXT: .LBB112_3: ; %atomicrmw.global
16874 ; GCN2-NEXT: s_add_u32 s36, s34, 4
16875 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
16876 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
16877 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
16878 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
16879 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
16880 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
16881 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
16882 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
16883 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
16884 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
16885 ; GCN2-NEXT: .LBB112_4: ; %atomicrmw.start
16886 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
16887 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16888 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16889 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16890 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16891 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16892 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16893 ; GCN2-NEXT: buffer_wbinvl1_vol
16894 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16895 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
16896 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
16897 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
16898 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
16899 ; GCN2-NEXT: s_cbranch_execnz .LBB112_4
16900 ; GCN2-NEXT: ; %bb.5: ; %Flow
16901 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
16902 ; GCN2-NEXT: s_branch .LBB112_2
16903 ; GCN2-NEXT: .LBB112_6: ; %atomicrmw.private
16904 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
16905 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
16906 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
16907 ; GCN2-NEXT: s_add_i32 s34, s34, 4
16908 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
16909 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16910 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
16911 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
16912 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
16913 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16914 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16915 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
16916 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
16917 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16918 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
16919 ; GCN2-NEXT: s_waitcnt vmcnt(0)
16920 ; GCN2-NEXT: s_setpc_b64 s[30:31]
16922 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
16924 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16925 ; GCN3-NEXT: s_add_u32 s34, s4, 32
16926 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
16927 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
16928 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
16929 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
16930 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
16931 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
16932 ; GCN3-NEXT: s_cbranch_vccnz .LBB112_3
16933 ; GCN3-NEXT: ; %bb.1: ; %Flow3
16934 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
16935 ; GCN3-NEXT: s_cbranch_vccnz .LBB112_6
16936 ; GCN3-NEXT: .LBB112_2: ; %atomicrmw.phi
16937 ; GCN3-NEXT: s_setpc_b64 s[30:31]
16938 ; GCN3-NEXT: .LBB112_3: ; %atomicrmw.global
16939 ; GCN3-NEXT: v_mov_b32_e32 v4, s34
16940 ; GCN3-NEXT: v_mov_b32_e32 v5, s35
16941 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
16942 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
16943 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
16944 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
16945 ; GCN3-NEXT: .LBB112_4: ; %atomicrmw.start
16946 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
16947 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16948 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
16949 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
16950 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
16951 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
16952 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16953 ; GCN3-NEXT: buffer_wbinvl1_vol
16954 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
16955 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
16956 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
16957 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
16958 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
16959 ; GCN3-NEXT: s_cbranch_execnz .LBB112_4
16960 ; GCN3-NEXT: ; %bb.5: ; %Flow
16961 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
16962 ; GCN3-NEXT: s_branch .LBB112_2
16963 ; GCN3-NEXT: .LBB112_6: ; %atomicrmw.private
16964 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
16965 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
16966 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
16967 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
16968 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
16969 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
16970 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
16971 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16972 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
16973 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
16974 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
16975 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
16976 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
16977 ; GCN3-NEXT: s_waitcnt vmcnt(0)
16978 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 in units of i64 is the byte offset 32 seen in the
; scalar address computation of the expected output above.
16979 %gep = getelementptr i64, ptr %out, i64 4
16980 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst
16984 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
16985 ; GCN1-LABEL: flat_atomic_umin_i64_ret_scalar:
16987 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16988 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
16989 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
16990 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
16991 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
16992 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
16993 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
16994 ; GCN1-NEXT: s_cbranch_vccz .LBB113_4
16995 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
16996 ; GCN1-NEXT: s_add_u32 s34, s4, 4
16997 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
16998 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
16999 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
17000 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
17001 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
17002 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
17003 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
17004 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
17005 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
17006 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
17007 ; GCN1-NEXT: .LBB113_2: ; %atomicrmw.start
17008 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17009 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17010 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
17011 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
17012 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17013 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17014 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17015 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17016 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17017 ; GCN1-NEXT: buffer_wbinvl1_vol
17018 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17019 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
17020 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
17021 ; GCN1-NEXT: s_cbranch_execnz .LBB113_2
17022 ; GCN1-NEXT: ; %bb.3: ; %Flow
17023 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
17024 ; GCN1-NEXT: s_branch .LBB113_6
17025 ; GCN1-NEXT: .LBB113_4:
17026 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17027 ; GCN1-NEXT: s_cbranch_execz .LBB113_6
17028 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
17029 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
17030 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
17031 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
17032 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
17033 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
17034 ; GCN1-NEXT: s_add_i32 s34, s34, 4
17035 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
17036 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17037 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
17038 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
17039 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17040 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17041 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
17042 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
17043 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
17044 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
17045 ; GCN1-NEXT: .LBB113_6: ; %atomicrmw.phi
17046 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17047 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17049 ; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar:
17051 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17052 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
17053 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
17054 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17055 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
17056 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
17057 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
17058 ; GCN2-NEXT: s_cbranch_vccz .LBB113_4
17059 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
17060 ; GCN2-NEXT: s_add_u32 s34, s4, 4
17061 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
17062 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
17063 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
17064 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
17065 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
17066 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
17067 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
17068 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
17069 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
17070 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
17071 ; GCN2-NEXT: .LBB113_2: ; %atomicrmw.start
17072 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17073 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17074 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
17075 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
17076 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17077 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17078 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17079 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17080 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17081 ; GCN2-NEXT: buffer_wbinvl1_vol
17082 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17083 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
17084 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
17085 ; GCN2-NEXT: s_cbranch_execnz .LBB113_2
17086 ; GCN2-NEXT: ; %bb.3: ; %Flow
17087 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
17088 ; GCN2-NEXT: s_branch .LBB113_6
17089 ; GCN2-NEXT: .LBB113_4:
17090 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
17091 ; GCN2-NEXT: s_cbranch_execz .LBB113_6
17092 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
17093 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
17094 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
17095 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
17096 ; GCN2-NEXT: s_add_i32 s34, s34, 4
17097 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
17098 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17099 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
17100 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
17101 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
17102 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17103 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17104 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
17105 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
17106 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
17107 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
17108 ; GCN2-NEXT: .LBB113_6: ; %atomicrmw.phi
17109 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17110 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17112 ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar:
17114 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17115 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
17116 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
17117 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
17118 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
17119 ; GCN3-NEXT: s_cbranch_vccz .LBB113_4
17120 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
17121 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
17122 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
17123 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
17124 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
17125 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
17126 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
17127 ; GCN3-NEXT: .LBB113_2: ; %atomicrmw.start
17128 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
17129 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17130 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
17131 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
17132 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17133 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17134 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17135 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17136 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17137 ; GCN3-NEXT: buffer_wbinvl1_vol
17138 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17139 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
17140 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
17141 ; GCN3-NEXT: s_cbranch_execnz .LBB113_2
17142 ; GCN3-NEXT: ; %bb.3: ; %Flow
17143 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
17144 ; GCN3-NEXT: s_branch .LBB113_6
17145 ; GCN3-NEXT: .LBB113_4:
17146 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
17147 ; GCN3-NEXT: s_cbranch_execz .LBB113_6
17148 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
17149 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
17150 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
17151 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
17152 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17153 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
17154 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
17155 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
17156 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17157 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17158 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
17159 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
17160 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
17161 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
17162 ; GCN3-NEXT: .LBB113_6: ; %atomicrmw.phi
17163 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17164 ; GCN3-NEXT: s_setpc_b64 s[30:31]
17165 %result = atomicrmw umin ptr %ptr, i64 %in seq_cst
; Codegen test for a 64-bit unsigned-min atomicrmw (seq_cst) on a flat pointer.
; Both the pointer and the value are inreg arguments under the amdgpu_gfx
; calling convention, and the atomicrmw result is named %result. The address
; is %out advanced by 4 i64 elements (32 bytes).
;
; Expected shape, for all three check prefixes (bonaire, tonga and gfx900 per
; the run lines at the top of the file):
;   * The high dword of the address is compared against a base value to pick
;     between the %atomicrmw.global and %atomicrmw.private blocks. On the
;     GCN1 and GCN2 targets that value comes from an s_load_dword through
;     address 0xe4 (presumably the private aperture base - TODO confirm); on
;     the GCN3 target it is the high half of src_private_base.
;   * %atomicrmw.global loads the current value and then loops on
;     flat_atomic_cmpswap_x2 until the returned value equals the expected one.
;   * %atomicrmw.private does buffer_load of both dwords, picks between the
;     loaded and incoming value with v_cmp_ge_u64 + v_cndmask, and writes the
;     result back with two buffer_store_dword instructions.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
17169 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
17170 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
17172 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17173 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
17174 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
17175 ; GCN1-NEXT: s_add_u32 s34, s4, 32
17176 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
17177 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17178 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
17179 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
17180 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
17181 ; GCN1-NEXT: s_cbranch_vccz .LBB114_4
17182 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
17183 ; GCN1-NEXT: s_add_u32 s36, s34, 4
17184 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
17185 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
17186 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
17187 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
17188 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
17189 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
17190 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
17191 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
17192 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
17193 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
17194 ; GCN1-NEXT: .LBB114_2: ; %atomicrmw.start
17195 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17196 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17197 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
17198 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
17199 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17200 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17201 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17202 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17203 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17204 ; GCN1-NEXT: buffer_wbinvl1_vol
17205 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17206 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
17207 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
17208 ; GCN1-NEXT: s_cbranch_execnz .LBB114_2
17209 ; GCN1-NEXT: ; %bb.3: ; %Flow
17210 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
17211 ; GCN1-NEXT: s_branch .LBB114_6
17212 ; GCN1-NEXT: .LBB114_4:
17213 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17214 ; GCN1-NEXT: s_cbranch_execz .LBB114_6
17215 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
17216 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
17217 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
17218 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
17219 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
17220 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
17221 ; GCN1-NEXT: s_add_i32 s34, s34, 4
17222 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
17223 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17224 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
17225 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
17226 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17227 ; GCN1-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17228 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
17229 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
17230 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
17231 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
17232 ; GCN1-NEXT: .LBB114_6: ; %atomicrmw.phi
17233 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17234 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17236 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
17238 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17239 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
17240 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
17241 ; GCN2-NEXT: s_add_u32 s34, s4, 32
17242 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
17243 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17244 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
17245 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
17246 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
17247 ; GCN2-NEXT: s_cbranch_vccz .LBB114_4
17248 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
17249 ; GCN2-NEXT: s_add_u32 s36, s34, 4
17250 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
17251 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
17252 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
17253 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
17254 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
17255 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
17256 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
17257 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
17258 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
17259 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
17260 ; GCN2-NEXT: .LBB114_2: ; %atomicrmw.start
17261 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17262 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17263 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
17264 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
17265 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17266 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17267 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17268 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17269 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17270 ; GCN2-NEXT: buffer_wbinvl1_vol
17271 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17272 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
17273 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
17274 ; GCN2-NEXT: s_cbranch_execnz .LBB114_2
17275 ; GCN2-NEXT: ; %bb.3: ; %Flow
17276 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
17277 ; GCN2-NEXT: s_branch .LBB114_6
17278 ; GCN2-NEXT: .LBB114_4:
17279 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
17280 ; GCN2-NEXT: s_cbranch_execz .LBB114_6
17281 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
17282 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
17283 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
17284 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
17285 ; GCN2-NEXT: s_add_i32 s34, s34, 4
17286 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
17287 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17288 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
17289 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
17290 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
17291 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17292 ; GCN2-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17293 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
17294 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
17295 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
17296 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
17297 ; GCN2-NEXT: .LBB114_6: ; %atomicrmw.phi
17298 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17299 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17301 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
17303 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17304 ; GCN3-NEXT: s_add_u32 s34, s4, 32
17305 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
17306 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
17307 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
17308 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
17309 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
17310 ; GCN3-NEXT: s_cbranch_vccz .LBB114_4
17311 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
17312 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
17313 ; GCN3-NEXT: v_mov_b32_e32 v3, s35
17314 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
17315 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
17316 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
17317 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
17318 ; GCN3-NEXT: .LBB114_2: ; %atomicrmw.start
17319 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
17320 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17321 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
17322 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
17323 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
17324 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
17325 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
17326 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
17327 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17328 ; GCN3-NEXT: buffer_wbinvl1_vol
17329 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17330 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
17331 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
17332 ; GCN3-NEXT: s_cbranch_execnz .LBB114_2
17333 ; GCN3-NEXT: ; %bb.3: ; %Flow
17334 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
17335 ; GCN3-NEXT: s_branch .LBB114_6
17336 ; GCN3-NEXT: .LBB114_4:
17337 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
17338 ; GCN3-NEXT: s_cbranch_execz .LBB114_6
17339 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
17340 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
17341 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
17342 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
17343 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
17344 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
17345 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
17346 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
17347 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17348 ; GCN3-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[0:1]
17349 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
17350 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
17351 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
17352 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
17353 ; GCN3-NEXT: .LBB114_6: ; %atomicrmw.phi
17354 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17355 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 on an i64 GEP is a 32-byte offset, which is the add of 32 seen at the
; start of each checked sequence above.
17356 %gep = getelementptr i64, ptr %out, i64 4
17357 %result = atomicrmw umin ptr %gep, i64 %in seq_cst
; Codegen test for a 64-bit unsigned-min atomicrmw (seq_cst) whose result is
; unused (void function, value named %tmp0) and which carries the
; !amdgpu.no.remote.memory metadata. The pointer and value arrive in VGPRs
; (v[0:1] and v[2:3] in the checks), and the address is %out advanced by 4 i64
; elements (32 bytes).
;
; Expected shape, for all three check prefixes (bonaire, tonga and gfx900 per
; the run lines at the top of the file):
;   * The high dword of the address is compared (v_cmp_ne_u32) against a base
;     value: loaded via s_load_dword through address 0xe4 on the GCN1 and GCN2
;     targets (presumably the private aperture base - TODO confirm), or the
;     high half of src_private_base on the GCN3 target.
;   * Because the address is in VGPRs, the exec mask is split with
;     s_and_saveexec_b64 / s_xor_b64 / s_andn2_saveexec_b64 so that both the
;     %atomicrmw.global and the %atomicrmw.private block can run.
;   * %atomicrmw.global is still expected to be a flat_atomic_cmpswap_x2 loop,
;     even with the metadata present.
;   * %atomicrmw.private does buffer_load of both dwords, picks between the
;     loaded and incoming value with v_cmp_le_u64 + v_cndmask, and writes the
;     result back with two buffer_store_dword instructions.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
17361 define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
17362 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
17364 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17365 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
17366 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
17367 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
17368 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
17369 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17370 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17371 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
17372 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17373 ; GCN1-NEXT: s_cbranch_execnz .LBB115_3
17374 ; GCN1-NEXT: ; %bb.1: ; %Flow3
17375 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17376 ; GCN1-NEXT: s_cbranch_execnz .LBB115_6
17377 ; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi
17378 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17379 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17380 ; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global
17381 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
17382 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17383 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
17384 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
17385 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
17386 ; GCN1-NEXT: .LBB115_4: ; %atomicrmw.start
17387 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17388 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17389 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
17390 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17391 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17392 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17393 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17394 ; GCN1-NEXT: buffer_wbinvl1_vol
17395 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17396 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
17397 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17398 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
17399 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
17400 ; GCN1-NEXT: s_cbranch_execnz .LBB115_4
17401 ; GCN1-NEXT: ; %bb.5: ; %Flow
17402 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
17403 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17404 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
17405 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17406 ; GCN1-NEXT: s_cbranch_execz .LBB115_2
17407 ; GCN1-NEXT: .LBB115_6: ; %atomicrmw.private
17408 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17409 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17410 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
17411 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17412 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17413 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17414 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17415 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17416 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17417 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17418 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
17419 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17420 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17421 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17423 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
17425 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17426 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
17427 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
17428 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
17429 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
17430 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17431 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17432 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
17433 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17434 ; GCN2-NEXT: s_cbranch_execnz .LBB115_3
17435 ; GCN2-NEXT: ; %bb.1: ; %Flow3
17436 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17437 ; GCN2-NEXT: s_cbranch_execnz .LBB115_6
17438 ; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi
17439 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17440 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17441 ; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global
17442 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
17443 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17444 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
17445 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
17446 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
17447 ; GCN2-NEXT: .LBB115_4: ; %atomicrmw.start
17448 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17449 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17450 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
17451 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17452 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17453 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17454 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17455 ; GCN2-NEXT: buffer_wbinvl1_vol
17456 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17457 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
17458 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17459 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
17460 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
17461 ; GCN2-NEXT: s_cbranch_execnz .LBB115_4
17462 ; GCN2-NEXT: ; %bb.5: ; %Flow
17463 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
17464 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
17465 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
17466 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17467 ; GCN2-NEXT: s_cbranch_execz .LBB115_2
17468 ; GCN2-NEXT: .LBB115_6: ; %atomicrmw.private
17469 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17470 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17471 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
17472 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17473 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17474 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17475 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17476 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17477 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17478 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17479 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
17480 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17481 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17482 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17484 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
17486 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17487 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
17488 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
17489 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
17490 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
17491 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
17492 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17493 ; GCN3-NEXT: s_cbranch_execnz .LBB115_3
17494 ; GCN3-NEXT: ; %bb.1: ; %Flow3
17495 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17496 ; GCN3-NEXT: s_cbranch_execnz .LBB115_6
17497 ; GCN3-NEXT: .LBB115_2: ; %atomicrmw.phi
17498 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17499 ; GCN3-NEXT: s_setpc_b64 s[30:31]
17500 ; GCN3-NEXT: .LBB115_3: ; %atomicrmw.global
17501 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
17502 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
17503 ; GCN3-NEXT: .LBB115_4: ; %atomicrmw.start
17504 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
17505 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17506 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
17507 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17508 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17509 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17510 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17511 ; GCN3-NEXT: buffer_wbinvl1_vol
17512 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17513 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
17514 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17515 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
17516 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
17517 ; GCN3-NEXT: s_cbranch_execnz .LBB115_4
17518 ; GCN3-NEXT: ; %bb.5: ; %Flow
17519 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
17520 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
17521 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
17522 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17523 ; GCN3-NEXT: s_cbranch_execz .LBB115_2
17524 ; GCN3-NEXT: .LBB115_6: ; %atomicrmw.private
17525 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17526 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17527 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17528 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
17529 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17530 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17531 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17532 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17533 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17534 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
17535 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17536 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17537 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 on an i64 GEP is a 32-byte offset, which is the add of 32 seen at the
; start of each checked sequence above.
17538 %gep = getelementptr i64, ptr %out, i64 4
17539 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Codegen test for a 64-bit unsigned-min atomicrmw (seq_cst) that carries the
; !amdgpu.no.remote.memory metadata, in a function declared to return i64 (the
; atomicrmw result is named %result). The pointer and value arrive in VGPRs
; (v[0:1] and v[2:3] in the checks), and the address is %out advanced by 4 i64
; elements (32 bytes).
;
; Expected shape, for all three check prefixes (bonaire, tonga and gfx900 per
; the run lines at the top of the file):
;   * The offset address is built in v[4:5] and its high dword is compared
;     (v_cmp_ne_u32) against a base value: loaded via s_load_dword through
;     address 0xe4 on the GCN1 and GCN2 targets (presumably the private
;     aperture base - TODO confirm), or the high half of src_private_base on
;     the GCN3 target.
;   * The exec mask is split with s_and_saveexec_b64 / s_xor_b64 /
;     s_andn2_saveexec_b64 so that both the %atomicrmw.global and the
;     %atomicrmw.private block can run.
;   * %atomicrmw.global is still expected to be a flat_atomic_cmpswap_x2 loop,
;     even with the metadata present; the value it returns lands in v[0:1].
;   * %atomicrmw.private loads the old value into v[0:1] with two
;     buffer_load_dword instructions, picks between it and the incoming value
;     with v_cmp_le_u64 + v_cndmask into v[2:3], and stores v[2:3] back, so
;     v[0:1] still holds the loaded value afterwards.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
17543 define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
17544 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
17546 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17547 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
17548 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
17549 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
17550 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17551 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17552 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
17553 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17554 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
17555 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17556 ; GCN1-NEXT: s_cbranch_execnz .LBB116_3
17557 ; GCN1-NEXT: ; %bb.1: ; %Flow3
17558 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17559 ; GCN1-NEXT: s_cbranch_execnz .LBB116_6
17560 ; GCN1-NEXT: .LBB116_2: ; %atomicrmw.phi
17561 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17562 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17563 ; GCN1-NEXT: .LBB116_3: ; %atomicrmw.global
17564 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
17565 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
17566 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
17567 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
17568 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
17569 ; GCN1-NEXT: .LBB116_4: ; %atomicrmw.start
17570 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17571 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17572 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
17573 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
17574 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
17575 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
17576 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
17577 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
17578 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17579 ; GCN1-NEXT: buffer_wbinvl1_vol
17580 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17581 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17582 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
17583 ; GCN1-NEXT: s_cbranch_execnz .LBB116_4
17584 ; GCN1-NEXT: ; %bb.5: ; %Flow
17585 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
17586 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
17587 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
17588 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17589 ; GCN1-NEXT: s_cbranch_execz .LBB116_2
17590 ; GCN1-NEXT: .LBB116_6: ; %atomicrmw.private
17591 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
17592 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
17593 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
17594 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17595 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17596 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17597 ; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17598 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
17599 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
17600 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
17601 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
17602 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17603 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17604 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17606 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
17608 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17609 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
17610 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
17611 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
17612 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17613 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17614 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
17615 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
17616 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
17617 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17618 ; GCN2-NEXT: s_cbranch_execnz .LBB116_3
17619 ; GCN2-NEXT: ; %bb.1: ; %Flow3
17620 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17621 ; GCN2-NEXT: s_cbranch_execnz .LBB116_6
17622 ; GCN2-NEXT: .LBB116_2: ; %atomicrmw.phi
17623 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17624 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17625 ; GCN2-NEXT: .LBB116_3: ; %atomicrmw.global
17626 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
17627 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
17628 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
17629 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
17630 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
17631 ; GCN2-NEXT: .LBB116_4: ; %atomicrmw.start
17632 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17633 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17634 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
17635 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
17636 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
17637 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
17638 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
17639 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
17640 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17641 ; GCN2-NEXT: buffer_wbinvl1_vol
17642 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17643 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17644 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
17645 ; GCN2-NEXT: s_cbranch_execnz .LBB116_4
17646 ; GCN2-NEXT: ; %bb.5: ; %Flow
17647 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
17648 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
17649 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
17650 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17651 ; GCN2-NEXT: s_cbranch_execz .LBB116_2
17652 ; GCN2-NEXT: .LBB116_6: ; %atomicrmw.private
17653 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
17654 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
17655 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
17656 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17657 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17658 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17659 ; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17660 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
17661 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
17662 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
17663 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
17664 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17665 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17666 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17668 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
17670 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17671 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
17672 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
17673 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
17674 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
17675 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
17676 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
17677 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17678 ; GCN3-NEXT: s_cbranch_execnz .LBB116_3
17679 ; GCN3-NEXT: ; %bb.1: ; %Flow3
17680 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17681 ; GCN3-NEXT: s_cbranch_execnz .LBB116_6
17682 ; GCN3-NEXT: .LBB116_2: ; %atomicrmw.phi
17683 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17684 ; GCN3-NEXT: s_setpc_b64 s[30:31]
17685 ; GCN3-NEXT: .LBB116_3: ; %atomicrmw.global
17686 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
17687 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
17688 ; GCN3-NEXT: .LBB116_4: ; %atomicrmw.start
17689 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
17690 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17691 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
17692 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
17693 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
17694 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
17695 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
17696 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
17697 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17698 ; GCN3-NEXT: buffer_wbinvl1_vol
17699 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
17700 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17701 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
17702 ; GCN3-NEXT: s_cbranch_execnz .LBB116_4
17703 ; GCN3-NEXT: ; %bb.5: ; %Flow
17704 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
17705 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
17706 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
17707 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17708 ; GCN3-NEXT: s_cbranch_execz .LBB116_2
17709 ; GCN3-NEXT: .LBB116_6: ; %atomicrmw.private
17710 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
17711 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
17712 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17713 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
17714 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17715 ; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
17716 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
17717 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
17718 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
17719 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
17720 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17721 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17722 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 on an i64 GEP is a 32-byte offset, which is the add of 32 seen at the
; start of each checked sequence above.
17723 %gep = getelementptr i64, ptr %out, i64 4
17724 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
17728 ; ---------------------------------------------------------------------
17730 ; ---------------------------------------------------------------------
; Signed 64-bit `atomicrmw min` (seq_cst) through a flat pointer with no
; offset; the function is declared to return void.
; For every check prefix the expected code compares the high dword of the
; address (v1) against a scalar and then runs one of two paths, labelled
; %atomicrmw.global and %atomicrmw.private in the output.
;  - global path is a flat_atomic_cmpswap_x2 retry loop whose new value is
;    picked with v_cmp_le_i64 / v_cndmask_b32.
;  - private path is two buffer_load_dword, the same compare/select, then
;    two buffer_store_dword.
; The GCN1 and GCN2 checks obtain that scalar with s_load_dword and fetch the
; initial value as two flat_load_dword, while the GCN3 checks use
; src_private_base and a single flat_load_dwordx2.
17732 define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
17733 ; GCN1-LABEL: flat_atomic_min_i64_noret:
17735 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17736 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
17737 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
17738 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17739 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17740 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
17741 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17742 ; GCN1-NEXT: s_cbranch_execnz .LBB117_3
17743 ; GCN1-NEXT: ; %bb.1: ; %Flow3
17744 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17745 ; GCN1-NEXT: s_cbranch_execnz .LBB117_6
17746 ; GCN1-NEXT: .LBB117_2: ; %atomicrmw.phi
17747 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17748 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17749 ; GCN1-NEXT: .LBB117_3: ; %atomicrmw.global
17750 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
17751 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17752 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
17753 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
17754 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
17755 ; GCN1-NEXT: .LBB117_4: ; %atomicrmw.start
17756 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17757 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17758 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
17759 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17760 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17761 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17762 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17763 ; GCN1-NEXT: buffer_wbinvl1_vol
17764 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17765 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
17766 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17767 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
17768 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
17769 ; GCN1-NEXT: s_cbranch_execnz .LBB117_4
17770 ; GCN1-NEXT: ; %bb.5: ; %Flow
17771 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
17772 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17773 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
17774 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17775 ; GCN1-NEXT: s_cbranch_execz .LBB117_2
17776 ; GCN1-NEXT: .LBB117_6: ; %atomicrmw.private
17777 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17778 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17779 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
17780 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17781 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17782 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17783 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
17784 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17785 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17786 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17787 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
17788 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17789 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17790 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17792 ; GCN2-LABEL: flat_atomic_min_i64_noret:
17794 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17795 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
17796 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
17797 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17798 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17799 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
17800 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17801 ; GCN2-NEXT: s_cbranch_execnz .LBB117_3
17802 ; GCN2-NEXT: ; %bb.1: ; %Flow3
17803 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17804 ; GCN2-NEXT: s_cbranch_execnz .LBB117_6
17805 ; GCN2-NEXT: .LBB117_2: ; %atomicrmw.phi
17806 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17807 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17808 ; GCN2-NEXT: .LBB117_3: ; %atomicrmw.global
17809 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
17810 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17811 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
17812 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
17813 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
17814 ; GCN2-NEXT: .LBB117_4: ; %atomicrmw.start
17815 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17816 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17817 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
17818 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17819 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17820 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17821 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17822 ; GCN2-NEXT: buffer_wbinvl1_vol
17823 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17824 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
17825 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17826 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
17827 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
17828 ; GCN2-NEXT: s_cbranch_execnz .LBB117_4
17829 ; GCN2-NEXT: ; %bb.5: ; %Flow
17830 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
17831 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
17832 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
17833 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17834 ; GCN2-NEXT: s_cbranch_execz .LBB117_2
17835 ; GCN2-NEXT: .LBB117_6: ; %atomicrmw.private
17836 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17837 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17838 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
17839 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17840 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17841 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17842 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
17843 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17844 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17845 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17846 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
17847 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17848 ; GCN2-NEXT: s_waitcnt vmcnt(0)
17849 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17851 ; GCN3-LABEL: flat_atomic_min_i64_noret:
17853 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17854 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
17855 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
17856 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
17857 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17858 ; GCN3-NEXT: s_cbranch_execnz .LBB117_3
17859 ; GCN3-NEXT: ; %bb.1: ; %Flow3
17860 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17861 ; GCN3-NEXT: s_cbranch_execnz .LBB117_6
17862 ; GCN3-NEXT: .LBB117_2: ; %atomicrmw.phi
17863 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17864 ; GCN3-NEXT: s_setpc_b64 s[30:31]
17865 ; GCN3-NEXT: .LBB117_3: ; %atomicrmw.global
17866 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
17867 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
17868 ; GCN3-NEXT: .LBB117_4: ; %atomicrmw.start
17869 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
17870 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17871 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
17872 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17873 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17874 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17875 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17876 ; GCN3-NEXT: buffer_wbinvl1_vol
17877 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17878 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
17879 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17880 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
17881 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
17882 ; GCN3-NEXT: s_cbranch_execnz .LBB117_4
17883 ; GCN3-NEXT: ; %bb.5: ; %Flow
17884 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
17885 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
17886 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
17887 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17888 ; GCN3-NEXT: s_cbranch_execz .LBB117_2
17889 ; GCN3-NEXT: .LBB117_6: ; %atomicrmw.private
17890 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17891 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17892 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17893 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
17894 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17895 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
17896 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17897 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17898 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17899 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
17900 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
17901 ; GCN3-NEXT: s_waitcnt vmcnt(0)
17902 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic operates directly on %ptr, with no metadata attached.
17903 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
; Same signed 64-bit seq_cst `atomicrmw min` as the no-offset variant, but the
; address is %out advanced by four i64 elements via getelementptr; the
; function is declared to return void.
; In the expected code for all three prefixes that shows up as an add of 32
; to v[0:1] before the high address dword is compared, after which the same
; two paths follow, labelled %atomicrmw.global (flat_atomic_cmpswap_x2 retry
; loop) and %atomicrmw.private (buffer load / compare / select / store).
17907 define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
17908 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset:
17910 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17911 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
17912 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
17913 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
17914 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
17915 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
17916 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17917 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
17918 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17919 ; GCN1-NEXT: s_cbranch_execnz .LBB118_3
17920 ; GCN1-NEXT: ; %bb.1: ; %Flow3
17921 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17922 ; GCN1-NEXT: s_cbranch_execnz .LBB118_6
17923 ; GCN1-NEXT: .LBB118_2: ; %atomicrmw.phi
17924 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17925 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17926 ; GCN1-NEXT: .LBB118_3: ; %atomicrmw.global
17927 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
17928 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17929 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
17930 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
17931 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
17932 ; GCN1-NEXT: .LBB118_4: ; %atomicrmw.start
17933 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
17934 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17935 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
17936 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17937 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17938 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
17939 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17940 ; GCN1-NEXT: buffer_wbinvl1_vol
17941 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
17942 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
17943 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
17944 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
17945 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
17946 ; GCN1-NEXT: s_cbranch_execnz .LBB118_4
17947 ; GCN1-NEXT: ; %bb.5: ; %Flow
17948 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
17949 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
17950 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
17951 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17952 ; GCN1-NEXT: s_cbranch_execz .LBB118_2
17953 ; GCN1-NEXT: .LBB118_6: ; %atomicrmw.private
17954 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
17955 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
17956 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
17957 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
17958 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
17959 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17960 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
17961 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
17962 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
17963 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
17964 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
17965 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
17966 ; GCN1-NEXT: s_waitcnt vmcnt(0)
17967 ; GCN1-NEXT: s_setpc_b64 s[30:31]
17969 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset:
17971 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17972 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
17973 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
17974 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
17975 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
17976 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
17977 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
17978 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
17979 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
17980 ; GCN2-NEXT: s_cbranch_execnz .LBB118_3
17981 ; GCN2-NEXT: ; %bb.1: ; %Flow3
17982 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
17983 ; GCN2-NEXT: s_cbranch_execnz .LBB118_6
17984 ; GCN2-NEXT: .LBB118_2: ; %atomicrmw.phi
17985 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
17986 ; GCN2-NEXT: s_setpc_b64 s[30:31]
17987 ; GCN2-NEXT: .LBB118_3: ; %atomicrmw.global
17988 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
17989 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
17990 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
17991 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
17992 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
17993 ; GCN2-NEXT: .LBB118_4: ; %atomicrmw.start
17994 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
17995 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17996 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
17997 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
17998 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
17999 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
18000 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18001 ; GCN2-NEXT: buffer_wbinvl1_vol
18002 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
18003 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
18004 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18005 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
18006 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
18007 ; GCN2-NEXT: s_cbranch_execnz .LBB118_4
18008 ; GCN2-NEXT: ; %bb.5: ; %Flow
18009 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
18010 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
18011 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
18012 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18013 ; GCN2-NEXT: s_cbranch_execz .LBB118_2
18014 ; GCN2-NEXT: .LBB118_6: ; %atomicrmw.private
18015 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
18016 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
18017 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
18018 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
18019 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
18020 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18021 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
18022 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
18023 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
18024 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
18025 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
18026 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
18027 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18028 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18030 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset:
18032 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18033 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
18034 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
18035 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
18036 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
18037 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
18038 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18039 ; GCN3-NEXT: s_cbranch_execnz .LBB118_3
18040 ; GCN3-NEXT: ; %bb.1: ; %Flow3
18041 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18042 ; GCN3-NEXT: s_cbranch_execnz .LBB118_6
18043 ; GCN3-NEXT: .LBB118_2: ; %atomicrmw.phi
18044 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
18045 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18046 ; GCN3-NEXT: .LBB118_3: ; %atomicrmw.global
18047 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
18048 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
18049 ; GCN3-NEXT: .LBB118_4: ; %atomicrmw.start
18050 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18051 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18052 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
18053 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
18054 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
18055 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
18056 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18057 ; GCN3-NEXT: buffer_wbinvl1_vol
18058 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
18059 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
18060 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18061 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
18062 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
18063 ; GCN3-NEXT: s_cbranch_execnz .LBB118_4
18064 ; GCN3-NEXT: ; %bb.5: ; %Flow
18065 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
18066 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
18067 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
18068 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18069 ; GCN3-NEXT: s_cbranch_execz .LBB118_2
18070 ; GCN3-NEXT: .LBB118_6: ; %atomicrmw.private
18071 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
18072 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
18073 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
18074 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
18075 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18076 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
18077 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
18078 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
18079 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
18080 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
18081 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
18082 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18083 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of an i64 array is a 32-byte offset from %out.
18084 %gep = getelementptr i64, ptr %out, i64 4
18085 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
; Signed 64-bit seq_cst `atomicrmw min` through a flat pointer with no offset,
; in a function declared to return i64.
; For every check prefix the expected code keeps the value read from memory
; in v[4:5] on both paths (the flat_atomic_cmpswap_x2 result in
; %atomicrmw.global, the buffer_load_dword pair in %atomicrmw.private) and
; copies it into v[0:1] in %atomicrmw.phi just before s_setpc_b64.
; The private path writes the selected minimum back with two
; buffer_store_dword while leaving v[4:5] untouched.
18089 define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
18090 ; GCN1-LABEL: flat_atomic_min_i64_ret:
18092 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18093 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
18094 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
18095 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
18096 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
18097 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
18098 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
18099 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18100 ; GCN1-NEXT: s_cbranch_execz .LBB119_4
18101 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
18102 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
18103 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
18104 ; GCN1-NEXT: flat_load_dword v5, v[4:5]
18105 ; GCN1-NEXT: flat_load_dword v4, v[0:1]
18106 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
18107 ; GCN1-NEXT: .LBB119_2: ; %atomicrmw.start
18108 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
18109 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18110 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
18111 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
18112 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
18113 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
18114 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
18115 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
18116 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18117 ; GCN1-NEXT: buffer_wbinvl1_vol
18118 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
18119 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18120 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
18121 ; GCN1-NEXT: s_cbranch_execnz .LBB119_2
18122 ; GCN1-NEXT: ; %bb.3: ; %Flow
18123 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
18124 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
18125 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
18126 ; GCN1-NEXT: .LBB119_4: ; %Flow3
18127 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18128 ; GCN1-NEXT: s_cbranch_execz .LBB119_6
18129 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
18130 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
18131 ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
18132 ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
18133 ; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
18134 ; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
18135 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18136 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3]
18137 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
18138 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
18139 ; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
18140 ; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
18141 ; GCN1-NEXT: .LBB119_6: ; %atomicrmw.phi
18142 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
18143 ; GCN1-NEXT: v_mov_b32_e32 v0, v4
18144 ; GCN1-NEXT: v_mov_b32_e32 v1, v5
18145 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18146 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18148 ; GCN2-LABEL: flat_atomic_min_i64_ret:
18150 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18151 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
18152 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
18153 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
18154 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
18155 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
18156 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
18157 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18158 ; GCN2-NEXT: s_cbranch_execz .LBB119_4
18159 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
18160 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
18161 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
18162 ; GCN2-NEXT: flat_load_dword v5, v[4:5]
18163 ; GCN2-NEXT: flat_load_dword v4, v[0:1]
18164 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
18165 ; GCN2-NEXT: .LBB119_2: ; %atomicrmw.start
18166 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
18167 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18168 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
18169 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
18170 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
18171 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
18172 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
18173 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
18174 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18175 ; GCN2-NEXT: buffer_wbinvl1_vol
18176 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
18177 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18178 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
18179 ; GCN2-NEXT: s_cbranch_execnz .LBB119_2
18180 ; GCN2-NEXT: ; %bb.3: ; %Flow
18181 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
18182 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
18183 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
18184 ; GCN2-NEXT: .LBB119_4: ; %Flow3
18185 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18186 ; GCN2-NEXT: s_cbranch_execz .LBB119_6
18187 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
18188 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
18189 ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
18190 ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
18191 ; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
18192 ; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
18193 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18194 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3]
18195 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
18196 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
18197 ; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
18198 ; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
18199 ; GCN2-NEXT: .LBB119_6: ; %atomicrmw.phi
18200 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
18201 ; GCN2-NEXT: v_mov_b32_e32 v0, v4
18202 ; GCN2-NEXT: v_mov_b32_e32 v1, v5
18203 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18204 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18206 ; GCN3-LABEL: flat_atomic_min_i64_ret:
18208 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18209 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
18210 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
18211 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
18212 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
18213 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18214 ; GCN3-NEXT: s_cbranch_execz .LBB119_4
18215 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
18216 ; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
18217 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
18218 ; GCN3-NEXT: .LBB119_2: ; %atomicrmw.start
18219 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18220 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18221 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
18222 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
18223 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
18224 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
18225 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
18226 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
18227 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18228 ; GCN3-NEXT: buffer_wbinvl1_vol
18229 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
18230 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18231 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
18232 ; GCN3-NEXT: s_cbranch_execnz .LBB119_2
18233 ; GCN3-NEXT: ; %bb.3: ; %Flow
18234 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
18235 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
18236 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
18237 ; GCN3-NEXT: .LBB119_4: ; %Flow3
18238 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18239 ; GCN3-NEXT: s_cbranch_execz .LBB119_6
18240 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
18241 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
18242 ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
18243 ; GCN3-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
18244 ; GCN3-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4
18245 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18246 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[4:5], v[2:3]
18247 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
18248 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
18249 ; GCN3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
18250 ; GCN3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
18251 ; GCN3-NEXT: .LBB119_6: ; %atomicrmw.phi
18252 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
18253 ; GCN3-NEXT: v_mov_b32_e32 v0, v4
18254 ; GCN3-NEXT: v_mov_b32_e32 v1, v5
18255 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18256 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: %result is the value the atomicrmw read from memory.
18257 %result = atomicrmw min ptr %ptr, i64 %in seq_cst
; Signed 64-bit seq_cst `atomicrmw min` on %out advanced by four i64 elements
; via getelementptr, in a function declared to return i64.
; For every check prefix the expected code forms the address in v[4:5] by
; adding 32 to v[0:1], which leaves v[0:1] free to receive the value read
; from memory on both paths (the flat_atomic_cmpswap_x2 result in
; %atomicrmw.global, the buffer_load_dword pair in %atomicrmw.private).
; No copy into v[0:1] is expected in %atomicrmw.phi, unlike the no-offset
; returning variant.
18261 define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
18262 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset:
18264 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18265 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
18266 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
18267 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
18268 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
18269 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
18270 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
18271 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
18272 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
18273 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18274 ; GCN1-NEXT: s_cbranch_execnz .LBB120_3
18275 ; GCN1-NEXT: ; %bb.1: ; %Flow3
18276 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18277 ; GCN1-NEXT: s_cbranch_execnz .LBB120_6
18278 ; GCN1-NEXT: .LBB120_2: ; %atomicrmw.phi
18279 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
18280 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18281 ; GCN1-NEXT: .LBB120_3: ; %atomicrmw.global
18282 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
18283 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
18284 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
18285 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
18286 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
18287 ; GCN1-NEXT: .LBB120_4: ; %atomicrmw.start
18288 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
18289 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18290 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
18291 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
18292 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
18293 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
18294 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
18295 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
18296 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18297 ; GCN1-NEXT: buffer_wbinvl1_vol
18298 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18299 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18300 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
18301 ; GCN1-NEXT: s_cbranch_execnz .LBB120_4
18302 ; GCN1-NEXT: ; %bb.5: ; %Flow
18303 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
18304 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
18305 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
18306 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18307 ; GCN1-NEXT: s_cbranch_execz .LBB120_2
18308 ; GCN1-NEXT: .LBB120_6: ; %atomicrmw.private
18309 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
18310 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
18311 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
18312 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
18313 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
18314 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18315 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
18316 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
18317 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
18318 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
18319 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
18320 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
18321 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18322 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18324 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset:
18326 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18327 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
18328 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
18329 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
18330 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
18331 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
18332 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
18333 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
18334 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
18335 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18336 ; GCN2-NEXT: s_cbranch_execnz .LBB120_3
18337 ; GCN2-NEXT: ; %bb.1: ; %Flow3
18338 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18339 ; GCN2-NEXT: s_cbranch_execnz .LBB120_6
18340 ; GCN2-NEXT: .LBB120_2: ; %atomicrmw.phi
18341 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
18342 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18343 ; GCN2-NEXT: .LBB120_3: ; %atomicrmw.global
18344 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
18345 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
18346 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
18347 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
18348 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
18349 ; GCN2-NEXT: .LBB120_4: ; %atomicrmw.start
18350 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
18351 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18352 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
18353 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
18354 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
18355 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
18356 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
18357 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
18358 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18359 ; GCN2-NEXT: buffer_wbinvl1_vol
18360 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18361 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18362 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
18363 ; GCN2-NEXT: s_cbranch_execnz .LBB120_4
18364 ; GCN2-NEXT: ; %bb.5: ; %Flow
18365 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
18366 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
18367 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
18368 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18369 ; GCN2-NEXT: s_cbranch_execz .LBB120_2
18370 ; GCN2-NEXT: .LBB120_6: ; %atomicrmw.private
18371 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
18372 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
18373 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
18374 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
18375 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
18376 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18377 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
18378 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
18379 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
18380 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
18381 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
18382 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
18383 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18384 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18386 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset:
18388 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18389 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
18390 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
18391 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
18392 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
18393 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
18394 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
18395 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
18396 ; GCN3-NEXT: s_cbranch_execnz .LBB120_3
18397 ; GCN3-NEXT: ; %bb.1: ; %Flow3
18398 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18399 ; GCN3-NEXT: s_cbranch_execnz .LBB120_6
18400 ; GCN3-NEXT: .LBB120_2: ; %atomicrmw.phi
18401 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
18402 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18403 ; GCN3-NEXT: .LBB120_3: ; %atomicrmw.global
18404 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
18405 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
18406 ; GCN3-NEXT: .LBB120_4: ; %atomicrmw.start
18407 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18408 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18409 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
18410 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
18411 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
18412 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
18413 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
18414 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
18415 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18416 ; GCN3-NEXT: buffer_wbinvl1_vol
18417 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18418 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
18419 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
18420 ; GCN3-NEXT: s_cbranch_execnz .LBB120_4
18421 ; GCN3-NEXT: ; %bb.5: ; %Flow
18422 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
18423 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
18424 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
18425 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
18426 ; GCN3-NEXT: s_cbranch_execz .LBB120_2
18427 ; GCN3-NEXT: .LBB120_6: ; %atomicrmw.private
18428 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
18429 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
18430 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
18431 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
18432 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18433 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
18434 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
18435 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
18436 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
18437 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
18438 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
18439 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18440 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: index 4 of an i64 array is a 32-byte offset from %out, and
; %result is the value the atomicrmw read from memory.
18441 %gep = getelementptr i64, ptr %out, i64 4
18442 %result = atomicrmw min ptr %gep, i64 %in seq_cst
; Codegen test for `atomicrmw min` (signed 64-bit minimum, seq_cst) on a flat
; pointer. Both the address and the operand are passed `inreg` under the
; amdgpu_gfx calling convention, and the function is declared to return void.
;
; The assertions under the GCN1, GCN2 and GCN3 prefixes belong to the three
; llc invocations at the top of the file (bonaire, tonga, gfx900). They are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
;
; Shape of the expected output for every prefix:
;   * the high dword of the pointer (s5) is compared against a value that is
;     loaded with s_load_dword (GCN1, GCN2) or read from src_private_base
;     (GCN3), and the result selects one of two paths;
;   * %atomicrmw.global - a flat_atomic_cmpswap_x2 loop that repeats until the
;     value returned by the cmpswap equals the value that was expected;
;   * %atomicrmw.private - buffer loads of both dwords, a v_cmp_ge_i64 plus
;     v_cndmask select against s[6:7], and buffer stores of the selected value.
18446 define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
18447 ; GCN1-LABEL: flat_atomic_min_i64_noret_scalar:
18449 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18450 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
18451 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
18452 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
18453 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
18454 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
18455 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18456 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
18457 ; GCN1-NEXT: s_cbranch_vccnz .LBB121_3
18458 ; GCN1-NEXT: ; %bb.1: ; %Flow3
18459 ; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
18460 ; GCN1-NEXT: s_cbranch_vccnz .LBB121_6
18461 ; GCN1-NEXT: .LBB121_2: ; %atomicrmw.phi
18462 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18463 ; GCN1-NEXT: .LBB121_3: ; %atomicrmw.global
18464 ; GCN1-NEXT: s_add_u32 s34, s4, 4
18465 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
18466 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
18467 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
18468 ; GCN1-NEXT: v_mov_b32_e32 v4, s4
18469 ; GCN1-NEXT: v_mov_b32_e32 v5, s5
18470 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
18471 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
18472 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
18473 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
18474 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
18475 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.start
18476 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
18477 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18478 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18479 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18480 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18481 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18482 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18483 ; GCN1-NEXT: buffer_wbinvl1_vol
18484 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18485 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
18486 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18487 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
18488 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
18489 ; GCN1-NEXT: s_cbranch_execnz .LBB121_4
18490 ; GCN1-NEXT: ; %bb.5: ; %Flow
18491 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
18492 ; GCN1-NEXT: s_branch .LBB121_2
18493 ; GCN1-NEXT: .LBB121_6: ; %atomicrmw.private
18494 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
18495 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
18496 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
18497 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
18498 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
18499 ; GCN1-NEXT: s_add_i32 s34, s34, 4
18500 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
18501 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18502 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18503 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
18504 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18505 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18506 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
18507 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
18508 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18509 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
18510 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18511 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18513 ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar:
18515 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18516 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
18517 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
18518 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
18519 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
18520 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
18521 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18522 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
18523 ; GCN2-NEXT: s_cbranch_vccnz .LBB121_3
18524 ; GCN2-NEXT: ; %bb.1: ; %Flow3
18525 ; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
18526 ; GCN2-NEXT: s_cbranch_vccnz .LBB121_6
18527 ; GCN2-NEXT: .LBB121_2: ; %atomicrmw.phi
18528 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18529 ; GCN2-NEXT: .LBB121_3: ; %atomicrmw.global
18530 ; GCN2-NEXT: s_add_u32 s34, s4, 4
18531 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
18532 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
18533 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
18534 ; GCN2-NEXT: v_mov_b32_e32 v4, s4
18535 ; GCN2-NEXT: v_mov_b32_e32 v5, s5
18536 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
18537 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
18538 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
18539 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
18540 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
18541 ; GCN2-NEXT: .LBB121_4: ; %atomicrmw.start
18542 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
18543 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18544 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18545 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18546 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18547 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18548 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18549 ; GCN2-NEXT: buffer_wbinvl1_vol
18550 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18551 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
18552 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18553 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
18554 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
18555 ; GCN2-NEXT: s_cbranch_execnz .LBB121_4
18556 ; GCN2-NEXT: ; %bb.5: ; %Flow
18557 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
18558 ; GCN2-NEXT: s_branch .LBB121_2
18559 ; GCN2-NEXT: .LBB121_6: ; %atomicrmw.private
18560 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
18561 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
18562 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
18563 ; GCN2-NEXT: s_add_i32 s34, s34, 4
18564 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
18565 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18566 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18567 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
18568 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
18569 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18570 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18571 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
18572 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
18573 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18574 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
18575 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18576 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18578 ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar:
18580 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18581 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
18582 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
18583 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
18584 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18585 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
18586 ; GCN3-NEXT: s_cbranch_vccnz .LBB121_3
18587 ; GCN3-NEXT: ; %bb.1: ; %Flow3
18588 ; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
18589 ; GCN3-NEXT: s_cbranch_vccnz .LBB121_6
18590 ; GCN3-NEXT: .LBB121_2: ; %atomicrmw.phi
18591 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18592 ; GCN3-NEXT: .LBB121_3: ; %atomicrmw.global
18593 ; GCN3-NEXT: v_mov_b32_e32 v4, s4
18594 ; GCN3-NEXT: v_mov_b32_e32 v5, s5
18595 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
18596 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
18597 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
18598 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
18599 ; GCN3-NEXT: .LBB121_4: ; %atomicrmw.start
18600 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18601 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18602 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18603 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18604 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18605 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18606 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18607 ; GCN3-NEXT: buffer_wbinvl1_vol
18608 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18609 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
18610 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18611 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
18612 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
18613 ; GCN3-NEXT: s_cbranch_execnz .LBB121_4
18614 ; GCN3-NEXT: ; %bb.5: ; %Flow
18615 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
18616 ; GCN3-NEXT: s_branch .LBB121_2
18617 ; GCN3-NEXT: .LBB121_6: ; %atomicrmw.private
18618 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
18619 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
18620 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
18621 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18622 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
18623 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
18624 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
18625 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18626 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18627 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
18628 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
18629 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18630 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
18631 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18632 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18633 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
; Codegen test for `atomicrmw min` (signed 64-bit minimum, seq_cst) on a flat
; pointer that is first advanced by four i64 elements with getelementptr.
; Both the base pointer and the operand are passed `inreg` under the
; amdgpu_gfx calling convention, and the function is declared to return void.
;
; The assertions under the GCN1, GCN2 and GCN3 prefixes belong to the three
; llc invocations at the top of the file (bonaire, tonga, gfx900). They are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
;
; Shape of the expected output for every prefix:
;   * the getelementptr shows up as s_add_u32/s_addc_u32 adding 32 to s[4:5],
;     with the sum kept in s[34:35];
;   * the high dword of that sum (s35) is compared against a value that is
;     loaded with s_load_dword (GCN1, GCN2) or read from src_private_base
;     (GCN3), and the result selects one of two paths;
;   * %atomicrmw.global - a flat_atomic_cmpswap_x2 loop that repeats until the
;     value returned by the cmpswap equals the value that was expected;
;   * %atomicrmw.private - buffer loads of both dwords, a v_cmp_ge_i64 plus
;     v_cndmask select against s[6:7], and buffer stores of the selected value.
18637 define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
18638 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset_scalar:
18640 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18641 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
18642 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
18643 ; GCN1-NEXT: s_add_u32 s34, s4, 32
18644 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
18645 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
18646 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
18647 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
18648 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
18649 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
18650 ; GCN1-NEXT: s_cbranch_vccnz .LBB122_3
18651 ; GCN1-NEXT: ; %bb.1: ; %Flow3
18652 ; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
18653 ; GCN1-NEXT: s_cbranch_vccnz .LBB122_6
18654 ; GCN1-NEXT: .LBB122_2: ; %atomicrmw.phi
18655 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18656 ; GCN1-NEXT: .LBB122_3: ; %atomicrmw.global
18657 ; GCN1-NEXT: s_add_u32 s36, s34, 4
18658 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
18659 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
18660 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
18661 ; GCN1-NEXT: v_mov_b32_e32 v4, s34
18662 ; GCN1-NEXT: v_mov_b32_e32 v5, s35
18663 ; GCN1-NEXT: flat_load_dword v3, v[0:1]
18664 ; GCN1-NEXT: flat_load_dword v2, v[4:5]
18665 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
18666 ; GCN1-NEXT: v_mov_b32_e32 v6, s7
18667 ; GCN1-NEXT: v_mov_b32_e32 v7, s6
18668 ; GCN1-NEXT: .LBB122_4: ; %atomicrmw.start
18669 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
18670 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18671 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18672 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18673 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18674 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18675 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18676 ; GCN1-NEXT: buffer_wbinvl1_vol
18677 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18678 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
18679 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
18680 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
18681 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
18682 ; GCN1-NEXT: s_cbranch_execnz .LBB122_4
18683 ; GCN1-NEXT: ; %bb.5: ; %Flow
18684 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
18685 ; GCN1-NEXT: s_branch .LBB122_2
18686 ; GCN1-NEXT: .LBB122_6: ; %atomicrmw.private
18687 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
18688 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
18689 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
18690 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
18691 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
18692 ; GCN1-NEXT: s_add_i32 s34, s34, 4
18693 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
18694 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18695 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18696 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
18697 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18698 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18699 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
18700 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
18701 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18702 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
18703 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18704 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18706 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar:
18708 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18709 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
18710 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
18711 ; GCN2-NEXT: s_add_u32 s34, s4, 32
18712 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
18713 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
18714 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
18715 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
18716 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
18717 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
18718 ; GCN2-NEXT: s_cbranch_vccnz .LBB122_3
18719 ; GCN2-NEXT: ; %bb.1: ; %Flow3
18720 ; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
18721 ; GCN2-NEXT: s_cbranch_vccnz .LBB122_6
18722 ; GCN2-NEXT: .LBB122_2: ; %atomicrmw.phi
18723 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18724 ; GCN2-NEXT: .LBB122_3: ; %atomicrmw.global
18725 ; GCN2-NEXT: s_add_u32 s36, s34, 4
18726 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
18727 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
18728 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
18729 ; GCN2-NEXT: v_mov_b32_e32 v4, s34
18730 ; GCN2-NEXT: v_mov_b32_e32 v5, s35
18731 ; GCN2-NEXT: flat_load_dword v3, v[0:1]
18732 ; GCN2-NEXT: flat_load_dword v2, v[4:5]
18733 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
18734 ; GCN2-NEXT: v_mov_b32_e32 v6, s7
18735 ; GCN2-NEXT: v_mov_b32_e32 v7, s6
18736 ; GCN2-NEXT: .LBB122_4: ; %atomicrmw.start
18737 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
18738 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18739 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18740 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18741 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18742 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18743 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18744 ; GCN2-NEXT: buffer_wbinvl1_vol
18745 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18746 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
18747 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
18748 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
18749 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
18750 ; GCN2-NEXT: s_cbranch_execnz .LBB122_4
18751 ; GCN2-NEXT: ; %bb.5: ; %Flow
18752 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
18753 ; GCN2-NEXT: s_branch .LBB122_2
18754 ; GCN2-NEXT: .LBB122_6: ; %atomicrmw.private
18755 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
18756 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
18757 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
18758 ; GCN2-NEXT: s_add_i32 s34, s34, 4
18759 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
18760 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18761 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18762 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
18763 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
18764 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18765 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18766 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
18767 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
18768 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18769 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
18770 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18771 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18773 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar:
18775 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18776 ; GCN3-NEXT: s_add_u32 s34, s4, 32
18777 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
18778 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
18779 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
18780 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
18781 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
18782 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
18783 ; GCN3-NEXT: s_cbranch_vccnz .LBB122_3
18784 ; GCN3-NEXT: ; %bb.1: ; %Flow3
18785 ; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
18786 ; GCN3-NEXT: s_cbranch_vccnz .LBB122_6
18787 ; GCN3-NEXT: .LBB122_2: ; %atomicrmw.phi
18788 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18789 ; GCN3-NEXT: .LBB122_3: ; %atomicrmw.global
18790 ; GCN3-NEXT: v_mov_b32_e32 v4, s34
18791 ; GCN3-NEXT: v_mov_b32_e32 v5, s35
18792 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
18793 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
18794 ; GCN3-NEXT: v_mov_b32_e32 v6, s7
18795 ; GCN3-NEXT: v_mov_b32_e32 v7, s6
18796 ; GCN3-NEXT: .LBB122_4: ; %atomicrmw.start
18797 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18798 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18799 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
18800 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
18801 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
18802 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
18803 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18804 ; GCN3-NEXT: buffer_wbinvl1_vol
18805 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
18806 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
18807 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
18808 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
18809 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
18810 ; GCN3-NEXT: s_cbranch_execnz .LBB122_4
18811 ; GCN3-NEXT: ; %bb.5: ; %Flow
18812 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
18813 ; GCN3-NEXT: s_branch .LBB122_2
18814 ; GCN3-NEXT: .LBB122_6: ; %atomicrmw.private
18815 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
18816 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
18817 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
18818 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18819 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
18820 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
18821 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
18822 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18823 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18824 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
18825 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
18826 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
18827 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
18828 ; GCN3-NEXT: s_waitcnt vmcnt(0)
18829 ; GCN3-NEXT: s_setpc_b64 s[30:31]
18830 %gep = getelementptr i64, ptr %out, i64 4
18831 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst
; Codegen test for `atomicrmw min` (signed 64-bit minimum, seq_cst) on a flat
; pointer. Both the address and the operand are passed `inreg` under the
; amdgpu_gfx calling convention. The function is declared to return i64 and
; the atomic's result is bound to %result.
;
; The assertions under the GCN1, GCN2 and GCN3 prefixes belong to the three
; llc invocations at the top of the file (bonaire, tonga, gfx900). They are
; autogenerated by utils/update_llc_test_checks.py; regenerate them with that
; script instead of editing them by hand.
;
; Shape of the expected output for every prefix:
;   * the high dword of the pointer (s5) is compared against a value that is
;     loaded with s_load_dword (GCN1, GCN2) or read from src_private_base
;     (GCN3), and the result selects one of two paths;
;   * %atomicrmw.global - a flat_atomic_cmpswap_x2 loop that copies the
;     previously seen value into v[8:9] each iteration and repeats until the
;     cmpswap result in v[0:1] equals it;
;   * %atomicrmw.private - buffer loads of both dwords into v0/v1, a
;     v_cmp_ge_i64 plus v_cndmask select against s[6:7] written to separate
;     registers, and buffer stores of the selected value; v0/v1 are not
;     overwritten after the loads.
;   Both paths reach %atomicrmw.phi with the loaded value in v[0:1]
;   (presumably the i64 return value - confirm against the calling convention).
18835 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
18836 ; GCN1-LABEL: flat_atomic_min_i64_ret_scalar:
18838 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18839 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
18840 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
18841 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
18842 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
18843 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
18844 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18845 ; GCN1-NEXT: s_cbranch_vccz .LBB123_4
18846 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
18847 ; GCN1-NEXT: s_add_u32 s34, s4, 4
18848 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
18849 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
18850 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
18851 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
18852 ; GCN1-NEXT: v_mov_b32_e32 v3, s5
18853 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
18854 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
18855 ; GCN1-NEXT: s_mov_b64 s[34:35], 0
18856 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
18857 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
18858 ; GCN1-NEXT: .LBB123_2: ; %atomicrmw.start
18859 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
18860 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18861 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
18862 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
18863 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
18864 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
18865 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
18866 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
18867 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18868 ; GCN1-NEXT: buffer_wbinvl1_vol
18869 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18870 ; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18871 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
18872 ; GCN1-NEXT: s_cbranch_execnz .LBB123_2
18873 ; GCN1-NEXT: ; %bb.3: ; %Flow
18874 ; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
18875 ; GCN1-NEXT: s_branch .LBB123_6
18876 ; GCN1-NEXT: .LBB123_4:
18877 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
18878 ; GCN1-NEXT: s_cbranch_execz .LBB123_6
18879 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
18880 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
18881 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
18882 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
18883 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
18884 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
18885 ; GCN1-NEXT: s_add_i32 s34, s34, 4
18886 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
18887 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18888 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18889 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
18890 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18891 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18892 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
18893 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
18894 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
18895 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
18896 ; GCN1-NEXT: .LBB123_6: ; %atomicrmw.phi
18897 ; GCN1-NEXT: s_waitcnt vmcnt(0)
18898 ; GCN1-NEXT: s_setpc_b64 s[30:31]
18900 ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar:
18902 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18903 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
18904 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
18905 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
18906 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
18907 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
18908 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18909 ; GCN2-NEXT: s_cbranch_vccz .LBB123_4
18910 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
18911 ; GCN2-NEXT: s_add_u32 s34, s4, 4
18912 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
18913 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
18914 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
18915 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
18916 ; GCN2-NEXT: v_mov_b32_e32 v3, s5
18917 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
18918 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
18919 ; GCN2-NEXT: s_mov_b64 s[34:35], 0
18920 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
18921 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
18922 ; GCN2-NEXT: .LBB123_2: ; %atomicrmw.start
18923 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
18924 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18925 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
18926 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
18927 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
18928 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
18929 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
18930 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
18931 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18932 ; GCN2-NEXT: buffer_wbinvl1_vol
18933 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18934 ; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18935 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
18936 ; GCN2-NEXT: s_cbranch_execnz .LBB123_2
18937 ; GCN2-NEXT: ; %bb.3: ; %Flow
18938 ; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
18939 ; GCN2-NEXT: s_branch .LBB123_6
18940 ; GCN2-NEXT: .LBB123_4:
18941 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
18942 ; GCN2-NEXT: s_cbranch_execz .LBB123_6
18943 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
18944 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
18945 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
18946 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
18947 ; GCN2-NEXT: s_add_i32 s34, s34, 4
18948 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
18949 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
18950 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
18951 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
18952 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
18953 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18954 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
18955 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
18956 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
18957 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
18958 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
18959 ; GCN2-NEXT: .LBB123_6: ; %atomicrmw.phi
18960 ; GCN2-NEXT: s_waitcnt vmcnt(0)
18961 ; GCN2-NEXT: s_setpc_b64 s[30:31]
18963 ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar:
18965 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18966 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
18967 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
18968 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
18969 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
18970 ; GCN3-NEXT: s_cbranch_vccz .LBB123_4
18971 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
18972 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
18973 ; GCN3-NEXT: v_mov_b32_e32 v3, s5
18974 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
18975 ; GCN3-NEXT: s_mov_b64 s[34:35], 0
18976 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
18977 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
18978 ; GCN3-NEXT: .LBB123_2: ; %atomicrmw.start
18979 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
18980 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18981 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
18982 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
18983 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
18984 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
18985 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
18986 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
18987 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18988 ; GCN3-NEXT: buffer_wbinvl1_vol
18989 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
18990 ; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
18991 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
18992 ; GCN3-NEXT: s_cbranch_execnz .LBB123_2
18993 ; GCN3-NEXT: ; %bb.3: ; %Flow
18994 ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
18995 ; GCN3-NEXT: s_branch .LBB123_6
18996 ; GCN3-NEXT: .LBB123_4:
18997 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
18998 ; GCN3-NEXT: s_cbranch_execz .LBB123_6
18999 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
19000 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
19001 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
19002 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
19003 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
19004 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
19005 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
19006 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
19007 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19008 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
19009 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
19010 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
19011 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
19012 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
19013 ; GCN3-NEXT: .LBB123_6: ; %atomicrmw.phi
19014 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19015 ; GCN3-NEXT: s_setpc_b64 s[30:31]
19016 %result = atomicrmw min ptr %ptr, i64 %in seq_cst
19020 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
19021 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset_scalar:
19023 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19024 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
19025 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
19026 ; GCN1-NEXT: s_add_u32 s34, s4, 32
19027 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
19028 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
19029 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
19030 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
19031 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
19032 ; GCN1-NEXT: s_cbranch_vccz .LBB124_4
19033 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
19034 ; GCN1-NEXT: s_add_u32 s36, s34, 4
19035 ; GCN1-NEXT: s_addc_u32 s37, s35, 0
19036 ; GCN1-NEXT: v_mov_b32_e32 v0, s36
19037 ; GCN1-NEXT: v_mov_b32_e32 v1, s37
19038 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
19039 ; GCN1-NEXT: v_mov_b32_e32 v3, s35
19040 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
19041 ; GCN1-NEXT: flat_load_dword v0, v[2:3]
19042 ; GCN1-NEXT: s_mov_b64 s[36:37], 0
19043 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
19044 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
19045 ; GCN1-NEXT: .LBB124_2: ; %atomicrmw.start
19046 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
19047 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19048 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
19049 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
19050 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
19051 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19052 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19053 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19054 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19055 ; GCN1-NEXT: buffer_wbinvl1_vol
19056 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19057 ; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
19058 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
19059 ; GCN1-NEXT: s_cbranch_execnz .LBB124_2
19060 ; GCN1-NEXT: ; %bb.3: ; %Flow
19061 ; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
19062 ; GCN1-NEXT: s_branch .LBB124_6
19063 ; GCN1-NEXT: .LBB124_4:
19064 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
19065 ; GCN1-NEXT: s_cbranch_execz .LBB124_6
19066 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
19067 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
19068 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
19069 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
19070 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
19071 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
19072 ; GCN1-NEXT: s_add_i32 s34, s34, 4
19073 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
19074 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
19075 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
19076 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
19077 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19078 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
19079 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19080 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19081 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
19082 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
19083 ; GCN1-NEXT: .LBB124_6: ; %atomicrmw.phi
19084 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19085 ; GCN1-NEXT: s_setpc_b64 s[30:31]
19087 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar:
19089 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19090 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
19091 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
19092 ; GCN2-NEXT: s_add_u32 s34, s4, 32
19093 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
19094 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
19095 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
19096 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
19097 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
19098 ; GCN2-NEXT: s_cbranch_vccz .LBB124_4
19099 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
19100 ; GCN2-NEXT: s_add_u32 s36, s34, 4
19101 ; GCN2-NEXT: s_addc_u32 s37, s35, 0
19102 ; GCN2-NEXT: v_mov_b32_e32 v0, s36
19103 ; GCN2-NEXT: v_mov_b32_e32 v1, s37
19104 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
19105 ; GCN2-NEXT: v_mov_b32_e32 v3, s35
19106 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
19107 ; GCN2-NEXT: flat_load_dword v0, v[2:3]
19108 ; GCN2-NEXT: s_mov_b64 s[36:37], 0
19109 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
19110 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
19111 ; GCN2-NEXT: .LBB124_2: ; %atomicrmw.start
19112 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
19113 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19114 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
19115 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
19116 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
19117 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19118 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19119 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19120 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19121 ; GCN2-NEXT: buffer_wbinvl1_vol
19122 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19123 ; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
19124 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
19125 ; GCN2-NEXT: s_cbranch_execnz .LBB124_2
19126 ; GCN2-NEXT: ; %bb.3: ; %Flow
19127 ; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
19128 ; GCN2-NEXT: s_branch .LBB124_6
19129 ; GCN2-NEXT: .LBB124_4:
19130 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
19131 ; GCN2-NEXT: s_cbranch_execz .LBB124_6
19132 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
19133 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
19134 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
19135 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
19136 ; GCN2-NEXT: s_add_i32 s34, s34, 4
19137 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
19138 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
19139 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
19140 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
19141 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
19142 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19143 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
19144 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19145 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19146 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
19147 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
19148 ; GCN2-NEXT: .LBB124_6: ; %atomicrmw.phi
19149 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19150 ; GCN2-NEXT: s_setpc_b64 s[30:31]
19152 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar:
19154 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19155 ; GCN3-NEXT: s_add_u32 s34, s4, 32
19156 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
19157 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
19158 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
19159 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
19160 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
19161 ; GCN3-NEXT: s_cbranch_vccz .LBB124_4
19162 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
19163 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
19164 ; GCN3-NEXT: v_mov_b32_e32 v3, s35
19165 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19166 ; GCN3-NEXT: s_mov_b64 s[36:37], 0
19167 ; GCN3-NEXT: v_mov_b32_e32 v4, s7
19168 ; GCN3-NEXT: v_mov_b32_e32 v5, s6
19169 ; GCN3-NEXT: .LBB124_2: ; %atomicrmw.start
19170 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
19171 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19172 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
19173 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
19174 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
19175 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19176 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19177 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19178 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19179 ; GCN3-NEXT: buffer_wbinvl1_vol
19180 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19181 ; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
19182 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
19183 ; GCN3-NEXT: s_cbranch_execnz .LBB124_2
19184 ; GCN3-NEXT: ; %bb.3: ; %Flow
19185 ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
19186 ; GCN3-NEXT: s_branch .LBB124_6
19187 ; GCN3-NEXT: .LBB124_4:
19188 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
19189 ; GCN3-NEXT: s_cbranch_execz .LBB124_6
19190 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
19191 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
19192 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
19193 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
19194 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
19195 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
19196 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
19197 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
19198 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19199 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[0:1]
19200 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
19201 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
19202 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
19203 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
19204 ; GCN3-NEXT: .LBB124_6: ; %atomicrmw.phi
19205 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19206 ; GCN3-NEXT: s_setpc_b64 s[30:31]
19207 %gep = getelementptr i64, ptr %out, i64 4
19208 %result = atomicrmw min ptr %gep, i64 %in seq_cst
; Kernel test: signed 64-bit `atomicrmw min` (seq_cst) through a flat pointer
; addressed as element (%index + 4) of %out; the fetched value has no visible use.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; For each of the three RUN targets (bonaire, tonga, gfx900) the expected code
; compares the high dword of the address against a scalar (src_private_base on
; gfx900; a dword loaded from memory on the other two, presumably the private
; aperture base -- TODO confirm) and then takes one of two paths:
;   * %atomicrmw.global / %atomicrmw.start: a flat_atomic_cmpswap_x2 retry loop;
;   * %atomicrmw.private: buffer loads, compare + select, buffer stores.
19212 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
19213 ; GCN1-LABEL: atomic_min_i64_addr64_offset:
19214 ; GCN1: ; %bb.0: ; %entry
19215 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
19216 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
19217 ; GCN1-NEXT: s_mov_b32 s14, -1
19218 ; GCN1-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
19219 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
19220 ; GCN1-NEXT: s_load_dword s8, s[4:5], 0x3f
19221 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
19222 ; GCN1-NEXT: s_add_u32 s12, s12, s11
19223 ; GCN1-NEXT: s_addc_u32 s13, s13, 0
19224 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
19225 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
19226 ; GCN1-NEXT: s_add_u32 s0, s0, s4
19227 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
19228 ; GCN1-NEXT: s_add_u32 s0, s0, 32
19229 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
19230 ; GCN1-NEXT: s_cmp_eq_u32 s1, s8
19231 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
19232 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19233 ; GCN1-NEXT: s_mov_b64 s[4:5], -1
19234 ; GCN1-NEXT: s_cbranch_vccnz .LBB125_3
19235 ; GCN1-NEXT: ; %bb.1: ; %Flow6
19236 ; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5]
19237 ; GCN1-NEXT: s_cbranch_vccnz .LBB125_6
19238 ; GCN1-NEXT: .LBB125_2: ; %atomicrmw.phi
19239 ; GCN1-NEXT: s_endpgm
19240 ; GCN1-NEXT: .LBB125_3: ; %atomicrmw.global
19241 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
19242 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
19243 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19244 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
19245 ; GCN1-NEXT: v_mov_b32_e32 v6, s3
19246 ; GCN1-NEXT: v_mov_b32_e32 v7, s2
19247 ; GCN1-NEXT: .LBB125_4: ; %atomicrmw.start
19248 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
19249 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19250 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19251 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19252 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19253 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19254 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19255 ; GCN1-NEXT: buffer_wbinvl1_vol
19256 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19257 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
19258 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19259 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
19260 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
19261 ; GCN1-NEXT: s_cbranch_execnz .LBB125_4
19262 ; GCN1-NEXT: ; %bb.5: ; %Flow
19263 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
19264 ; GCN1-NEXT: s_branch .LBB125_2
19265 ; GCN1-NEXT: .LBB125_6: ; %atomicrmw.private
19266 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
19267 ; GCN1-NEXT: v_mov_b32_e32 v5, s2
19268 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
19269 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
19270 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19271 ; GCN1-NEXT: s_add_i32 s0, s0, 4
19272 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
19273 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
19274 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
19275 ; GCN1-NEXT: v_mov_b32_e32 v4, s3
19276 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19277 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19278 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
19279 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
19280 ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
19281 ; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
19282 ; GCN1-NEXT: s_endpgm
19284 ; GCN2-LABEL: atomic_min_i64_addr64_offset:
19285 ; GCN2: ; %bb.0: ; %entry
19286 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
19287 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
19288 ; GCN2-NEXT: s_mov_b32 s90, -1
19289 ; GCN2-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
19290 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
19291 ; GCN2-NEXT: s_load_dword s8, s[4:5], 0xfc
19292 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
19293 ; GCN2-NEXT: s_add_u32 s88, s88, s11
19294 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
19295 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
19296 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
19297 ; GCN2-NEXT: s_add_u32 s0, s0, s4
19298 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
19299 ; GCN2-NEXT: s_add_u32 s0, s0, 32
19300 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
19301 ; GCN2-NEXT: s_cmp_eq_u32 s1, s8
19302 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
19303 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19304 ; GCN2-NEXT: s_mov_b64 s[4:5], -1
19305 ; GCN2-NEXT: s_cbranch_vccnz .LBB125_3
19306 ; GCN2-NEXT: ; %bb.1: ; %Flow6
19307 ; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5]
19308 ; GCN2-NEXT: s_cbranch_vccnz .LBB125_6
19309 ; GCN2-NEXT: .LBB125_2: ; %atomicrmw.phi
19310 ; GCN2-NEXT: s_endpgm
19311 ; GCN2-NEXT: .LBB125_3: ; %atomicrmw.global
19312 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
19313 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
19314 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19315 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
19316 ; GCN2-NEXT: v_mov_b32_e32 v6, s3
19317 ; GCN2-NEXT: v_mov_b32_e32 v7, s2
19318 ; GCN2-NEXT: .LBB125_4: ; %atomicrmw.start
19319 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
19320 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19321 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19322 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19323 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19324 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19325 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19326 ; GCN2-NEXT: buffer_wbinvl1_vol
19327 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19328 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
19329 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19330 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
19331 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
19332 ; GCN2-NEXT: s_cbranch_execnz .LBB125_4
19333 ; GCN2-NEXT: ; %bb.5: ; %Flow
19334 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
19335 ; GCN2-NEXT: s_branch .LBB125_2
19336 ; GCN2-NEXT: .LBB125_6: ; %atomicrmw.private
19337 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
19338 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
19339 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19340 ; GCN2-NEXT: s_add_i32 s0, s0, 4
19341 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
19342 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
19343 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
19344 ; GCN2-NEXT: v_mov_b32_e32 v5, s2
19345 ; GCN2-NEXT: v_mov_b32_e32 v4, s3
19346 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19347 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19348 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
19349 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
19350 ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
19351 ; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
19352 ; GCN2-NEXT: s_endpgm
19354 ; GCN3-LABEL: atomic_min_i64_addr64_offset:
19355 ; GCN3: ; %bb.0: ; %entry
19356 ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
19357 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
19358 ; GCN3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
19359 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
19360 ; GCN3-NEXT: s_mov_b32 s14, -1
19361 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000
19362 ; GCN3-NEXT: s_add_u32 s12, s12, s11
19363 ; GCN3-NEXT: s_addc_u32 s13, s13, 0
19364 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
19365 ; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
19366 ; GCN3-NEXT: s_add_u32 s0, s0, s6
19367 ; GCN3-NEXT: s_addc_u32 s1, s1, s7
19368 ; GCN3-NEXT: s_add_u32 s0, s0, 32
19369 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
19370 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
19371 ; GCN3-NEXT: s_cmp_eq_u32 s1, s5
19372 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0
19373 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19374 ; GCN3-NEXT: s_mov_b64 s[4:5], -1
19375 ; GCN3-NEXT: s_cbranch_vccnz .LBB125_3
19376 ; GCN3-NEXT: ; %bb.1: ; %Flow6
19377 ; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5]
19378 ; GCN3-NEXT: s_cbranch_vccnz .LBB125_6
19379 ; GCN3-NEXT: .LBB125_2: ; %atomicrmw.phi
19380 ; GCN3-NEXT: s_endpgm
19381 ; GCN3-NEXT: .LBB125_3: ; %atomicrmw.global
19382 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
19383 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
19384 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19385 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
19386 ; GCN3-NEXT: v_mov_b32_e32 v6, s3
19387 ; GCN3-NEXT: v_mov_b32_e32 v7, s2
19388 ; GCN3-NEXT: .LBB125_4: ; %atomicrmw.start
19389 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
19390 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19391 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19392 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19393 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19394 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19395 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19396 ; GCN3-NEXT: buffer_wbinvl1_vol
19397 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19398 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
19399 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19400 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
19401 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
19402 ; GCN3-NEXT: s_cbranch_execnz .LBB125_4
19403 ; GCN3-NEXT: ; %bb.5: ; %Flow
19404 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
19405 ; GCN3-NEXT: s_branch .LBB125_2
19406 ; GCN3-NEXT: .LBB125_6: ; %atomicrmw.private
19407 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
19408 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
19409 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
19410 ; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
19411 ; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
19412 ; GCN3-NEXT: v_mov_b32_e32 v4, s2
19413 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
19414 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19415 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19416 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
19417 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
19418 ; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
19419 ; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
19420 ; GCN3-NEXT: s_endpgm
19422 %ptr = getelementptr i64, ptr %out, i64 %index ; address of i64 element %index of %out
19423 %gep = getelementptr i64, ptr %ptr, i64 4 ; four more i64 elements (the checks add 32 bytes)
19424 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst ; signed min; %tmp0 has no visible use
; Kernel test: signed 64-bit `atomicrmw min` (seq_cst) through a flat pointer
; addressed as element (%index + 4) of %out, with the value returned by the
; atomic stored to %out2.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; For each of the three RUN targets (bonaire, tonga, gfx900) the expected code
; compares the high dword of the address against a scalar (src_private_base on
; gfx900; a dword loaded from memory on the other two, presumably the private
; aperture base -- TODO confirm), runs either the flat_atomic_cmpswap_x2 retry
; loop (%atomicrmw.global / %atomicrmw.start) or the buffer load / select /
; store sequence (%atomicrmw.private), and finally writes v[0:1] out with
; flat_store_dwordx2 in %atomicrmw.phi.
19428 define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
19429 ; GCN1-LABEL: atomic_min_i64_ret_addr64_offset:
19430 ; GCN1: ; %bb.0: ; %entry
19431 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
19432 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
19433 ; GCN1-NEXT: s_mov_b32 s18, -1
19434 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
19435 ; GCN1-NEXT: s_add_u32 s16, s16, s11
19436 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
19437 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
19438 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
19439 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
19440 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19441 ; GCN1-NEXT: s_add_u32 s0, s8, s0
19442 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
19443 ; GCN1-NEXT: s_add_u32 s0, s0, 32
19444 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
19445 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
19446 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
19447 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19448 ; GCN1-NEXT: s_cbranch_vccz .LBB126_4
19449 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
19450 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
19451 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19452 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19453 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
19454 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
19455 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
19456 ; GCN1-NEXT: .LBB126_2: ; %atomicrmw.start
19457 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
19458 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19459 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
19460 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
19461 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
19462 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19463 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19464 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19465 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19466 ; GCN1-NEXT: buffer_wbinvl1_vol
19467 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19468 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
19469 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
19470 ; GCN1-NEXT: s_cbranch_execnz .LBB126_2
19471 ; GCN1-NEXT: ; %bb.3: ; %Flow
19472 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
19473 ; GCN1-NEXT: s_branch .LBB126_6
19474 ; GCN1-NEXT: .LBB126_4:
19475 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
19476 ; GCN1-NEXT: s_cbranch_execz .LBB126_6
19477 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
19478 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
19479 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
19480 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
19481 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
19482 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19483 ; GCN1-NEXT: s_add_i32 s0, s0, 4
19484 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
19485 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
19486 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
19487 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
19488 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19489 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
19490 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19491 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19492 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
19493 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
19494 ; GCN1-NEXT: .LBB126_6: ; %atomicrmw.phi
19495 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
19496 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
19497 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
19498 ; GCN1-NEXT: s_endpgm
19500 ; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
19501 ; GCN2: ; %bb.0: ; %entry
19502 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
19503 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
19504 ; GCN2-NEXT: s_mov_b32 s90, -1
19505 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
19506 ; GCN2-NEXT: s_add_u32 s88, s88, s11
19507 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
19508 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
19509 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
19510 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
19511 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19512 ; GCN2-NEXT: s_add_u32 s0, s8, s0
19513 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
19514 ; GCN2-NEXT: s_add_u32 s0, s0, 32
19515 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
19516 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
19517 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
19518 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19519 ; GCN2-NEXT: s_cbranch_vccz .LBB126_4
19520 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
19521 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
19522 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19523 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19524 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
19525 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
19526 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
19527 ; GCN2-NEXT: .LBB126_2: ; %atomicrmw.start
19528 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
19529 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19530 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
19531 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
19532 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
19533 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19534 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19535 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19536 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19537 ; GCN2-NEXT: buffer_wbinvl1_vol
19538 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19539 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
19540 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
19541 ; GCN2-NEXT: s_cbranch_execnz .LBB126_2
19542 ; GCN2-NEXT: ; %bb.3: ; %Flow
19543 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
19544 ; GCN2-NEXT: s_branch .LBB126_6
19545 ; GCN2-NEXT: .LBB126_4:
19546 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
19547 ; GCN2-NEXT: s_cbranch_execz .LBB126_6
19548 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
19549 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
19550 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
19551 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19552 ; GCN2-NEXT: s_add_i32 s0, s0, 4
19553 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
19554 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
19555 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
19556 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
19557 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
19558 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19559 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
19560 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19561 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19562 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
19563 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
19564 ; GCN2-NEXT: .LBB126_6: ; %atomicrmw.phi
19565 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
19566 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
19567 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
19568 ; GCN2-NEXT: s_endpgm
19570 ; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
19571 ; GCN3: ; %bb.0: ; %entry
19572 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
19573 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
19574 ; GCN3-NEXT: s_mov_b32 s18, -1
19575 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
19576 ; GCN3-NEXT: s_add_u32 s16, s16, s11
19577 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
19578 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
19579 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
19580 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
19581 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19582 ; GCN3-NEXT: s_add_u32 s0, s8, s0
19583 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
19584 ; GCN3-NEXT: s_add_u32 s0, s0, 32
19585 ; GCN3-NEXT: s_addc_u32 s1, s1, 0
19586 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
19587 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
19588 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19589 ; GCN3-NEXT: s_cbranch_vccz .LBB126_4
19590 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
19591 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
19592 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
19593 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19594 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
19595 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
19596 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
19597 ; GCN3-NEXT: .LBB126_2: ; %atomicrmw.start
19598 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
19599 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19600 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
19601 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
19602 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
19603 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19604 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19605 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19606 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19607 ; GCN3-NEXT: buffer_wbinvl1_vol
19608 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19609 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
19610 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
19611 ; GCN3-NEXT: s_cbranch_execnz .LBB126_2
19612 ; GCN3-NEXT: ; %bb.3: ; %Flow
19613 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
19614 ; GCN3-NEXT: s_branch .LBB126_6
19615 ; GCN3-NEXT: .LBB126_4:
19616 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
19617 ; GCN3-NEXT: s_cbranch_execz .LBB126_6
19618 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
19619 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
19620 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
19621 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
19622 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
19623 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
19624 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
19625 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
19626 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19627 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
19628 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
19629 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
19630 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
19631 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
19632 ; GCN3-NEXT: .LBB126_6: ; %atomicrmw.phi
19633 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
19634 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
19635 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
19636 ; GCN3-NEXT: s_endpgm
19638 %ptr = getelementptr i64, ptr %out, i64 %index ; address of i64 element %index of %out
19639 %gep = getelementptr i64, ptr %ptr, i64 4 ; four more i64 elements (the checks add 32 bytes)
19640 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst ; signed min; %tmp0 is the value returned by the atomic
19641 store i64 %tmp0, ptr %out2 ; publish the returned value through %out2
; Kernel test: signed 64-bit `atomicrmw min` (seq_cst) applied directly to the
; flat pointer %out (no index, no constant offset); the fetched value has no
; visible use.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; For each of the three RUN targets (bonaire, tonga, gfx900) the expected code
; compares the high dword of %out against a scalar (src_private_base on
; gfx900; a dword loaded from memory on the other two, presumably the private
; aperture base -- TODO confirm) and then takes one of two paths:
;   * %atomicrmw.global / %atomicrmw.start: a flat_atomic_cmpswap_x2 retry loop;
;   * %atomicrmw.private: buffer loads, compare + select, buffer stores.
19645 define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
19646 ; GCN1-LABEL: atomic_min_i64:
19647 ; GCN1: ; %bb.0: ; %entry
19648 ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
19649 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
19650 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
19651 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x3d
19652 ; GCN1-NEXT: s_mov_b32 s14, -1
19653 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
19654 ; GCN1-NEXT: s_add_u32 s12, s12, s11
19655 ; GCN1-NEXT: s_addc_u32 s13, s13, 0
19656 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
19657 ; GCN1-NEXT: s_cmp_eq_u32 s1, s4
19658 ; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
19659 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19660 ; GCN1-NEXT: s_mov_b64 s[4:5], -1
19661 ; GCN1-NEXT: s_cbranch_vccnz .LBB127_3
19662 ; GCN1-NEXT: ; %bb.1: ; %Flow5
19663 ; GCN1-NEXT: s_and_b64 vcc, exec, s[4:5]
19664 ; GCN1-NEXT: s_cbranch_vccnz .LBB127_6
19665 ; GCN1-NEXT: .LBB127_2: ; %atomicrmw.phi
19666 ; GCN1-NEXT: s_endpgm
19667 ; GCN1-NEXT: .LBB127_3: ; %atomicrmw.global
19668 ; GCN1-NEXT: v_mov_b32_e32 v5, s1
19669 ; GCN1-NEXT: v_mov_b32_e32 v4, s0
19670 ; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19671 ; GCN1-NEXT: s_mov_b64 s[4:5], 0
19672 ; GCN1-NEXT: v_mov_b32_e32 v6, s3
19673 ; GCN1-NEXT: v_mov_b32_e32 v7, s2
19674 ; GCN1-NEXT: .LBB127_4: ; %atomicrmw.start
19675 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
19676 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19677 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19678 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19679 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19680 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19681 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19682 ; GCN1-NEXT: buffer_wbinvl1_vol
19683 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19684 ; GCN1-NEXT: v_mov_b32_e32 v3, v1
19685 ; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19686 ; GCN1-NEXT: v_mov_b32_e32 v2, v0
19687 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
19688 ; GCN1-NEXT: s_cbranch_execnz .LBB127_4
19689 ; GCN1-NEXT: ; %bb.5: ; %Flow
19690 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
19691 ; GCN1-NEXT: s_branch .LBB127_2
19692 ; GCN1-NEXT: .LBB127_6: ; %atomicrmw.private
19693 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
19694 ; GCN1-NEXT: v_mov_b32_e32 v5, s2
19695 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
19696 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
19697 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19698 ; GCN1-NEXT: s_add_i32 s0, s0, 4
19699 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
19700 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
19701 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
19702 ; GCN1-NEXT: v_mov_b32_e32 v4, s3
19703 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19704 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19705 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
19706 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
19707 ; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
19708 ; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
19709 ; GCN1-NEXT: s_endpgm
19711 ; GCN2-LABEL: atomic_min_i64:
19712 ; GCN2: ; %bb.0: ; %entry
19713 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
19714 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
19715 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
19716 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0xf4
19717 ; GCN2-NEXT: s_mov_b32 s90, -1
19718 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
19719 ; GCN2-NEXT: s_add_u32 s88, s88, s11
19720 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
19721 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
19722 ; GCN2-NEXT: s_cmp_eq_u32 s1, s4
19723 ; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
19724 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19725 ; GCN2-NEXT: s_mov_b64 s[4:5], -1
19726 ; GCN2-NEXT: s_cbranch_vccnz .LBB127_3
19727 ; GCN2-NEXT: ; %bb.1: ; %Flow5
19728 ; GCN2-NEXT: s_and_b64 vcc, exec, s[4:5]
19729 ; GCN2-NEXT: s_cbranch_vccnz .LBB127_6
19730 ; GCN2-NEXT: .LBB127_2: ; %atomicrmw.phi
19731 ; GCN2-NEXT: s_endpgm
19732 ; GCN2-NEXT: .LBB127_3: ; %atomicrmw.global
19733 ; GCN2-NEXT: v_mov_b32_e32 v5, s1
19734 ; GCN2-NEXT: v_mov_b32_e32 v4, s0
19735 ; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19736 ; GCN2-NEXT: s_mov_b64 s[4:5], 0
19737 ; GCN2-NEXT: v_mov_b32_e32 v6, s3
19738 ; GCN2-NEXT: v_mov_b32_e32 v7, s2
19739 ; GCN2-NEXT: .LBB127_4: ; %atomicrmw.start
19740 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
19741 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19742 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19743 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19744 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19745 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19746 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19747 ; GCN2-NEXT: buffer_wbinvl1_vol
19748 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19749 ; GCN2-NEXT: v_mov_b32_e32 v3, v1
19750 ; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19751 ; GCN2-NEXT: v_mov_b32_e32 v2, v0
19752 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
19753 ; GCN2-NEXT: s_cbranch_execnz .LBB127_4
19754 ; GCN2-NEXT: ; %bb.5: ; %Flow
19755 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
19756 ; GCN2-NEXT: s_branch .LBB127_2
19757 ; GCN2-NEXT: .LBB127_6: ; %atomicrmw.private
19758 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
19759 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
19760 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19761 ; GCN2-NEXT: s_add_i32 s0, s0, 4
19762 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
19763 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
19764 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
19765 ; GCN2-NEXT: v_mov_b32_e32 v5, s2
19766 ; GCN2-NEXT: v_mov_b32_e32 v4, s3
19767 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19768 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19769 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
19770 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
19771 ; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
19772 ; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
19773 ; GCN2-NEXT: s_endpgm
19775 ; GCN3-LABEL: atomic_min_i64:
19776 ; GCN3: ; %bb.0: ; %entry
19777 ; GCN3-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
19778 ; GCN3-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
19779 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
19780 ; GCN3-NEXT: s_mov_b32 s14, -1
19781 ; GCN3-NEXT: s_mov_b32 s15, 0xe00000
19782 ; GCN3-NEXT: s_add_u32 s12, s12, s11
19783 ; GCN3-NEXT: s_addc_u32 s13, s13, 0
19784 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
19785 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
19786 ; GCN3-NEXT: s_cmp_eq_u32 s1, s5
19787 ; GCN3-NEXT: s_cselect_b64 s[4:5], -1, 0
19788 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[4:5]
19789 ; GCN3-NEXT: s_mov_b64 s[4:5], -1
19790 ; GCN3-NEXT: s_cbranch_vccnz .LBB127_3
19791 ; GCN3-NEXT: ; %bb.1: ; %Flow5
19792 ; GCN3-NEXT: s_and_b64 vcc, exec, s[4:5]
19793 ; GCN3-NEXT: s_cbranch_vccnz .LBB127_6
19794 ; GCN3-NEXT: .LBB127_2: ; %atomicrmw.phi
19795 ; GCN3-NEXT: s_endpgm
19796 ; GCN3-NEXT: .LBB127_3: ; %atomicrmw.global
19797 ; GCN3-NEXT: v_mov_b32_e32 v5, s1
19798 ; GCN3-NEXT: v_mov_b32_e32 v4, s0
19799 ; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
19800 ; GCN3-NEXT: s_mov_b64 s[4:5], 0
19801 ; GCN3-NEXT: v_mov_b32_e32 v6, s3
19802 ; GCN3-NEXT: v_mov_b32_e32 v7, s2
19803 ; GCN3-NEXT: .LBB127_4: ; %atomicrmw.start
19804 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
19805 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19806 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
19807 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
19808 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
19809 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
19810 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19811 ; GCN3-NEXT: buffer_wbinvl1_vol
19812 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
19813 ; GCN3-NEXT: v_mov_b32_e32 v3, v1
19814 ; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
19815 ; GCN3-NEXT: v_mov_b32_e32 v2, v0
19816 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
19817 ; GCN3-NEXT: s_cbranch_execnz .LBB127_4
19818 ; GCN3-NEXT: ; %bb.5: ; %Flow
19819 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
19820 ; GCN3-NEXT: s_branch .LBB127_2
19821 ; GCN3-NEXT: .LBB127_6: ; %atomicrmw.private
19822 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
19823 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
19824 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
19825 ; GCN3-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
19826 ; GCN3-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
19827 ; GCN3-NEXT: v_mov_b32_e32 v4, s2
19828 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
19829 ; GCN3-NEXT: s_waitcnt vmcnt(0)
19830 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
19831 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
19832 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
19833 ; GCN3-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
19834 ; GCN3-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
19835 ; GCN3-NEXT: s_endpgm
19837 %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst ; signed min directly on %out; %tmp0 has no visible use
; Kernel test: seq_cst signed 64-bit atomicrmw min on a flat pointer formed by
; indexing %out with %index (i64 elements); the value produced by the atomic
; is then stored through %out2.
; For each of the GCN1 / GCN2 / GCN3 check prefixes the expected code tests the
; address and runs one of two paths that rejoin at %atomicrmw.phi:
;   %atomicrmw.global  - flat_atomic_cmpswap_x2 retry loop
;   %atomicrmw.private - scratch buffer load, compare/select, buffer store
; The assertions below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
19841 define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
19842 ; GCN1-LABEL: atomic_min_i64_ret_addr64:
19843 ; GCN1: ; %bb.0: ; %entry
19844 ; GCN1-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
19845 ; GCN1-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
19846 ; GCN1-NEXT: s_mov_b32 s18, -1
19847 ; GCN1-NEXT: s_mov_b32 s19, 0xe8f000
19848 ; GCN1-NEXT: s_add_u32 s16, s16, s11
19849 ; GCN1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9
19850 ; GCN1-NEXT: s_load_dword s2, s[4:5], 0x41
19851 ; GCN1-NEXT: s_addc_u32 s17, s17, 0
19852 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
19853 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19854 ; GCN1-NEXT: s_add_u32 s0, s8, s0
19855 ; GCN1-NEXT: s_addc_u32 s1, s9, s1
19856 ; GCN1-NEXT: s_cmp_eq_u32 s1, s2
19857 ; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0
19858 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19859 ; GCN1-NEXT: s_cbranch_vccz .LBB128_4
19860 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
19861 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
19862 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19863 ; GCN1-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19864 ; GCN1-NEXT: s_mov_b64 s[2:3], 0
19865 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
19866 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
19867 ; GCN1-NEXT: .LBB128_2: ; %atomicrmw.start
19868 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
19869 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19870 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
19871 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
19872 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
19873 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19874 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19875 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19876 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19877 ; GCN1-NEXT: buffer_wbinvl1_vol
19878 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19879 ; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
19880 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3]
19881 ; GCN1-NEXT: s_cbranch_execnz .LBB128_2
19882 ; GCN1-NEXT: ; %bb.3: ; %Flow
19883 ; GCN1-NEXT: s_or_b64 exec, exec, s[2:3]
19884 ; GCN1-NEXT: s_branch .LBB128_6
19885 ; GCN1-NEXT: .LBB128_4:
19886 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
19887 ; GCN1-NEXT: s_cbranch_execz .LBB128_6
19888 ; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
19889 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
19890 ; GCN1-NEXT: v_mov_b32_e32 v5, s12
19891 ; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec
19892 ; GCN1-NEXT: s_cselect_b32 s0, s0, -1
19893 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
19894 ; GCN1-NEXT: s_add_i32 s0, s0, 4
19895 ; GCN1-NEXT: v_mov_b32_e32 v3, s0
19896 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
19897 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
19898 ; GCN1-NEXT: v_mov_b32_e32 v4, s13
19899 ; GCN1-NEXT: s_waitcnt vmcnt(0)
19900 ; GCN1-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
19901 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19902 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19903 ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen
19904 ; GCN1-NEXT: buffer_store_dword v4, v3, s[16:19], 0 offen
19905 ; GCN1-NEXT: .LBB128_6: ; %atomicrmw.phi
19906 ; GCN1-NEXT: v_mov_b32_e32 v2, s10
19907 ; GCN1-NEXT: v_mov_b32_e32 v3, s11
19908 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
19909 ; GCN1-NEXT: s_endpgm
19911 ; GCN2-LABEL: atomic_min_i64_ret_addr64:
19912 ; GCN2: ; %bb.0: ; %entry
19913 ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
19914 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
19915 ; GCN2-NEXT: s_mov_b32 s90, -1
19916 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000
19917 ; GCN2-NEXT: s_add_u32 s88, s88, s11
19918 ; GCN2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
19919 ; GCN2-NEXT: s_load_dword s2, s[4:5], 0x104
19920 ; GCN2-NEXT: s_addc_u32 s89, s89, 0
19921 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
19922 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19923 ; GCN2-NEXT: s_add_u32 s0, s8, s0
19924 ; GCN2-NEXT: s_addc_u32 s1, s9, s1
19925 ; GCN2-NEXT: s_cmp_eq_u32 s1, s2
19926 ; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0
19927 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19928 ; GCN2-NEXT: s_cbranch_vccz .LBB128_4
19929 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
19930 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
19931 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19932 ; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
19933 ; GCN2-NEXT: s_mov_b64 s[2:3], 0
19934 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
19935 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
19936 ; GCN2-NEXT: .LBB128_2: ; %atomicrmw.start
19937 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
19938 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19939 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
19940 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
19941 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
19942 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
19943 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
19944 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
19945 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19946 ; GCN2-NEXT: buffer_wbinvl1_vol
19947 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
19948 ; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
19949 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3]
19950 ; GCN2-NEXT: s_cbranch_execnz .LBB128_2
19951 ; GCN2-NEXT: ; %bb.3: ; %Flow
19952 ; GCN2-NEXT: s_or_b64 exec, exec, s[2:3]
19953 ; GCN2-NEXT: s_branch .LBB128_6
19954 ; GCN2-NEXT: .LBB128_4:
19955 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
19956 ; GCN2-NEXT: s_cbranch_execz .LBB128_6
19957 ; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
19958 ; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0
19959 ; GCN2-NEXT: s_cselect_b32 s0, s0, -1
19960 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
19961 ; GCN2-NEXT: s_add_i32 s0, s0, 4
19962 ; GCN2-NEXT: v_mov_b32_e32 v3, s0
19963 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
19964 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
19965 ; GCN2-NEXT: v_mov_b32_e32 v5, s12
19966 ; GCN2-NEXT: v_mov_b32_e32 v4, s13
19967 ; GCN2-NEXT: s_waitcnt vmcnt(0)
19968 ; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
19969 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
19970 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
19971 ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen
19972 ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen
19973 ; GCN2-NEXT: .LBB128_6: ; %atomicrmw.phi
19974 ; GCN2-NEXT: v_mov_b32_e32 v2, s10
19975 ; GCN2-NEXT: v_mov_b32_e32 v3, s11
19976 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
19977 ; GCN2-NEXT: s_endpgm
19979 ; GCN3-LABEL: atomic_min_i64_ret_addr64:
19980 ; GCN3: ; %bb.0: ; %entry
19981 ; GCN3-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
19982 ; GCN3-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
19983 ; GCN3-NEXT: s_mov_b32 s18, -1
19984 ; GCN3-NEXT: s_mov_b32 s19, 0xe00000
19985 ; GCN3-NEXT: s_add_u32 s16, s16, s11
19986 ; GCN3-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
19987 ; GCN3-NEXT: s_addc_u32 s17, s17, 0
19988 ; GCN3-NEXT: s_mov_b64 s[2:3], src_private_base
19989 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
19990 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
19991 ; GCN3-NEXT: s_add_u32 s0, s8, s0
19992 ; GCN3-NEXT: s_addc_u32 s1, s9, s1
19993 ; GCN3-NEXT: s_cmp_eq_u32 s1, s3
19994 ; GCN3-NEXT: s_cselect_b64 s[2:3], -1, 0
19995 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[2:3]
19996 ; GCN3-NEXT: s_cbranch_vccz .LBB128_4
19997 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
19998 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
19999 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
20000 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
20001 ; GCN3-NEXT: s_mov_b64 s[2:3], 0
20002 ; GCN3-NEXT: v_mov_b32_e32 v4, s13
20003 ; GCN3-NEXT: v_mov_b32_e32 v5, s12
20004 ; GCN3-NEXT: .LBB128_2: ; %atomicrmw.start
20005 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
20006 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20007 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
20008 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
20009 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
20010 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
20011 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
20012 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
20013 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20014 ; GCN3-NEXT: buffer_wbinvl1_vol
20015 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
20016 ; GCN3-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
20017 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[2:3]
20018 ; GCN3-NEXT: s_cbranch_execnz .LBB128_2
20019 ; GCN3-NEXT: ; %bb.3: ; %Flow
20020 ; GCN3-NEXT: s_or_b64 exec, exec, s[2:3]
20021 ; GCN3-NEXT: s_branch .LBB128_6
20022 ; GCN3-NEXT: .LBB128_4:
20023 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20024 ; GCN3-NEXT: s_cbranch_execz .LBB128_6
20025 ; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
20026 ; GCN3-NEXT: s_cmp_lg_u64 s[0:1], 0
20027 ; GCN3-NEXT: s_cselect_b32 s0, s0, -1
20028 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
20029 ; GCN3-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
20030 ; GCN3-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen offset:4
20031 ; GCN3-NEXT: v_mov_b32_e32 v4, s12
20032 ; GCN3-NEXT: v_mov_b32_e32 v3, s13
20033 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20034 ; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[0:1]
20035 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
20036 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
20037 ; GCN3-NEXT: buffer_store_dword v4, v2, s[16:19], 0 offen
20038 ; GCN3-NEXT: buffer_store_dword v3, v2, s[16:19], 0 offen offset:4
20039 ; GCN3-NEXT: .LBB128_6: ; %atomicrmw.phi
20040 ; GCN3-NEXT: v_mov_b32_e32 v2, s10
20041 ; GCN3-NEXT: v_mov_b32_e32 v3, s11
20042 ; GCN3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
20043 ; GCN3-NEXT: s_endpgm
; IR under test: form &%out[%index], perform the seq_cst min, and keep the
; atomic's result live by storing it to %out2.
20045 %ptr = getelementptr i64, ptr %out, i64 %index
20046 %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst
20047 store i64 %tmp0, ptr %out2
; Test: seq_cst signed 64-bit atomicrmw min through a flat pointer at element
; offset 4 from %out (the checks add 32 to the address), with the
; !amdgpu.no.remote.memory annotation attached; %tmp0 is not used by any
; line of the body below.
; For the GCN1 / GCN2 / GCN3 check prefixes the expected code still takes
; either %atomicrmw.global (flat_atomic_cmpswap_x2 retry loop) or
; %atomicrmw.private (scratch buffer load, compare/select, buffer store),
; rejoining at %atomicrmw.phi.
; The assertions below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
20051 define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
20052 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
20054 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20055 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20056 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20057 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
20058 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
20059 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20060 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20061 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20062 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20063 ; GCN1-NEXT: s_cbranch_execnz .LBB129_3
20064 ; GCN1-NEXT: ; %bb.1: ; %Flow3
20065 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20066 ; GCN1-NEXT: s_cbranch_execnz .LBB129_6
20067 ; GCN1-NEXT: .LBB129_2: ; %atomicrmw.phi
20068 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20069 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20070 ; GCN1-NEXT: .LBB129_3: ; %atomicrmw.global
20071 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
20072 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20073 ; GCN1-NEXT: flat_load_dword v7, v[4:5]
20074 ; GCN1-NEXT: flat_load_dword v6, v[0:1]
20075 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
20076 ; GCN1-NEXT: .LBB129_4: ; %atomicrmw.start
20077 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
20078 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20079 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
20080 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
20081 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
20082 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
20083 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20084 ; GCN1-NEXT: buffer_wbinvl1_vol
20085 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
20086 ; GCN1-NEXT: v_mov_b32_e32 v7, v5
20087 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20088 ; GCN1-NEXT: v_mov_b32_e32 v6, v4
20089 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
20090 ; GCN1-NEXT: s_cbranch_execnz .LBB129_4
20091 ; GCN1-NEXT: ; %bb.5: ; %Flow
20092 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
20093 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20094 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20095 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20096 ; GCN1-NEXT: s_cbranch_execz .LBB129_2
20097 ; GCN1-NEXT: .LBB129_6: ; %atomicrmw.private
20098 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20099 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20100 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20101 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20102 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20103 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20104 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20105 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
20106 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
20107 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
20108 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
20109 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20110 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20111 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20113 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
20115 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20116 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20117 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20118 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
20119 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
20120 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20121 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20122 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20123 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20124 ; GCN2-NEXT: s_cbranch_execnz .LBB129_3
20125 ; GCN2-NEXT: ; %bb.1: ; %Flow3
20126 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20127 ; GCN2-NEXT: s_cbranch_execnz .LBB129_6
20128 ; GCN2-NEXT: .LBB129_2: ; %atomicrmw.phi
20129 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20130 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20131 ; GCN2-NEXT: .LBB129_3: ; %atomicrmw.global
20132 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
20133 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20134 ; GCN2-NEXT: flat_load_dword v7, v[4:5]
20135 ; GCN2-NEXT: flat_load_dword v6, v[0:1]
20136 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
20137 ; GCN2-NEXT: .LBB129_4: ; %atomicrmw.start
20138 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
20139 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20140 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
20141 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
20142 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
20143 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
20144 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20145 ; GCN2-NEXT: buffer_wbinvl1_vol
20146 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
20147 ; GCN2-NEXT: v_mov_b32_e32 v7, v5
20148 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20149 ; GCN2-NEXT: v_mov_b32_e32 v6, v4
20150 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
20151 ; GCN2-NEXT: s_cbranch_execnz .LBB129_4
20152 ; GCN2-NEXT: ; %bb.5: ; %Flow
20153 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
20154 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20155 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20156 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20157 ; GCN2-NEXT: s_cbranch_execz .LBB129_2
20158 ; GCN2-NEXT: .LBB129_6: ; %atomicrmw.private
20159 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20160 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20161 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20162 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20163 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20164 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20165 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20166 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
20167 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
20168 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
20169 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
20170 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20171 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20172 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20174 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
20176 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20177 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
20178 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
20179 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20180 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
20181 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20182 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20183 ; GCN3-NEXT: s_cbranch_execnz .LBB129_3
20184 ; GCN3-NEXT: ; %bb.1: ; %Flow3
20185 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20186 ; GCN3-NEXT: s_cbranch_execnz .LBB129_6
20187 ; GCN3-NEXT: .LBB129_2: ; %atomicrmw.phi
20188 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20189 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20190 ; GCN3-NEXT: .LBB129_3: ; %atomicrmw.global
20191 ; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
20192 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
20193 ; GCN3-NEXT: .LBB129_4: ; %atomicrmw.start
20194 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
20195 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20196 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
20197 ; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
20198 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
20199 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
20200 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20201 ; GCN3-NEXT: buffer_wbinvl1_vol
20202 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
20203 ; GCN3-NEXT: v_mov_b32_e32 v7, v5
20204 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20205 ; GCN3-NEXT: v_mov_b32_e32 v6, v4
20206 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
20207 ; GCN3-NEXT: s_cbranch_execnz .LBB129_4
20208 ; GCN3-NEXT: ; %bb.5: ; %Flow
20209 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
20210 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20211 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20212 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20213 ; GCN3-NEXT: s_cbranch_execz .LBB129_2
20214 ; GCN3-NEXT: .LBB129_6: ; %atomicrmw.private
20215 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20216 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20217 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20218 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20219 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20220 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20221 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
20222 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
20223 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
20224 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
20225 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20226 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20227 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: 4 x i64 = 32-byte offset from %out, then the annotated min.
; The !0 metadata node is defined outside this function.
20228 %gep = getelementptr i64, ptr %out, i64 4
20229 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Test: value-producing variant of the annotated min test. A seq_cst signed
; 64-bit atomicrmw min is done through a flat pointer at element offset 4
; from %out (the checks add 32 to the address) with
; !amdgpu.no.remote.memory attached; the function is declared to return i64.
; For the GCN1 / GCN2 / GCN3 check prefixes the expected code takes either
; %atomicrmw.global (flat_atomic_cmpswap_x2 retry loop writing v[0:1]) or
; %atomicrmw.private (scratch buffer load into v0/v1, compare/select, buffer
; store), rejoining at %atomicrmw.phi.
; The assertions below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
20233 define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
20234 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
20236 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20237 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20238 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20239 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
20240 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20241 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20242 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20243 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20244 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20245 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20246 ; GCN1-NEXT: s_cbranch_execnz .LBB130_3
20247 ; GCN1-NEXT: ; %bb.1: ; %Flow3
20248 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20249 ; GCN1-NEXT: s_cbranch_execnz .LBB130_6
20250 ; GCN1-NEXT: .LBB130_2: ; %atomicrmw.phi
20251 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20252 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20253 ; GCN1-NEXT: .LBB130_3: ; %atomicrmw.global
20254 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
20255 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
20256 ; GCN1-NEXT: flat_load_dword v1, v[0:1]
20257 ; GCN1-NEXT: flat_load_dword v0, v[4:5]
20258 ; GCN1-NEXT: s_mov_b64 s[6:7], 0
20259 ; GCN1-NEXT: .LBB130_4: ; %atomicrmw.start
20260 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
20261 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20262 ; GCN1-NEXT: v_mov_b32_e32 v9, v1
20263 ; GCN1-NEXT: v_mov_b32_e32 v8, v0
20264 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
20265 ; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
20266 ; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
20267 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
20268 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20269 ; GCN1-NEXT: buffer_wbinvl1_vol
20270 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
20271 ; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20272 ; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
20273 ; GCN1-NEXT: s_cbranch_execnz .LBB130_4
20274 ; GCN1-NEXT: ; %bb.5: ; %Flow
20275 ; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
20276 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
20277 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20278 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20279 ; GCN1-NEXT: s_cbranch_execz .LBB130_2
20280 ; GCN1-NEXT: .LBB130_6: ; %atomicrmw.private
20281 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20282 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20283 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20284 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20285 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20286 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20287 ; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20288 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
20289 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
20290 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
20291 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
20292 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20293 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20294 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20296 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
20298 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20299 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20300 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20301 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
20302 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20303 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20304 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20305 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20306 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20307 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20308 ; GCN2-NEXT: s_cbranch_execnz .LBB130_3
20309 ; GCN2-NEXT: ; %bb.1: ; %Flow3
20310 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20311 ; GCN2-NEXT: s_cbranch_execnz .LBB130_6
20312 ; GCN2-NEXT: .LBB130_2: ; %atomicrmw.phi
20313 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20314 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20315 ; GCN2-NEXT: .LBB130_3: ; %atomicrmw.global
20316 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
20317 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
20318 ; GCN2-NEXT: flat_load_dword v1, v[0:1]
20319 ; GCN2-NEXT: flat_load_dword v0, v[4:5]
20320 ; GCN2-NEXT: s_mov_b64 s[6:7], 0
20321 ; GCN2-NEXT: .LBB130_4: ; %atomicrmw.start
20322 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
20323 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20324 ; GCN2-NEXT: v_mov_b32_e32 v9, v1
20325 ; GCN2-NEXT: v_mov_b32_e32 v8, v0
20326 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
20327 ; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
20328 ; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
20329 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
20330 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20331 ; GCN2-NEXT: buffer_wbinvl1_vol
20332 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
20333 ; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20334 ; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
20335 ; GCN2-NEXT: s_cbranch_execnz .LBB130_4
20336 ; GCN2-NEXT: ; %bb.5: ; %Flow
20337 ; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
20338 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
20339 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20340 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20341 ; GCN2-NEXT: s_cbranch_execz .LBB130_2
20342 ; GCN2-NEXT: .LBB130_6: ; %atomicrmw.private
20343 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20344 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20345 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20346 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20347 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20348 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20349 ; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20350 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
20351 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
20352 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
20353 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
20354 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20355 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20356 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20358 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
20360 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20361 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
20362 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
20363 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20364 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
20365 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20366 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20367 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20368 ; GCN3-NEXT: s_cbranch_execnz .LBB130_3
20369 ; GCN3-NEXT: ; %bb.1: ; %Flow3
20370 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20371 ; GCN3-NEXT: s_cbranch_execnz .LBB130_6
20372 ; GCN3-NEXT: .LBB130_2: ; %atomicrmw.phi
20373 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20374 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20375 ; GCN3-NEXT: .LBB130_3: ; %atomicrmw.global
20376 ; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
20377 ; GCN3-NEXT: s_mov_b64 s[6:7], 0
20378 ; GCN3-NEXT: .LBB130_4: ; %atomicrmw.start
20379 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
20380 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20381 ; GCN3-NEXT: v_mov_b32_e32 v9, v1
20382 ; GCN3-NEXT: v_mov_b32_e32 v8, v0
20383 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
20384 ; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
20385 ; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
20386 ; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
20387 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20388 ; GCN3-NEXT: buffer_wbinvl1_vol
20389 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
20390 ; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
20391 ; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
20392 ; GCN3-NEXT: s_cbranch_execnz .LBB130_4
20393 ; GCN3-NEXT: ; %bb.5: ; %Flow
20394 ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
20395 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
20396 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20397 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20398 ; GCN3-NEXT: s_cbranch_execz .LBB130_2
20399 ; GCN3-NEXT: .LBB130_6: ; %atomicrmw.private
20400 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20401 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20402 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20403 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20404 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20405 ; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
20406 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
20407 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
20408 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
20409 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
20410 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20411 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20412 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: 4 x i64 = 32-byte offset from %out, then the annotated min
; whose result is named %result. The !0 metadata node is defined outside this
; function.
20413 %gep = getelementptr i64, ptr %out, i64 4
20414 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
20418 ; ---------------------------------------------------------------------
20419 ; atomicrmw uinc_wrap
20420 ; ---------------------------------------------------------------------
; Test: seq_cst 64-bit atomicrmw uinc_wrap directly on the flat pointer %ptr
; with wrap bound %in; %tmp0 is not used by any line of the body below.
; For the GCN1 / GCN2 / GCN3 check prefixes the expected code takes either
;   %atomicrmw.global  - a single flat_atomic_inc_x2, or
;   %atomicrmw.private - scratch buffer load, add 1, unsigned compare of the
;                        loaded value against %in (v_cmp_lt_u64), select of
;                        the incremented value or 0, buffer store,
; rejoining at %atomicrmw.phi.
; The assertions below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
20422 define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
20423 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret:
20425 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20426 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20427 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20428 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20429 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20430 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20431 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20432 ; GCN1-NEXT: s_cbranch_execnz .LBB131_3
20433 ; GCN1-NEXT: ; %bb.1: ; %Flow
20434 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20435 ; GCN1-NEXT: s_cbranch_execnz .LBB131_4
20436 ; GCN1-NEXT: .LBB131_2: ; %atomicrmw.phi
20437 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20438 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20439 ; GCN1-NEXT: .LBB131_3: ; %atomicrmw.global
20440 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20441 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20442 ; GCN1-NEXT: buffer_wbinvl1_vol
20443 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20444 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20445 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20446 ; GCN1-NEXT: s_cbranch_execz .LBB131_2
20447 ; GCN1-NEXT: .LBB131_4: ; %atomicrmw.private
20448 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20449 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20450 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20451 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20452 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20453 ; GCN1-NEXT: s_waitcnt vmcnt(1)
20454 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
20455 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20456 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20457 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20458 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
20459 ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
20460 ; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20461 ; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
20462 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20463 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20464 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20466 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret:
20468 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20469 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20470 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20471 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20472 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20473 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20474 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20475 ; GCN2-NEXT: s_cbranch_execnz .LBB131_3
20476 ; GCN2-NEXT: ; %bb.1: ; %Flow
20477 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20478 ; GCN2-NEXT: s_cbranch_execnz .LBB131_4
20479 ; GCN2-NEXT: .LBB131_2: ; %atomicrmw.phi
20480 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20481 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20482 ; GCN2-NEXT: .LBB131_3: ; %atomicrmw.global
20483 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20484 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20485 ; GCN2-NEXT: buffer_wbinvl1_vol
20486 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20487 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20488 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20489 ; GCN2-NEXT: s_cbranch_execz .LBB131_2
20490 ; GCN2-NEXT: .LBB131_4: ; %atomicrmw.private
20491 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20492 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20493 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20494 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20495 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20496 ; GCN2-NEXT: s_waitcnt vmcnt(1)
20497 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
20498 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20499 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20500 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20501 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
20502 ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
20503 ; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20504 ; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
20505 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20506 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20507 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20509 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret:
20511 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20512 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20513 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
20514 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20515 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20516 ; GCN3-NEXT: s_cbranch_execnz .LBB131_3
20517 ; GCN3-NEXT: ; %bb.1: ; %Flow
20518 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20519 ; GCN3-NEXT: s_cbranch_execnz .LBB131_4
20520 ; GCN3-NEXT: .LBB131_2: ; %atomicrmw.phi
20521 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20522 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20523 ; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global
20524 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20525 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20526 ; GCN3-NEXT: buffer_wbinvl1_vol
20527 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20528 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20529 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20530 ; GCN3-NEXT: s_cbranch_execz .LBB131_2
20531 ; GCN3-NEXT: .LBB131_4: ; %atomicrmw.private
20532 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20533 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20534 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20535 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20536 ; GCN3-NEXT: s_waitcnt vmcnt(1)
20537 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
20538 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20539 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
20540 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20541 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
20542 ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
20543 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20544 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
20545 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20546 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20547 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the uinc_wrap RMW applied to %ptr itself (no offset).
20548 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw uinc_wrap` on an i64 through a flat pointer at
; element offset 4 (getelementptr i64 ..., 4; the expected code adds 32 to the
; address). Pointer and operand are ordinary (non-inreg) arguments and the
; function is declared void.
;
; For each target in the file's RUN lines (bonaire, tonga, gfx900) the expected
; code branches between two blocks named in the checks:
;   %atomicrmw.global  - issues flat_atomic_inc_x2 on the address.
;   %atomicrmw.private - performs the increment-with-wrap as two buffer loads,
;                        a 64-bit compare plus selects against 0, and two
;                        buffer stores.
; NOTE(review): the selector compares the high address dword with
; src_private_base (gfx900) or a dword loaded from 0xe4 (bonaire/tonga);
; presumably this is the private-aperture test - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
20552 define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
20553 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
20555 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20556 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20557 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20558 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
20559 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
20560 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20561 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20562 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20563 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20564 ; GCN1-NEXT: s_cbranch_execnz .LBB132_3
20565 ; GCN1-NEXT: ; %bb.1: ; %Flow
20566 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20567 ; GCN1-NEXT: s_cbranch_execnz .LBB132_4
20568 ; GCN1-NEXT: .LBB132_2: ; %atomicrmw.phi
20569 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20570 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20571 ; GCN1-NEXT: .LBB132_3: ; %atomicrmw.global
20572 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20573 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20574 ; GCN1-NEXT: buffer_wbinvl1_vol
20575 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20576 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20577 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20578 ; GCN1-NEXT: s_cbranch_execz .LBB132_2
20579 ; GCN1-NEXT: .LBB132_4: ; %atomicrmw.private
20580 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20581 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20582 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20583 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20584 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20585 ; GCN1-NEXT: s_waitcnt vmcnt(1)
20586 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
20587 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20588 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20589 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20590 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
20591 ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
20592 ; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20593 ; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
20594 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20595 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20596 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20598 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
20600 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20601 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20602 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20603 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
20604 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
20605 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20606 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
20607 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20608 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20609 ; GCN2-NEXT: s_cbranch_execnz .LBB132_3
20610 ; GCN2-NEXT: ; %bb.1: ; %Flow
20611 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20612 ; GCN2-NEXT: s_cbranch_execnz .LBB132_4
20613 ; GCN2-NEXT: .LBB132_2: ; %atomicrmw.phi
20614 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20615 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20616 ; GCN2-NEXT: .LBB132_3: ; %atomicrmw.global
20617 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20618 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20619 ; GCN2-NEXT: buffer_wbinvl1_vol
20620 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20621 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20622 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20623 ; GCN2-NEXT: s_cbranch_execz .LBB132_2
20624 ; GCN2-NEXT: .LBB132_4: ; %atomicrmw.private
20625 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20626 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20627 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20628 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20629 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20630 ; GCN2-NEXT: s_waitcnt vmcnt(1)
20631 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
20632 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20633 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20634 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20635 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
20636 ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
20637 ; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20638 ; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
20639 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20640 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20641 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20643 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
20645 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20646 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
20647 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
20648 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20649 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
20650 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20651 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20652 ; GCN3-NEXT: s_cbranch_execnz .LBB132_3
20653 ; GCN3-NEXT: ; %bb.1: ; %Flow
20654 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20655 ; GCN3-NEXT: s_cbranch_execnz .LBB132_4
20656 ; GCN3-NEXT: .LBB132_2: ; %atomicrmw.phi
20657 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20658 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20659 ; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global
20660 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20661 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20662 ; GCN3-NEXT: buffer_wbinvl1_vol
20663 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20664 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20665 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20666 ; GCN3-NEXT: s_cbranch_execz .LBB132_2
20667 ; GCN3-NEXT: .LBB132_4: ; %atomicrmw.private
20668 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
20669 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
20670 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20671 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20672 ; GCN3-NEXT: s_waitcnt vmcnt(1)
20673 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
20674 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20675 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
20676 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20677 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
20678 ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
20679 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
20680 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
20681 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20682 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20683 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: address %out + 4 i64 elements, then the wrapping increment.
20684 %gep = getelementptr i64, ptr %out, i64 4
20685 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw uinc_wrap` on an i64 through a flat pointer with no
; address offset. Pointer and operand are ordinary (non-inreg) arguments, the
; function is declared to return i64, and the atomic's result is bound to
; %result.
;
; For each target in the file's RUN lines (bonaire, tonga, gfx900) the expected
; code first copies the pointer from v[0:1] to v[4:5], then branches between
; two blocks named in the checks:
;   %atomicrmw.global  - flat_atomic_inc_x2 with glc, writing v[0:1].
;   %atomicrmw.private - loads the current value into v[0:1], builds the
;                        incremented-or-zero value in v[2:3], and stores
;                        v[2:3] back, leaving the loaded value in v[0:1].
; NOTE(review): the selector compares the high address dword with
; src_private_base (gfx900) or a dword loaded from 0xe4 (bonaire/tonga);
; presumably this is the private-aperture test - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
20689 define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
20690 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret:
20692 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20693 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20694 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20695 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
20696 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
20697 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20698 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20699 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20700 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20701 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20702 ; GCN1-NEXT: s_cbranch_execnz .LBB133_3
20703 ; GCN1-NEXT: ; %bb.1: ; %Flow
20704 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20705 ; GCN1-NEXT: s_cbranch_execnz .LBB133_4
20706 ; GCN1-NEXT: .LBB133_2: ; %atomicrmw.phi
20707 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20708 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20709 ; GCN1-NEXT: .LBB133_3: ; %atomicrmw.global
20710 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20711 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20712 ; GCN1-NEXT: buffer_wbinvl1_vol
20713 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
20714 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20715 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20716 ; GCN1-NEXT: s_cbranch_execz .LBB133_2
20717 ; GCN1-NEXT: .LBB133_4: ; %atomicrmw.private
20718 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20719 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20720 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20721 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20722 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20723 ; GCN1-NEXT: s_waitcnt vmcnt(1)
20724 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
20725 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20726 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20727 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20728 ; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
20729 ; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
20730 ; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20731 ; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
20732 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20733 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20734 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20736 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret:
20738 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20739 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20740 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20741 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
20742 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
20743 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20744 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20745 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20746 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20747 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20748 ; GCN2-NEXT: s_cbranch_execnz .LBB133_3
20749 ; GCN2-NEXT: ; %bb.1: ; %Flow
20750 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20751 ; GCN2-NEXT: s_cbranch_execnz .LBB133_4
20752 ; GCN2-NEXT: .LBB133_2: ; %atomicrmw.phi
20753 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20754 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20755 ; GCN2-NEXT: .LBB133_3: ; %atomicrmw.global
20756 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20757 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20758 ; GCN2-NEXT: buffer_wbinvl1_vol
20759 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
20760 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20761 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20762 ; GCN2-NEXT: s_cbranch_execz .LBB133_2
20763 ; GCN2-NEXT: .LBB133_4: ; %atomicrmw.private
20764 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20765 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20766 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20767 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20768 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20769 ; GCN2-NEXT: s_waitcnt vmcnt(1)
20770 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
20771 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20772 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20773 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20774 ; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
20775 ; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
20776 ; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20777 ; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
20778 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20779 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20780 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20782 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret:
20784 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20785 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
20786 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20787 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
20788 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
20789 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20790 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20791 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20792 ; GCN3-NEXT: s_cbranch_execnz .LBB133_3
20793 ; GCN3-NEXT: ; %bb.1: ; %Flow
20794 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20795 ; GCN3-NEXT: s_cbranch_execnz .LBB133_4
20796 ; GCN3-NEXT: .LBB133_2: ; %atomicrmw.phi
20797 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20798 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20799 ; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global
20800 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20801 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20802 ; GCN3-NEXT: buffer_wbinvl1_vol
20803 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
20804 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20805 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20806 ; GCN3-NEXT: s_cbranch_execz .LBB133_2
20807 ; GCN3-NEXT: .LBB133_4: ; %atomicrmw.private
20808 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20809 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20810 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20811 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20812 ; GCN3-NEXT: s_waitcnt vmcnt(1)
20813 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
20814 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20815 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
20816 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20817 ; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
20818 ; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
20819 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20820 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
20821 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20822 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20823 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the wrapping increment directly on %ptr.
20824 %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw uinc_wrap` on an i64 through a flat pointer at
; element offset 4 (getelementptr i64 ..., 4; the expected code adds 32 to the
; address and places the sum in v[4:5]). Pointer and operand are ordinary
; (non-inreg) arguments, the function is declared to return i64, and the
; atomic's result is bound to %result.
;
; For each target in the file's RUN lines (bonaire, tonga, gfx900) the expected
; code branches between two blocks named in the checks:
;   %atomicrmw.global  - flat_atomic_inc_x2 with glc, writing v[0:1].
;   %atomicrmw.private - loads the current value into v[0:1], builds the
;                        incremented-or-zero value in v[2:3], and stores
;                        v[2:3] back, leaving the loaded value in v[0:1].
; NOTE(review): the selector compares the high address dword with
; src_private_base (gfx900) or a dword loaded from 0xe4 (bonaire/tonga);
; presumably this is the private-aperture test - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
20828 define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
20829 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
20831 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20832 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
20833 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
20834 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
20835 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20836 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20837 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20838 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
20839 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
20840 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20841 ; GCN1-NEXT: s_cbranch_execnz .LBB134_3
20842 ; GCN1-NEXT: ; %bb.1: ; %Flow
20843 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20844 ; GCN1-NEXT: s_cbranch_execnz .LBB134_4
20845 ; GCN1-NEXT: .LBB134_2: ; %atomicrmw.phi
20846 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20847 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20848 ; GCN1-NEXT: .LBB134_3: ; %atomicrmw.global
20849 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20850 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20851 ; GCN1-NEXT: buffer_wbinvl1_vol
20852 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
20853 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
20854 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20855 ; GCN1-NEXT: s_cbranch_execz .LBB134_2
20856 ; GCN1-NEXT: .LBB134_4: ; %atomicrmw.private
20857 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20858 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20859 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20860 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
20861 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20862 ; GCN1-NEXT: s_waitcnt vmcnt(1)
20863 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
20864 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20865 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20866 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20867 ; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
20868 ; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
20869 ; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20870 ; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
20871 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
20872 ; GCN1-NEXT: s_waitcnt vmcnt(0)
20873 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20875 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
20877 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20878 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
20879 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
20880 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
20881 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
20882 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
20883 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
20884 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
20885 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
20886 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20887 ; GCN2-NEXT: s_cbranch_execnz .LBB134_3
20888 ; GCN2-NEXT: ; %bb.1: ; %Flow
20889 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20890 ; GCN2-NEXT: s_cbranch_execnz .LBB134_4
20891 ; GCN2-NEXT: .LBB134_2: ; %atomicrmw.phi
20892 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20893 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20894 ; GCN2-NEXT: .LBB134_3: ; %atomicrmw.global
20895 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20896 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20897 ; GCN2-NEXT: buffer_wbinvl1_vol
20898 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
20899 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
20900 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20901 ; GCN2-NEXT: s_cbranch_execz .LBB134_2
20902 ; GCN2-NEXT: .LBB134_4: ; %atomicrmw.private
20903 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20904 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20905 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20906 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
20907 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
20908 ; GCN2-NEXT: s_waitcnt vmcnt(1)
20909 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
20910 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20911 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
20912 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20913 ; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
20914 ; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
20915 ; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20916 ; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
20917 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
20918 ; GCN2-NEXT: s_waitcnt vmcnt(0)
20919 ; GCN2-NEXT: s_setpc_b64 s[30:31]
20921 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
20923 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20924 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
20925 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
20926 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
20927 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
20928 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
20929 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
20930 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
20931 ; GCN3-NEXT: s_cbranch_execnz .LBB134_3
20932 ; GCN3-NEXT: ; %bb.1: ; %Flow
20933 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20934 ; GCN3-NEXT: s_cbranch_execnz .LBB134_4
20935 ; GCN3-NEXT: .LBB134_2: ; %atomicrmw.phi
20936 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20937 ; GCN3-NEXT: s_setpc_b64 s[30:31]
20938 ; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global
20939 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
20940 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20941 ; GCN3-NEXT: buffer_wbinvl1_vol
20942 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
20943 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
20944 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
20945 ; GCN3-NEXT: s_cbranch_execz .LBB134_2
20946 ; GCN3-NEXT: .LBB134_4: ; %atomicrmw.private
20947 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
20948 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
20949 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
20950 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
20951 ; GCN3-NEXT: s_waitcnt vmcnt(1)
20952 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
20953 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20954 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
20955 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
20956 ; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
20957 ; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
20958 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
20959 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
20960 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
20961 ; GCN3-NEXT: s_waitcnt vmcnt(0)
20962 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: address %out + 4 i64 elements, then the wrapping increment.
20963 %gep = getelementptr i64, ptr %out, i64 4
20964 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; Test: seq_cst `atomicrmw uinc_wrap` on an i64 through a flat pointer with no
; address offset, using the amdgpu_gfx calling convention with both arguments
; marked inreg. The function is declared void. In the expected code the
; pointer is read from s[4:5] and the operand from s[6:7].
;
; For each target in the file's RUN lines (bonaire, tonga, gfx900) the expected
; code selects a path with scalar compares and s_cbranch_vcc* branches, then
; runs one of two blocks named in the checks:
;   %atomicrmw.global  - copies the SGPR pointer/operand to VGPRs and issues
;                        flat_atomic_inc_x2.
;   %atomicrmw.private - performs the increment-with-wrap as two buffer loads,
;                        a 64-bit compare against s[6:7] plus selects against
;                        0, and two buffer stores.
; NOTE(review): the selector compares the high address dword (s5) with
; src_private_base (gfx900) or a dword loaded from 0xe4 (bonaire/tonga);
; presumably this is the private-aperture test - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
20968 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
20969 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
20971 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20972 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
20973 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
20974 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
20975 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
20976 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
20977 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
20978 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
20979 ; GCN1-NEXT: s_cbranch_vccnz .LBB135_3
20980 ; GCN1-NEXT: ; %bb.1: ; %Flow
20981 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
20982 ; GCN1-NEXT: s_cbranch_vccz .LBB135_4
20983 ; GCN1-NEXT: .LBB135_2: ; %atomicrmw.phi
20984 ; GCN1-NEXT: s_setpc_b64 s[30:31]
20985 ; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global
20986 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
20987 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
20988 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
20989 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
20990 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
20991 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20992 ; GCN1-NEXT: buffer_wbinvl1_vol
20993 ; GCN1-NEXT: s_cbranch_execnz .LBB135_2
20994 ; GCN1-NEXT: .LBB135_4: ; %atomicrmw.private
20995 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
20996 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
20997 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
20998 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
20999 ; GCN1-NEXT: s_add_i32 s34, s34, 4
21000 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21001 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
21002 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21003 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21004 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
21005 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21006 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21007 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21008 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
21009 ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
21010 ; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21011 ; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
21012 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21013 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21015 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
21017 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21018 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
21019 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
21020 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21021 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
21022 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
21023 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21024 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
21025 ; GCN2-NEXT: s_cbranch_vccnz .LBB135_3
21026 ; GCN2-NEXT: ; %bb.1: ; %Flow
21027 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21028 ; GCN2-NEXT: s_cbranch_vccz .LBB135_4
21029 ; GCN2-NEXT: .LBB135_2: ; %atomicrmw.phi
21030 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21031 ; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global
21032 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
21033 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
21034 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
21035 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
21036 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21037 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21038 ; GCN2-NEXT: buffer_wbinvl1_vol
21039 ; GCN2-NEXT: s_cbranch_execnz .LBB135_2
21040 ; GCN2-NEXT: .LBB135_4: ; %atomicrmw.private
21041 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
21042 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
21043 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
21044 ; GCN2-NEXT: s_add_i32 s34, s34, 4
21045 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21046 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
21047 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21048 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21049 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
21050 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21051 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21052 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21053 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
21054 ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
21055 ; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21056 ; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
21057 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21058 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21060 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
21062 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21063 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
21064 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
21065 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
21066 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21067 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
21068 ; GCN3-NEXT: s_cbranch_vccnz .LBB135_3
21069 ; GCN3-NEXT: ; %bb.1: ; %Flow
21070 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21071 ; GCN3-NEXT: s_cbranch_vccz .LBB135_4
21072 ; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi
21073 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21074 ; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global
21075 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
21076 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
21077 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
21078 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
21079 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21080 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21081 ; GCN3-NEXT: buffer_wbinvl1_vol
21082 ; GCN3-NEXT: s_cbranch_execnz .LBB135_2
21083 ; GCN3-NEXT: .LBB135_4: ; %atomicrmw.private
21084 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
21085 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
21086 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
21087 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21088 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
21089 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21090 ; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
21091 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21092 ; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
21093 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21094 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
21095 ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
21096 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21097 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
21098 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21099 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the wrapping increment directly on %ptr.
21100 %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; Test: seq_cst `atomicrmw uinc_wrap` on an i64 through a flat pointer at
; element offset 4 (getelementptr i64 ..., 4), using the amdgpu_gfx calling
; convention with both arguments marked inreg. The function is declared void.
; In the expected code the offset address is formed with scalar adds
; (s[4:5] + 32 into s[34:35]) and the operand is read from s[6:7].
;
; For each target in the file's RUN lines (bonaire, tonga, gfx900) the expected
; code selects a path with scalar compares and s_cbranch_vcc* branches, then
; runs one of two blocks named in the checks:
;   %atomicrmw.global  - copies the SGPR address/operand to VGPRs and issues
;                        flat_atomic_inc_x2.
;   %atomicrmw.private - performs the increment-with-wrap as two buffer loads,
;                        a 64-bit compare against s[6:7] plus selects against
;                        0, and two buffer stores.
; NOTE(review): the selector compares the high address dword (s35) with
; src_private_base (gfx900) or a dword loaded from 0xe4 (bonaire/tonga);
; presumably this is the private-aperture test - confirm against the backend.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
21104 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
21105 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
21107 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21108 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
21109 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
21110 ; GCN1-NEXT: s_add_u32 s34, s4, 32
21111 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
21112 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21113 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
21114 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
21115 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21116 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
21117 ; GCN1-NEXT: s_cbranch_vccnz .LBB136_3
21118 ; GCN1-NEXT: ; %bb.1: ; %Flow
21119 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21120 ; GCN1-NEXT: s_cbranch_vccz .LBB136_4
21121 ; GCN1-NEXT: .LBB136_2: ; %atomicrmw.phi
21122 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21123 ; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global
21124 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
21125 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
21126 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
21127 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
21128 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21129 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21130 ; GCN1-NEXT: buffer_wbinvl1_vol
21131 ; GCN1-NEXT: s_cbranch_execnz .LBB136_2
21132 ; GCN1-NEXT: .LBB136_4: ; %atomicrmw.private
21133 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
21134 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
21135 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
21136 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
21137 ; GCN1-NEXT: s_add_i32 s34, s34, 4
21138 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21139 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
21140 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21141 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21142 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
21143 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21144 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21145 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21146 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
21147 ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
21148 ; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21149 ; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
21150 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21151 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21153 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
21155 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21156 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
21157 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
21158 ; GCN2-NEXT: s_add_u32 s34, s4, 32
21159 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
21160 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21161 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
21162 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
21163 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21164 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
21165 ; GCN2-NEXT: s_cbranch_vccnz .LBB136_3
21166 ; GCN2-NEXT: ; %bb.1: ; %Flow
21167 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21168 ; GCN2-NEXT: s_cbranch_vccz .LBB136_4
21169 ; GCN2-NEXT: .LBB136_2: ; %atomicrmw.phi
21170 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21171 ; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global
21172 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
21173 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
21174 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
21175 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
21176 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21177 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21178 ; GCN2-NEXT: buffer_wbinvl1_vol
21179 ; GCN2-NEXT: s_cbranch_execnz .LBB136_2
21180 ; GCN2-NEXT: .LBB136_4: ; %atomicrmw.private
21181 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
21182 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
21183 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
21184 ; GCN2-NEXT: s_add_i32 s34, s34, 4
21185 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21186 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
21187 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21188 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21189 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
21190 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21191 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21192 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21193 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
21194 ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
21195 ; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21196 ; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen
21197 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21198 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21200 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
21202 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21203 ; GCN3-NEXT: s_add_u32 s34, s4, 32
21204 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
21205 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
21206 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
21207 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
21208 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21209 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
21210 ; GCN3-NEXT: s_cbranch_vccnz .LBB136_3
21211 ; GCN3-NEXT: ; %bb.1: ; %Flow
21212 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21213 ; GCN3-NEXT: s_cbranch_vccz .LBB136_4
21214 ; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi
21215 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21216 ; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global
21217 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
21218 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
21219 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
21220 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
21221 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21222 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21223 ; GCN3-NEXT: buffer_wbinvl1_vol
21224 ; GCN3-NEXT: s_cbranch_execnz .LBB136_2
21225 ; GCN3-NEXT: .LBB136_4: ; %atomicrmw.private
21226 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
21227 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
21228 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
21229 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21230 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
21231 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21232 ; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
21233 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21234 ; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
21235 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21236 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
21237 ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
21238 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
21239 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4
21240 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21241 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: address %out + 4 i64 elements, then the wrapping increment.
21242 %gep = getelementptr i64, ptr %out, i64 4
21243 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; Value-returning seq_cst `atomicrmw uinc_wrap` on an i64 through a flat
; pointer. Both the pointer and the operand are passed `inreg` under the
; amdgpu_gfx calling convention, so the expectations read them from s[4:5]
; and s[6:7].
;
; Expected shape on all three targets: the high dword of the address (s5) is
; compared against a private-base value and one of two labelled paths runs:
;   atomicrmw.global  - flat_atomic_inc_x2 with glc, old value in v[0:1]
;   atomicrmw.private - buffer_load both dwords, add 1, keep the sum only
;                       when %in > old value (otherwise 0), buffer_store back
; NOTE(review): on bonaire/tonga the private base is loaded from address 0xe4,
; on gfx900 it comes from src_private_base - presumably the same aperture
; value; confirm against the AMDGPU usage guide.
21247 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
21248 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
21250 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21251 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
21252 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
21253 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21254 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
21255 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
21256 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21257 ; GCN1-NEXT: s_cbranch_vccz .LBB137_2
21258 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
21259 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
21260 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
21261 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
21262 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
21263 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21264 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21265 ; GCN1-NEXT: buffer_wbinvl1_vol
21266 ; GCN1-NEXT: s_cbranch_execz .LBB137_3
21267 ; GCN1-NEXT: s_branch .LBB137_4
21268 ; GCN1-NEXT: .LBB137_2:
21269 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21270 ; GCN1-NEXT: .LBB137_3: ; %atomicrmw.private
21271 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
21272 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
21273 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
21274 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
21275 ; GCN1-NEXT: s_add_i32 s34, s34, 4
21276 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21277 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
21278 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21279 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21280 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
21281 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21282 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21283 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21284 ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21285 ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
21286 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
21287 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
21288 ; GCN1-NEXT: .LBB137_4: ; %atomicrmw.end
21289 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21290 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21292 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
21294 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21295 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
21296 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
21297 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21298 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
21299 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
21300 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21301 ; GCN2-NEXT: s_cbranch_vccz .LBB137_2
21302 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
21303 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
21304 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
21305 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
21306 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
21307 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21308 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21309 ; GCN2-NEXT: buffer_wbinvl1_vol
21310 ; GCN2-NEXT: s_cbranch_execz .LBB137_3
21311 ; GCN2-NEXT: s_branch .LBB137_4
21312 ; GCN2-NEXT: .LBB137_2:
21313 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21314 ; GCN2-NEXT: .LBB137_3: ; %atomicrmw.private
21315 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
21316 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
21317 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
21318 ; GCN2-NEXT: s_add_i32 s34, s34, 4
21319 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21320 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
21321 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21322 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21323 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
21324 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21325 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21326 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21327 ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21328 ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
21329 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
21330 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
21331 ; GCN2-NEXT: .LBB137_4: ; %atomicrmw.end
21332 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21333 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21335 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
21337 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21338 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
21339 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
21340 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
21341 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
21342 ; GCN3-NEXT: s_cbranch_vccz .LBB137_2
21343 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
21344 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
21345 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
21346 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
21347 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
21348 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21349 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21350 ; GCN3-NEXT: buffer_wbinvl1_vol
21351 ; GCN3-NEXT: s_cbranch_execz .LBB137_3
21352 ; GCN3-NEXT: s_branch .LBB137_4
21353 ; GCN3-NEXT: .LBB137_2:
21354 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
21355 ; GCN3-NEXT: .LBB137_3: ; %atomicrmw.private
21356 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
21357 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
21358 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
21359 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21360 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
21361 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21362 ; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
21363 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21364 ; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
21365 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21366 ; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
21367 ; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21368 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
21369 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
21370 ; GCN3-NEXT: .LBB137_4: ; %atomicrmw.end
21371 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21372 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21373 %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
; Value-returning seq_cst `atomicrmw uinc_wrap` on an i64 at %out + 4 elements
; (the expectations add 32 bytes to s[4:5]), with pointer and operand passed
; `inreg` under amdgpu_gfx.
;
; The offset address is formed in s[34:35] first; its high dword (s35) is then
; compared against a private-base value to pick a path:
;   atomicrmw.global  - flat_atomic_inc_x2 with glc, old value in v[0:1]
;   atomicrmw.private - buffer_load both dwords, add 1, keep the sum only
;                       when %in > old value (otherwise 0), buffer_store back
; On gfx900 the second dword is reached with `offset:4`; bonaire/tonga use a
; separately computed address register instead.
21377 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
21378 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
21380 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21381 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
21382 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
21383 ; GCN1-NEXT: s_add_u32 s34, s4, 32
21384 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
21385 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21386 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
21387 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
21388 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21389 ; GCN1-NEXT: s_cbranch_vccz .LBB138_2
21390 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
21391 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
21392 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
21393 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
21394 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
21395 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21396 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21397 ; GCN1-NEXT: buffer_wbinvl1_vol
21398 ; GCN1-NEXT: s_cbranch_execz .LBB138_3
21399 ; GCN1-NEXT: s_branch .LBB138_4
21400 ; GCN1-NEXT: .LBB138_2:
21401 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21402 ; GCN1-NEXT: .LBB138_3: ; %atomicrmw.private
21403 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
21404 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
21405 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
21406 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
21407 ; GCN1-NEXT: s_add_i32 s34, s34, 4
21408 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21409 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
21410 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21411 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21412 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v0
21413 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21414 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21415 ; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21416 ; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21417 ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
21418 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
21419 ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
21420 ; GCN1-NEXT: .LBB138_4: ; %atomicrmw.end
21421 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21422 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21424 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
21426 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21427 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
21428 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
21429 ; GCN2-NEXT: s_add_u32 s34, s4, 32
21430 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
21431 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21432 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
21433 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
21434 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21435 ; GCN2-NEXT: s_cbranch_vccz .LBB138_2
21436 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
21437 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
21438 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
21439 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
21440 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
21441 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21442 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21443 ; GCN2-NEXT: buffer_wbinvl1_vol
21444 ; GCN2-NEXT: s_cbranch_execz .LBB138_3
21445 ; GCN2-NEXT: s_branch .LBB138_4
21446 ; GCN2-NEXT: .LBB138_2:
21447 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21448 ; GCN2-NEXT: .LBB138_3: ; %atomicrmw.private
21449 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
21450 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
21451 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
21452 ; GCN2-NEXT: s_add_i32 s34, s34, 4
21453 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21454 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
21455 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
21456 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21457 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v0
21458 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21459 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21460 ; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21461 ; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21462 ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
21463 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
21464 ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
21465 ; GCN2-NEXT: .LBB138_4: ; %atomicrmw.end
21466 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21467 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21469 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
21471 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21472 ; GCN3-NEXT: s_add_u32 s34, s4, 32
21473 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
21474 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
21475 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
21476 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
21477 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
21478 ; GCN3-NEXT: s_cbranch_vccz .LBB138_2
21479 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
21480 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
21481 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
21482 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
21483 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
21484 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
21485 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21486 ; GCN3-NEXT: buffer_wbinvl1_vol
21487 ; GCN3-NEXT: s_cbranch_execz .LBB138_3
21488 ; GCN3-NEXT: s_branch .LBB138_4
21489 ; GCN3-NEXT: .LBB138_2:
21490 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
21491 ; GCN3-NEXT: .LBB138_3: ; %atomicrmw.private
21492 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
21493 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
21494 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
21495 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
21496 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
21497 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21498 ; GCN3-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
21499 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21500 ; GCN3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
21501 ; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
21502 ; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
21503 ; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
21504 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
21505 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
21506 ; GCN3-NEXT: .LBB138_4: ; %atomicrmw.end
21507 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21508 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21509 %gep = getelementptr i64, ptr %out, i64 4
21510 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
; Result-unused seq_cst `atomicrmw uinc_wrap` on an i64 at %out + 4 elements
; (32 bytes), where the instruction carries !amdgpu.no.remote.memory.
; Arguments arrive in VGPRs (address v[0:1], operand v[2:3]), so the
; global/private choice is made per lane with exec masking
; (s_and_saveexec / s_andn2_saveexec) rather than a scalar branch.
;
;   atomicrmw.global  - flat_atomic_inc_x2 without glc (no value returned)
;   atomicrmw.private - buffer_load both dwords, add 1, keep the sum only
;                       when old value < %in (otherwise 0), buffer_store back
; NOTE(review): whether the metadata changes the selected code relative to the
; un-annotated variant cannot be judged from this function alone - compare
; with the plain noret_offset test when updating.
21514 define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
21515 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
21517 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21518 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
21519 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
21520 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
21521 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
21522 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21523 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21524 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
21525 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21526 ; GCN1-NEXT: s_cbranch_execnz .LBB139_3
21527 ; GCN1-NEXT: ; %bb.1: ; %Flow
21528 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21529 ; GCN1-NEXT: s_cbranch_execnz .LBB139_4
21530 ; GCN1-NEXT: .LBB139_2: ; %atomicrmw.phi
21531 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
21532 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21533 ; GCN1-NEXT: .LBB139_3: ; %atomicrmw.global
21534 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21535 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21536 ; GCN1-NEXT: buffer_wbinvl1_vol
21537 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21538 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
21539 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21540 ; GCN1-NEXT: s_cbranch_execz .LBB139_2
21541 ; GCN1-NEXT: .LBB139_4: ; %atomicrmw.private
21542 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21543 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21544 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21545 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
21546 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21547 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21548 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
21549 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21550 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
21551 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21552 ; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
21553 ; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
21554 ; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
21555 ; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
21556 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
21557 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21558 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21560 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
21562 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21563 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
21564 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
21565 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
21566 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
21567 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21568 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21569 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
21570 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21571 ; GCN2-NEXT: s_cbranch_execnz .LBB139_3
21572 ; GCN2-NEXT: ; %bb.1: ; %Flow
21573 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21574 ; GCN2-NEXT: s_cbranch_execnz .LBB139_4
21575 ; GCN2-NEXT: .LBB139_2: ; %atomicrmw.phi
21576 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
21577 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21578 ; GCN2-NEXT: .LBB139_3: ; %atomicrmw.global
21579 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21580 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21581 ; GCN2-NEXT: buffer_wbinvl1_vol
21582 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21583 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
21584 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21585 ; GCN2-NEXT: s_cbranch_execz .LBB139_2
21586 ; GCN2-NEXT: .LBB139_4: ; %atomicrmw.private
21587 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21588 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21589 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21590 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
21591 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21592 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21593 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
21594 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21595 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
21596 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21597 ; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
21598 ; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc
21599 ; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
21600 ; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen
21601 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
21602 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21603 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21605 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
21607 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21608 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
21609 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
21610 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
21611 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
21612 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
21613 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21614 ; GCN3-NEXT: s_cbranch_execnz .LBB139_3
21615 ; GCN3-NEXT: ; %bb.1: ; %Flow
21616 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21617 ; GCN3-NEXT: s_cbranch_execnz .LBB139_4
21618 ; GCN3-NEXT: .LBB139_2: ; %atomicrmw.phi
21619 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
21620 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21621 ; GCN3-NEXT: .LBB139_3: ; %atomicrmw.global
21622 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
21623 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21624 ; GCN3-NEXT: buffer_wbinvl1_vol
21625 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
21626 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
21627 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21628 ; GCN3-NEXT: s_cbranch_execz .LBB139_2
21629 ; GCN3-NEXT: .LBB139_4: ; %atomicrmw.private
21630 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21631 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21632 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21633 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
21634 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21635 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
21636 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21637 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
21638 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21639 ; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
21640 ; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
21641 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
21642 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
21643 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
21644 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21645 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21646 %gep = getelementptr i64, ptr %out, i64 4
21647 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
; Value-returning seq_cst `atomicrmw uinc_wrap` on an i64 at %out + 4 elements
; (32 bytes), where the instruction carries !amdgpu.no.remote.memory.
; The offset address is built in v[4:5] so that v[0:1] stays free for the
; returned old value; the operand is in v[2:3]. Path selection is per lane
; via exec masking.
;
;   atomicrmw.global  - flat_atomic_inc_x2 with glc, old value in v[0:1]
;   atomicrmw.private - buffer_load old value into v[0:1], add 1, keep the sum
;                       only when old value < %in (otherwise 0), and
;                       buffer_store the new value from v3/v2
21651 define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
21652 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
21654 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21655 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
21656 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
21657 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
21658 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21659 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21660 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
21661 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21662 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
21663 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21664 ; GCN1-NEXT: s_cbranch_execnz .LBB140_3
21665 ; GCN1-NEXT: ; %bb.1: ; %Flow
21666 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21667 ; GCN1-NEXT: s_cbranch_execnz .LBB140_4
21668 ; GCN1-NEXT: .LBB140_2: ; %atomicrmw.phi
21669 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
21670 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21671 ; GCN1-NEXT: .LBB140_3: ; %atomicrmw.global
21672 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
21673 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21674 ; GCN1-NEXT: buffer_wbinvl1_vol
21675 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
21676 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
21677 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21678 ; GCN1-NEXT: s_cbranch_execz .LBB140_2
21679 ; GCN1-NEXT: .LBB140_4: ; %atomicrmw.private
21680 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
21681 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
21682 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21683 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
21684 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21685 ; GCN1-NEXT: s_waitcnt vmcnt(1)
21686 ; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
21687 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21688 ; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
21689 ; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21690 ; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
21691 ; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
21692 ; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
21693 ; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
21694 ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
21695 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21696 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21698 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
21700 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21701 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
21702 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
21703 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
21704 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
21705 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21706 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
21707 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21708 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
21709 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21710 ; GCN2-NEXT: s_cbranch_execnz .LBB140_3
21711 ; GCN2-NEXT: ; %bb.1: ; %Flow
21712 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21713 ; GCN2-NEXT: s_cbranch_execnz .LBB140_4
21714 ; GCN2-NEXT: .LBB140_2: ; %atomicrmw.phi
21715 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
21716 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21717 ; GCN2-NEXT: .LBB140_3: ; %atomicrmw.global
21718 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
21719 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21720 ; GCN2-NEXT: buffer_wbinvl1_vol
21721 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
21722 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
21723 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21724 ; GCN2-NEXT: s_cbranch_execz .LBB140_2
21725 ; GCN2-NEXT: .LBB140_4: ; %atomicrmw.private
21726 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
21727 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
21728 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21729 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
21730 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21731 ; GCN2-NEXT: s_waitcnt vmcnt(1)
21732 ; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
21733 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21734 ; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
21735 ; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21736 ; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
21737 ; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
21738 ; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
21739 ; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
21740 ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
21741 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21742 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21744 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
21746 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21747 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
21748 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
21749 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
21750 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
21751 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
21752 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
21753 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21754 ; GCN3-NEXT: s_cbranch_execnz .LBB140_3
21755 ; GCN3-NEXT: ; %bb.1: ; %Flow
21756 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21757 ; GCN3-NEXT: s_cbranch_execnz .LBB140_4
21758 ; GCN3-NEXT: .LBB140_2: ; %atomicrmw.phi
21759 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
21760 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21761 ; GCN3-NEXT: .LBB140_3: ; %atomicrmw.global
21762 ; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
21763 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21764 ; GCN3-NEXT: buffer_wbinvl1_vol
21765 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
21766 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
21767 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
21768 ; GCN3-NEXT: s_cbranch_execz .LBB140_2
21769 ; GCN3-NEXT: .LBB140_4: ; %atomicrmw.private
21770 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
21771 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
21772 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21773 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
21774 ; GCN3-NEXT: s_waitcnt vmcnt(1)
21775 ; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
21776 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21777 ; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
21778 ; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
21779 ; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
21780 ; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
21781 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
21782 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
21783 ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
21784 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21785 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21786 %gep = getelementptr i64, ptr %out, i64 4
21787 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
21791 ; ---------------------------------------------------------------------
21792 ; atomicrmw udec_wrap
21793 ; ---------------------------------------------------------------------
; Result-unused seq_cst `atomicrmw udec_wrap` on an i64 through a flat pointer
; with no offset. Address is in v[0:1], operand in v[2:3]; the global/private
; choice is made per lane via exec masking.
;
;   atomicrmw.global  - flat_atomic_dec_x2 without glc (no value returned)
;   atomicrmw.private - buffer_load old value, then store %in when
;                       old == 0 or old > %in, otherwise store old - 1
; The saved exec mask lives in s[8:9] here because s[4:5] and s[6:7] are
; reused by the private path for the unsigned compare result and the carry
; of the 64-bit add of -1.
21795 define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
21796 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret:
21798 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21799 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
21800 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
21801 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21802 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21803 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
21804 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21805 ; GCN1-NEXT: s_cbranch_execnz .LBB141_3
21806 ; GCN1-NEXT: ; %bb.1: ; %Flow
21807 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21808 ; GCN1-NEXT: s_cbranch_execnz .LBB141_4
21809 ; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi
21810 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
21811 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21812 ; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global
21813 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
21814 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21815 ; GCN1-NEXT: buffer_wbinvl1_vol
21816 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21817 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
21818 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21819 ; GCN1-NEXT: s_cbranch_execz .LBB141_2
21820 ; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private
21821 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21822 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21823 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
21824 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21825 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21826 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21827 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
21828 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
21829 ; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0
21830 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
21831 ; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
21832 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
21833 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
21834 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
21835 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
21836 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
21837 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21838 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21840 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret:
21842 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21843 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
21844 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
21845 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21846 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21847 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
21848 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21849 ; GCN2-NEXT: s_cbranch_execnz .LBB141_3
21850 ; GCN2-NEXT: ; %bb.1: ; %Flow
21851 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21852 ; GCN2-NEXT: s_cbranch_execnz .LBB141_4
21853 ; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi
21854 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
21855 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21856 ; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global
21857 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
21858 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21859 ; GCN2-NEXT: buffer_wbinvl1_vol
21860 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21861 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
21862 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21863 ; GCN2-NEXT: s_cbranch_execz .LBB141_2
21864 ; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private
21865 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21866 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21867 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
21868 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21869 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21870 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21871 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
21872 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
21873 ; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0
21874 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
21875 ; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
21876 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
21877 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
21878 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
21879 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
21880 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
21881 ; GCN2-NEXT: s_waitcnt vmcnt(0)
21882 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21884 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret:
21886 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21887 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
21888 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
21889 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
21890 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21891 ; GCN3-NEXT: s_cbranch_execnz .LBB141_3
21892 ; GCN3-NEXT: ; %bb.1: ; %Flow
21893 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21894 ; GCN3-NEXT: s_cbranch_execnz .LBB141_4
21895 ; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi
21896 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
21897 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21898 ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global
21899 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
21900 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21901 ; GCN3-NEXT: buffer_wbinvl1_vol
21902 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
21903 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
21904 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21905 ; GCN3-NEXT: s_cbranch_execz .LBB141_2
21906 ; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private
21907 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21908 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21909 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21910 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
21911 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21912 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
21913 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
21914 ; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0
21915 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
21916 ; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7]
21917 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
21918 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
21919 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
21920 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
21921 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
21922 ; GCN3-NEXT: s_waitcnt vmcnt(0)
21923 ; GCN3-NEXT: s_setpc_b64 s[30:31]
21924 %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test case: `atomicrmw udec_wrap` with seq_cst ordering on an i64 reached
; through a flat pointer at element 4 of %out; the result is not used.
;
; What the expected output below shows for all three targets:
;   * 32 is added to the 64-bit pointer before anything else.
;   * The pointer's high dword is compared with the high half of
;     src_private_base on GCN3, and with a dword loaded from address 0xe4 on
;     GCN1 and GCN2 (presumably the same private base -- TODO confirm).
;   * %atomicrmw.global issues a single flat_atomic_dec_x2.
;   * %atomicrmw.private open-codes the operation: load both dwords, select
;     the input value when the loaded value is 0 or greater than the input,
;     otherwise the loaded value minus 1, then store both dwords back.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
21928 define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
21929 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
21931 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21932 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
21933 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
21934 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
21935 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
21936 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
21937 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21938 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
21939 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21940 ; GCN1-NEXT: s_cbranch_execnz .LBB142_3
21941 ; GCN1-NEXT: ; %bb.1: ; %Flow
21942 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21943 ; GCN1-NEXT: s_cbranch_execnz .LBB142_4
21944 ; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi
21945 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
21946 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21947 ; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global
21948 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
21949 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21950 ; GCN1-NEXT: buffer_wbinvl1_vol
21951 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
21952 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
21953 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21954 ; GCN1-NEXT: s_cbranch_execz .LBB142_2
21955 ; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private
21956 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
21957 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
21958 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
21959 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
21960 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
21961 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21962 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
21963 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
21964 ; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0
21965 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
21966 ; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
21967 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
21968 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
21969 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
21970 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
21971 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
21972 ; GCN1-NEXT: s_waitcnt vmcnt(0)
21973 ; GCN1-NEXT: s_setpc_b64 s[30:31]
21975 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
21977 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21978 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
21979 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
21980 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
21981 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
21982 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
21983 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
21984 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
21985 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
21986 ; GCN2-NEXT: s_cbranch_execnz .LBB142_3
21987 ; GCN2-NEXT: ; %bb.1: ; %Flow
21988 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
21989 ; GCN2-NEXT: s_cbranch_execnz .LBB142_4
21990 ; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi
21991 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
21992 ; GCN2-NEXT: s_setpc_b64 s[30:31]
21993 ; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global
21994 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
21995 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21996 ; GCN2-NEXT: buffer_wbinvl1_vol
21997 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
21998 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
21999 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22000 ; GCN2-NEXT: s_cbranch_execz .LBB142_2
22001 ; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private
22002 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
22003 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
22004 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
22005 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22006 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22007 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22008 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22009 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22010 ; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0
22011 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
22012 ; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
22013 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
22014 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
22015 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
22016 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
22017 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
22018 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22019 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22021 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
22023 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22024 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
22025 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
22026 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
22027 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
22028 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
22029 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22030 ; GCN3-NEXT: s_cbranch_execnz .LBB142_3
22031 ; GCN3-NEXT: ; %bb.1: ; %Flow
22032 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22033 ; GCN3-NEXT: s_cbranch_execnz .LBB142_4
22034 ; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi
22035 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22036 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22037 ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global
22038 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22039 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22040 ; GCN3-NEXT: buffer_wbinvl1_vol
22041 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
22042 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
22043 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22044 ; GCN3-NEXT: s_cbranch_execz .LBB142_2
22045 ; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private
22046 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
22047 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
22048 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22049 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
22050 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22051 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22052 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22053 ; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0
22054 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
22055 ; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7]
22056 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
22057 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
22058 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
22059 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
22060 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22061 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22062 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset, matching the add of 32 in the
; expected output above.
22063 %gep = getelementptr i64, ptr %out, i64 4
22064 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test case: `atomicrmw udec_wrap` with seq_cst ordering on an i64 reached
; through the flat pointer %ptr (no offset); the function's return type is
; i64 and the atomic's result is named %result.
;
; What the expected output below shows for all three targets:
;   * The pointer is first copied from v[0:1] to v[4:5], and v[0:1] is marked
;     implicit-def before the branch.
;   * The pointer's high dword is compared with the high half of
;     src_private_base on GCN3, and with a dword loaded from address 0xe4 on
;     GCN1 and GCN2 (presumably the same private base -- TODO confirm).
;   * %atomicrmw.global issues flat_atomic_dec_x2 with the glc bit and v[0:1]
;     as the destination.
;   * %atomicrmw.private loads the two dwords into v[0:1], stores the
;     selected new value from v[2:3], and leaves the loaded value in v[0:1].
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
22069 define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
22070 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret:
22072 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22073 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
22074 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
22075 ; GCN1-NEXT: v_mov_b32_e32 v5, v1
22076 ; GCN1-NEXT: v_mov_b32_e32 v4, v0
22077 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
22078 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22079 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
22080 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
22081 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22082 ; GCN1-NEXT: s_cbranch_execnz .LBB143_3
22083 ; GCN1-NEXT: ; %bb.1: ; %Flow
22084 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22085 ; GCN1-NEXT: s_cbranch_execnz .LBB143_4
22086 ; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi
22087 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22088 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22089 ; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global
22090 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22091 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22092 ; GCN1-NEXT: buffer_wbinvl1_vol
22093 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
22094 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
22095 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22096 ; GCN1-NEXT: s_cbranch_execz .LBB143_2
22097 ; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private
22098 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22099 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22100 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
22101 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22102 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22103 ; GCN1-NEXT: s_waitcnt vmcnt(1)
22104 ; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0
22105 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22106 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22107 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22108 ; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
22109 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
22110 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
22111 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
22112 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22113 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
22114 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22115 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22116 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22117 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret:
22119 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22120 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
22121 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
22122 ; GCN2-NEXT: v_mov_b32_e32 v5, v1
22123 ; GCN2-NEXT: v_mov_b32_e32 v4, v0
22124 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
22125 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22126 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
22127 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
22128 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22129 ; GCN2-NEXT: s_cbranch_execnz .LBB143_3
22130 ; GCN2-NEXT: ; %bb.1: ; %Flow
22131 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22132 ; GCN2-NEXT: s_cbranch_execnz .LBB143_4
22133 ; GCN2-NEXT: .LBB143_2: ; %atomicrmw.phi
22134 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
22135 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22136 ; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global
22137 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22138 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22139 ; GCN2-NEXT: buffer_wbinvl1_vol
22140 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
22141 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
22142 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22143 ; GCN2-NEXT: s_cbranch_execz .LBB143_2
22144 ; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private
22145 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22146 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22147 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
22148 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22149 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22150 ; GCN2-NEXT: s_waitcnt vmcnt(1)
22151 ; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0
22152 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22153 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22154 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22155 ; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
22156 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
22157 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
22158 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
22159 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22160 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
22161 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
22162 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22163 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22165 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret:
22167 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22168 ; GCN3-NEXT: v_mov_b32_e32 v5, v1
22169 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
22170 ; GCN3-NEXT: v_mov_b32_e32 v4, v0
22171 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
22172 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
22173 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
22174 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22175 ; GCN3-NEXT: s_cbranch_execnz .LBB143_3
22176 ; GCN3-NEXT: ; %bb.1: ; %Flow
22177 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22178 ; GCN3-NEXT: s_cbranch_execnz .LBB143_4
22179 ; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi
22180 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22181 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22182 ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global
22183 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22184 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22185 ; GCN3-NEXT: buffer_wbinvl1_vol
22186 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
22187 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
22188 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22189 ; GCN3-NEXT: s_cbranch_execz .LBB143_2
22190 ; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private
22191 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22192 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22193 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22194 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
22195 ; GCN3-NEXT: s_waitcnt vmcnt(1)
22196 ; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0
22197 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22198 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22199 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22200 ; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7]
22201 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
22202 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
22203 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
22204 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22205 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
22206 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22207 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22208 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22209 %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test case: `atomicrmw udec_wrap` with seq_cst ordering on an i64 reached
; through a flat pointer at element 4 of %out; the function's return type is
; i64 and the atomic's result is named %result.
;
; What the expected output below shows for all three targets:
;   * The pointer plus 32 is computed into v[4:5], and v[0:1] is marked
;     implicit-def before the branch.
;   * The high dword of that address is compared with the high half of
;     src_private_base on GCN3, and with a dword loaded from address 0xe4 on
;     GCN1 and GCN2 (presumably the same private base -- TODO confirm).
;   * %atomicrmw.global issues flat_atomic_dec_x2 with the glc bit and v[0:1]
;     as the destination.
;   * %atomicrmw.private loads the two dwords into v[0:1], stores the
;     selected new value from v[2:3], and leaves the loaded value in v[0:1].
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
22213 define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
22214 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
22216 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22217 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
22218 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
22219 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
22220 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
22221 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22222 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
22223 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
22224 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
22225 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22226 ; GCN1-NEXT: s_cbranch_execnz .LBB144_3
22227 ; GCN1-NEXT: ; %bb.1: ; %Flow
22228 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22229 ; GCN1-NEXT: s_cbranch_execnz .LBB144_4
22230 ; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi
22231 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22232 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22233 ; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global
22234 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22235 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22236 ; GCN1-NEXT: buffer_wbinvl1_vol
22237 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
22238 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
22239 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22240 ; GCN1-NEXT: s_cbranch_execz .LBB144_2
22241 ; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private
22242 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22243 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22244 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
22245 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22246 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22247 ; GCN1-NEXT: s_waitcnt vmcnt(1)
22248 ; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0
22249 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22250 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22251 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22252 ; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
22253 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
22254 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
22255 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
22256 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22257 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
22258 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22259 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22260 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22262 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
22264 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22265 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
22266 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
22267 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
22268 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
22269 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22270 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
22271 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
22272 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
22273 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22274 ; GCN2-NEXT: s_cbranch_execnz .LBB144_3
22275 ; GCN2-NEXT: ; %bb.1: ; %Flow
22276 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22277 ; GCN2-NEXT: s_cbranch_execnz .LBB144_4
22278 ; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi
22279 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
22280 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22281 ; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global
22282 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22283 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22284 ; GCN2-NEXT: buffer_wbinvl1_vol
22285 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
22286 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
22287 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22288 ; GCN2-NEXT: s_cbranch_execz .LBB144_2
22289 ; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private
22290 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22291 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22292 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
22293 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22294 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22295 ; GCN2-NEXT: s_waitcnt vmcnt(1)
22296 ; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0
22297 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22298 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22299 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22300 ; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
22301 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
22302 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
22303 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
22304 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22305 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
22306 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
22307 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22308 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22310 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
22312 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22313 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
22314 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
22315 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
22316 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
22317 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
22318 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
22319 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22320 ; GCN3-NEXT: s_cbranch_execnz .LBB144_3
22321 ; GCN3-NEXT: ; %bb.1: ; %Flow
22322 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22323 ; GCN3-NEXT: s_cbranch_execnz .LBB144_4
22324 ; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi
22325 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22326 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22327 ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global
22328 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
22329 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22330 ; GCN3-NEXT: buffer_wbinvl1_vol
22331 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
22332 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
22333 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22334 ; GCN3-NEXT: s_cbranch_execz .LBB144_2
22335 ; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private
22336 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
22337 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
22338 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22339 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
22340 ; GCN3-NEXT: s_waitcnt vmcnt(1)
22341 ; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0
22342 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22343 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22344 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22345 ; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7]
22346 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
22347 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
22348 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
22349 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
22350 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
22351 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
22352 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22353 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; Index 4 of an i64 array is a 32-byte offset, matching the add of 32 in the
; expected output above.
22354 %gep = getelementptr i64, ptr %out, i64 4
22355 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test case: `atomicrmw udec_wrap` with seq_cst ordering on an i64 reached
; through the flat pointer %ptr, in an amdgpu_gfx function whose pointer and
; value arguments are both marked inreg; the result is not used.
;
; What the expected output below shows for all three targets:
;   * The pointer is read from s[4:5] and the input value from s[6:7].
;   * The private/global decision is a scalar compare (s_cmp_eq_u32) followed
;     by s_cbranch_vccnz / s_cbranch_vccz; no exec-mask save/restore appears.
;     s5 is compared with the high half of src_private_base on GCN3, and with
;     a dword loaded from address 0xe4 on GCN1 and GCN2 (presumably the same
;     private base -- TODO confirm).
;   * %atomicrmw.global copies pointer and value into VGPRs and issues
;     flat_atomic_dec_x2.
;   * %atomicrmw.private open-codes the operation: load both dwords, select
;     the input value when the loaded value is 0 or the input is less than the
;     loaded value, otherwise the loaded value minus 1, then store both dwords.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
22359 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
22360 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
22362 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22363 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
22364 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
22365 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22366 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
22367 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
22368 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22369 ; GCN1-NEXT: s_mov_b64 s[34:35], -1
22370 ; GCN1-NEXT: s_cbranch_vccnz .LBB145_3
22371 ; GCN1-NEXT: ; %bb.1: ; %Flow
22372 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22373 ; GCN1-NEXT: s_cbranch_vccz .LBB145_4
22374 ; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi
22375 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22376 ; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global
22377 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
22378 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
22379 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
22380 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
22381 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22382 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22383 ; GCN1-NEXT: buffer_wbinvl1_vol
22384 ; GCN1-NEXT: s_cbranch_execnz .LBB145_2
22385 ; GCN1-NEXT: .LBB145_4: ; %atomicrmw.private
22386 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
22387 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
22388 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
22389 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
22390 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
22391 ; GCN1-NEXT: s_add_i32 s34, s34, 4
22392 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
22393 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22394 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22395 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
22396 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22397 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22398 ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22399 ; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0
22400 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
22401 ; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22402 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
22403 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
22404 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22405 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
22406 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22407 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22409 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
22411 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22412 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
22413 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
22414 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22415 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
22416 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
22417 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22418 ; GCN2-NEXT: s_mov_b64 s[34:35], -1
22419 ; GCN2-NEXT: s_cbranch_vccnz .LBB145_3
22420 ; GCN2-NEXT: ; %bb.1: ; %Flow
22421 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22422 ; GCN2-NEXT: s_cbranch_vccz .LBB145_4
22423 ; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi
22424 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22425 ; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global
22426 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
22427 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
22428 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
22429 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
22430 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22431 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22432 ; GCN2-NEXT: buffer_wbinvl1_vol
22433 ; GCN2-NEXT: s_cbranch_execnz .LBB145_2
22434 ; GCN2-NEXT: .LBB145_4: ; %atomicrmw.private
22435 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
22436 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
22437 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
22438 ; GCN2-NEXT: s_add_i32 s34, s34, 4
22439 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
22440 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22441 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22442 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
22443 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
22444 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22445 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22446 ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22447 ; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0
22448 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
22449 ; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22450 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
22451 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
22452 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22453 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
22454 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22455 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22457 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
22459 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22460 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
22461 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
22462 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
22463 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22464 ; GCN3-NEXT: s_mov_b64 s[34:35], -1
22465 ; GCN3-NEXT: s_cbranch_vccnz .LBB145_3
22466 ; GCN3-NEXT: ; %bb.1: ; %Flow
22467 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22468 ; GCN3-NEXT: s_cbranch_vccz .LBB145_4
22469 ; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi
22470 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22471 ; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global
22472 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
22473 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
22474 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
22475 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22476 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22477 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22478 ; GCN3-NEXT: buffer_wbinvl1_vol
22479 ; GCN3-NEXT: s_cbranch_execnz .LBB145_2
22480 ; GCN3-NEXT: .LBB145_4: ; %atomicrmw.private
22481 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
22482 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
22483 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
22484 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22485 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
22486 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
22487 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22488 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22489 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22490 ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22491 ; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0
22492 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
22493 ; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22494 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
22495 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
22496 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22497 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
22498 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22499 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22500 %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
22504 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
22505 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
22507 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22508 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
22509 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
22510 ; GCN1-NEXT: s_add_u32 s34, s4, 32
22511 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
22512 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22513 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
22514 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
22515 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22516 ; GCN1-NEXT: s_mov_b64 s[36:37], -1
22517 ; GCN1-NEXT: s_cbranch_vccnz .LBB146_3
22518 ; GCN1-NEXT: ; %bb.1: ; %Flow
22519 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22520 ; GCN1-NEXT: s_cbranch_vccz .LBB146_4
22521 ; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi
22522 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22523 ; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global
22524 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
22525 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
22526 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
22527 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
22528 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22529 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22530 ; GCN1-NEXT: buffer_wbinvl1_vol
22531 ; GCN1-NEXT: s_cbranch_execnz .LBB146_2
22532 ; GCN1-NEXT: .LBB146_4: ; %atomicrmw.private
22533 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
22534 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
22535 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
22536 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
22537 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
22538 ; GCN1-NEXT: s_add_i32 s34, s34, 4
22539 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
22540 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22541 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22542 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
22543 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22544 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22545 ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22546 ; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0
22547 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
22548 ; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22549 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
22550 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
22551 ; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22552 ; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
22553 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22554 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22556 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
22558 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22559 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
22560 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
22561 ; GCN2-NEXT: s_add_u32 s34, s4, 32
22562 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
22563 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22564 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
22565 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
22566 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22567 ; GCN2-NEXT: s_mov_b64 s[36:37], -1
22568 ; GCN2-NEXT: s_cbranch_vccnz .LBB146_3
22569 ; GCN2-NEXT: ; %bb.1: ; %Flow
22570 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22571 ; GCN2-NEXT: s_cbranch_vccz .LBB146_4
22572 ; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi
22573 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22574 ; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global
22575 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
22576 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
22577 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
22578 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
22579 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22580 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22581 ; GCN2-NEXT: buffer_wbinvl1_vol
22582 ; GCN2-NEXT: s_cbranch_execnz .LBB146_2
22583 ; GCN2-NEXT: .LBB146_4: ; %atomicrmw.private
22584 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
22585 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
22586 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
22587 ; GCN2-NEXT: s_add_i32 s34, s34, 4
22588 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
22589 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22590 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22591 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
22592 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
22593 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22594 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22595 ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22596 ; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0
22597 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
22598 ; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22599 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
22600 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
22601 ; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22602 ; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
22603 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22604 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22606 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
22608 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22609 ; GCN3-NEXT: s_add_u32 s34, s4, 32
22610 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
22611 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
22612 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
22613 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
22614 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22615 ; GCN3-NEXT: s_mov_b64 s[36:37], -1
22616 ; GCN3-NEXT: s_cbranch_vccnz .LBB146_3
22617 ; GCN3-NEXT: ; %bb.1: ; %Flow
22618 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22619 ; GCN3-NEXT: s_cbranch_vccz .LBB146_4
22620 ; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi
22621 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22622 ; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global
22623 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
22624 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
22625 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
22626 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22627 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22628 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22629 ; GCN3-NEXT: buffer_wbinvl1_vol
22630 ; GCN3-NEXT: s_cbranch_execnz .LBB146_2
22631 ; GCN3-NEXT: .LBB146_4: ; %atomicrmw.private
22632 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
22633 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
22634 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
22635 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22636 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
22637 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
22638 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22639 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22640 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22641 ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22642 ; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0
22643 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
22644 ; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37]
22645 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
22646 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
22647 ; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
22648 ; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
22649 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22650 ; GCN3-NEXT: s_setpc_b64 s[30:31]
22651 %gep = getelementptr i64, ptr %out, i64 4
22652 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test: 64-bit `atomicrmw udec_wrap` with seq_cst ordering on a flat pointer.
; Both the pointer and the operand are passed `inreg` under the amdgpu_gfx
; calling convention, and the function is declared to return i64.
;
; Shape of the expected code for every check prefix below:
;   * The high dword of the address (s5) is compared against a 32-bit value
;     that the GCN1 and GCN2 runs load with s_load_dword from 0xe4 and the
;     GCN3 run reads from src_private_base.
;   * When they differ, the %atomicrmw.global block issues
;     flat_atomic_dec_x2 with glc.
;   * When they are equal, the %atomicrmw.private block does the update with
;     two buffer loads and two buffer stores:
;       new = (old == 0 || old > in) ? in : old - 1
;     using address s4, or -1 when the 64-bit pointer in s[4:5] is zero.
22656 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
22657 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
22659 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22660 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
22661 ; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
22662 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22663 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34
22664 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
22665 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22666 ; GCN1-NEXT: s_cbranch_vccz .LBB147_2
22667 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
22668 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
22669 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
22670 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
22671 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
22672 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22673 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22674 ; GCN1-NEXT: buffer_wbinvl1_vol
22675 ; GCN1-NEXT: s_cbranch_execz .LBB147_3
22676 ; GCN1-NEXT: s_branch .LBB147_4
22677 ; GCN1-NEXT: .LBB147_2:
22678 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
22679 ; GCN1-NEXT: .LBB147_3: ; %atomicrmw.private
22680 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
22681 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
22682 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
22683 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1
22684 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
22685 ; GCN1-NEXT: s_add_i32 s34, s34, 4
22686 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
22687 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22688 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22689 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
22690 ; GCN1-NEXT: s_waitcnt vmcnt(1)
22691 ; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0
22692 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22693 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22694 ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22695 ; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37]
22696 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
22697 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
22698 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
22699 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
22700 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
22701 ; GCN1-NEXT: .LBB147_4: ; %atomicrmw.end
22702 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22703 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22705 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
22707 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22708 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
22709 ; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
22710 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22711 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34
22712 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
22713 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22714 ; GCN2-NEXT: s_cbranch_vccz .LBB147_2
22715 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
22716 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
22717 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
22718 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
22719 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
22720 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22721 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22722 ; GCN2-NEXT: buffer_wbinvl1_vol
22723 ; GCN2-NEXT: s_cbranch_execz .LBB147_3
22724 ; GCN2-NEXT: s_branch .LBB147_4
22725 ; GCN2-NEXT: .LBB147_2:
22726 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
22727 ; GCN2-NEXT: .LBB147_3: ; %atomicrmw.private
22728 ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
22729 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1
22730 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
22731 ; GCN2-NEXT: s_add_i32 s34, s34, 4
22732 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
22733 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22734 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22735 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
22736 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
22737 ; GCN2-NEXT: s_waitcnt vmcnt(1)
22738 ; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0
22739 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22740 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22741 ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22742 ; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37]
22743 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
22744 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
22745 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
22746 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
22747 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
22748 ; GCN2-NEXT: .LBB147_4: ; %atomicrmw.end
22749 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22750 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22752 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
22754 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22755 ; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
22756 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35
22757 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
22758 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
22759 ; GCN3-NEXT: s_cbranch_vccz .LBB147_2
22760 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
22761 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
22762 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
22763 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
22764 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22765 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22766 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22767 ; GCN3-NEXT: buffer_wbinvl1_vol
22768 ; GCN3-NEXT: s_cbranch_execz .LBB147_3
22769 ; GCN3-NEXT: s_branch .LBB147_4
22770 ; GCN3-NEXT: .LBB147_2:
22771 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
22772 ; GCN3-NEXT: .LBB147_3: ; %atomicrmw.private
22773 ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
22774 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1
22775 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
22776 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22777 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
22778 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
22779 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22780 ; GCN3-NEXT: s_waitcnt vmcnt(1)
22781 ; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0
22782 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22783 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22784 ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22785 ; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37]
22786 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
22787 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
22788 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
22789 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
22790 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
22791 ; GCN3-NEXT: .LBB147_4: ; %atomicrmw.end
22792 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22793 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: the atomic is applied directly to %ptr (no offset).
22794 %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
; Test: 64-bit `atomicrmw udec_wrap` with seq_cst ordering on a flat pointer
; at a constant offset from %out. Both arguments are passed `inreg` under the
; amdgpu_gfx calling convention, and the function is declared to return i64.
;
; Shape of the expected code for every check prefix below:
;   * The offset address is formed in s[34:35] as s[4:5] + 32
;     (s_add_u32 / s_addc_u32).
;   * Its high dword (s35) is compared against a 32-bit value that the GCN1
;     and GCN2 runs load with s_load_dword from 0xe4 and the GCN3 run reads
;     from src_private_base.
;   * When they differ, the %atomicrmw.global block issues
;     flat_atomic_dec_x2 with glc.
;   * When they are equal, the %atomicrmw.private block does the update with
;     two buffer loads and two buffer stores:
;       new = (old == 0 || old > in) ? in : old - 1
;     using address s34, or -1 when the 64-bit address in s[34:35] is zero.
22798 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
22799 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
22801 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22802 ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
22803 ; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
22804 ; GCN1-NEXT: s_add_u32 s34, s4, 32
22805 ; GCN1-NEXT: s_addc_u32 s35, s5, 0
22806 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22807 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36
22808 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
22809 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22810 ; GCN1-NEXT: s_cbranch_vccz .LBB148_2
22811 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
22812 ; GCN1-NEXT: v_mov_b32_e32 v0, s34
22813 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
22814 ; GCN1-NEXT: v_mov_b32_e32 v1, s35
22815 ; GCN1-NEXT: v_mov_b32_e32 v3, s7
22816 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22817 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22818 ; GCN1-NEXT: buffer_wbinvl1_vol
22819 ; GCN1-NEXT: s_cbranch_execz .LBB148_3
22820 ; GCN1-NEXT: s_branch .LBB148_4
22821 ; GCN1-NEXT: .LBB148_2:
22822 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
22823 ; GCN1-NEXT: .LBB148_3: ; %atomicrmw.private
22824 ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
22825 ; GCN1-NEXT: v_mov_b32_e32 v5, s6
22826 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
22827 ; GCN1-NEXT: s_cselect_b32 s34, s34, -1
22828 ; GCN1-NEXT: v_mov_b32_e32 v2, s34
22829 ; GCN1-NEXT: s_add_i32 s34, s34, 4
22830 ; GCN1-NEXT: v_mov_b32_e32 v3, s34
22831 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22832 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22833 ; GCN1-NEXT: v_mov_b32_e32 v4, s7
22834 ; GCN1-NEXT: s_waitcnt vmcnt(1)
22835 ; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0
22836 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22837 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22838 ; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22839 ; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37]
22840 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
22841 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
22842 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
22843 ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
22844 ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
22845 ; GCN1-NEXT: .LBB148_4: ; %atomicrmw.end
22846 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22847 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22849 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
22851 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22852 ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
22853 ; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
22854 ; GCN2-NEXT: s_add_u32 s34, s4, 32
22855 ; GCN2-NEXT: s_addc_u32 s35, s5, 0
22856 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
22857 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36
22858 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
22859 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22860 ; GCN2-NEXT: s_cbranch_vccz .LBB148_2
22861 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
22862 ; GCN2-NEXT: v_mov_b32_e32 v0, s34
22863 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
22864 ; GCN2-NEXT: v_mov_b32_e32 v1, s35
22865 ; GCN2-NEXT: v_mov_b32_e32 v3, s7
22866 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22867 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22868 ; GCN2-NEXT: buffer_wbinvl1_vol
22869 ; GCN2-NEXT: s_cbranch_execz .LBB148_3
22870 ; GCN2-NEXT: s_branch .LBB148_4
22871 ; GCN2-NEXT: .LBB148_2:
22872 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
22873 ; GCN2-NEXT: .LBB148_3: ; %atomicrmw.private
22874 ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
22875 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1
22876 ; GCN2-NEXT: v_mov_b32_e32 v2, s34
22877 ; GCN2-NEXT: s_add_i32 s34, s34, 4
22878 ; GCN2-NEXT: v_mov_b32_e32 v3, s34
22879 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22880 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen
22881 ; GCN2-NEXT: v_mov_b32_e32 v5, s6
22882 ; GCN2-NEXT: v_mov_b32_e32 v4, s7
22883 ; GCN2-NEXT: s_waitcnt vmcnt(1)
22884 ; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0
22885 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22886 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22887 ; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22888 ; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37]
22889 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
22890 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
22891 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
22892 ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
22893 ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
22894 ; GCN2-NEXT: .LBB148_4: ; %atomicrmw.end
22895 ; GCN2-NEXT: s_waitcnt vmcnt(0)
22896 ; GCN2-NEXT: s_setpc_b64 s[30:31]
22898 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
22900 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22901 ; GCN3-NEXT: s_add_u32 s34, s4, 32
22902 ; GCN3-NEXT: s_addc_u32 s35, s5, 0
22903 ; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
22904 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37
22905 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
22906 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
22907 ; GCN3-NEXT: s_cbranch_vccz .LBB148_2
22908 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
22909 ; GCN3-NEXT: v_mov_b32_e32 v0, s34
22910 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
22911 ; GCN3-NEXT: v_mov_b32_e32 v1, s35
22912 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22913 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
22914 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22915 ; GCN3-NEXT: buffer_wbinvl1_vol
22916 ; GCN3-NEXT: s_cbranch_execz .LBB148_3
22917 ; GCN3-NEXT: s_branch .LBB148_4
22918 ; GCN3-NEXT: .LBB148_2:
22919 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
22920 ; GCN3-NEXT: .LBB148_3: ; %atomicrmw.private
22921 ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
22922 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1
22923 ; GCN3-NEXT: v_mov_b32_e32 v2, s34
22924 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
22925 ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
22926 ; GCN3-NEXT: v_mov_b32_e32 v4, s6
22927 ; GCN3-NEXT: v_mov_b32_e32 v3, s7
22928 ; GCN3-NEXT: s_waitcnt vmcnt(1)
22929 ; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0
22930 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22931 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22932 ; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1]
22933 ; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37]
22934 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
22935 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
22936 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
22937 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
22938 ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
22939 ; GCN3-NEXT: .LBB148_4: ; %atomicrmw.end
22940 ; GCN3-NEXT: s_waitcnt vmcnt(0)
22941 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: %gep indexes element 4 of an i64 array, i.e. byte offset 32,
; which is the constant added to the pointer in the checks above.
22942 %gep = getelementptr i64, ptr %out, i64 4
22943 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
; Test: 64-bit `atomicrmw udec_wrap` with seq_cst ordering on a flat pointer
; at a constant offset from %out, with `!amdgpu.no.remote.memory !0` attached
; to the atomicrmw. The function uses the default calling convention, takes
; its arguments without `inreg`, and returns void.
;
; Shape of the expected code for every check prefix below:
;   * The offset address is formed in v[0:1] as v[0:1] + 32.
;   * Its high dword (v1) is compared per lane against a 32-bit value that the
;     GCN1 and GCN2 runs load with s_load_dword from 0xe4 and the GCN3 run
;     reads from src_private_base. The two paths are selected with exec
;     masking (s_and_saveexec_b64 / s_andn2_saveexec_b64).
;   * Lanes where they differ run %atomicrmw.global, which issues
;     flat_atomic_dec_x2 without glc.
;   * The remaining lanes run %atomicrmw.private, which does the update with
;     two buffer loads and two buffer stores:
;       new = (old == 0 || old > in) ? in : old - 1
;     using address v0, or -1 when the 64-bit address in v[0:1] is zero.
22947 define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
22948 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
22950 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22951 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
22952 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
22953 ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0
22954 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
22955 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
22956 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
22957 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
22958 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
22959 ; GCN1-NEXT: s_cbranch_execnz .LBB149_3
22960 ; GCN1-NEXT: ; %bb.1: ; %Flow
22961 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22962 ; GCN1-NEXT: s_cbranch_execnz .LBB149_4
22963 ; GCN1-NEXT: .LBB149_2: ; %atomicrmw.phi
22964 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22965 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22966 ; GCN1-NEXT: .LBB149_3: ; %atomicrmw.global
22967 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
22968 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
22969 ; GCN1-NEXT: buffer_wbinvl1_vol
22970 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
22971 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
22972 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
22973 ; GCN1-NEXT: s_cbranch_execz .LBB149_2
22974 ; GCN1-NEXT: .LBB149_4: ; %atomicrmw.private
22975 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
22976 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
22977 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
22978 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
22979 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
22980 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22981 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
22982 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
22983 ; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0
22984 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
22985 ; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
22986 ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
22987 ; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
22988 ; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
22989 ; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
22990 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
22991 ; GCN1-NEXT: s_waitcnt vmcnt(0)
22992 ; GCN1-NEXT: s_setpc_b64 s[30:31]
22994 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
22996 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22997 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
22998 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
22999 ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0
23000 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
23001 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
23002 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
23003 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
23004 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
23005 ; GCN2-NEXT: s_cbranch_execnz .LBB149_3
23006 ; GCN2-NEXT: ; %bb.1: ; %Flow
23007 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23008 ; GCN2-NEXT: s_cbranch_execnz .LBB149_4
23009 ; GCN2-NEXT: .LBB149_2: ; %atomicrmw.phi
23010 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
23011 ; GCN2-NEXT: s_setpc_b64 s[30:31]
23012 ; GCN2-NEXT: .LBB149_3: ; %atomicrmw.global
23013 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
23014 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23015 ; GCN2-NEXT: buffer_wbinvl1_vol
23016 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
23017 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
23018 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23019 ; GCN2-NEXT: s_cbranch_execz .LBB149_2
23020 ; GCN2-NEXT: .LBB149_4: ; %atomicrmw.private
23021 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
23022 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
23023 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
23024 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
23025 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
23026 ; GCN2-NEXT: s_waitcnt vmcnt(0)
23027 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
23028 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
23029 ; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0
23030 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
23031 ; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7]
23032 ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
23033 ; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
23034 ; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
23035 ; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
23036 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
23037 ; GCN2-NEXT: s_waitcnt vmcnt(0)
23038 ; GCN2-NEXT: s_setpc_b64 s[30:31]
23040 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
23042 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23043 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
23044 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
23045 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
23046 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
23047 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
23048 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
23049 ; GCN3-NEXT: s_cbranch_execnz .LBB149_3
23050 ; GCN3-NEXT: ; %bb.1: ; %Flow
23051 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23052 ; GCN3-NEXT: s_cbranch_execnz .LBB149_4
23053 ; GCN3-NEXT: .LBB149_2: ; %atomicrmw.phi
23054 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
23055 ; GCN3-NEXT: s_setpc_b64 s[30:31]
23056 ; GCN3-NEXT: .LBB149_3: ; %atomicrmw.global
23057 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
23058 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23059 ; GCN3-NEXT: buffer_wbinvl1_vol
23060 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
23061 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
23062 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23063 ; GCN3-NEXT: s_cbranch_execz .LBB149_2
23064 ; GCN3-NEXT: .LBB149_4: ; %atomicrmw.private
23065 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
23066 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
23067 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
23068 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
23069 ; GCN3-NEXT: s_waitcnt vmcnt(0)
23070 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
23071 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
23072 ; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0
23073 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
23074 ; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7]
23075 ; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
23076 ; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
23077 ; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen
23078 ; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4
23079 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
23080 ; GCN3-NEXT: s_waitcnt vmcnt(0)
23081 ; GCN3-NEXT: s_setpc_b64 s[30:31]
; IR under test: %gep indexes element 4 of an i64 array, i.e. byte offset 32,
; which is the constant added to the pointer in the checks above. The result
; is bound to %tmp0; the metadata node !0 is defined outside this function.
23082 %gep = getelementptr i64, ptr %out, i64 4
23083 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
23087 define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
23088 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
23090 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23091 ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
23092 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
23093 ; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0
23094 ; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
23095 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
23096 ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
23097 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
23098 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
23099 ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
23100 ; GCN1-NEXT: s_cbranch_execnz .LBB150_3
23101 ; GCN1-NEXT: ; %bb.1: ; %Flow
23102 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23103 ; GCN1-NEXT: s_cbranch_execnz .LBB150_4
23104 ; GCN1-NEXT: .LBB150_2: ; %atomicrmw.phi
23105 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
23106 ; GCN1-NEXT: s_setpc_b64 s[30:31]
23107 ; GCN1-NEXT: .LBB150_3: ; %atomicrmw.global
23108 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
23109 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23110 ; GCN1-NEXT: buffer_wbinvl1_vol
23111 ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
23112 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
23113 ; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23114 ; GCN1-NEXT: s_cbranch_execz .LBB150_2
23115 ; GCN1-NEXT: .LBB150_4: ; %atomicrmw.private
23116 ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
23117 ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
23118 ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
23119 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
23120 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
23121 ; GCN1-NEXT: s_waitcnt vmcnt(1)
23122 ; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0
23123 ; GCN1-NEXT: s_waitcnt vmcnt(0)
23124 ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
23125 ; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
23126 ; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
23127 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
23128 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
23129 ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
23130 ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
23131 ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
23132 ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
23133 ; GCN1-NEXT: s_waitcnt vmcnt(0)
23134 ; GCN1-NEXT: s_setpc_b64 s[30:31]
23136 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
23138 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23139 ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
23140 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
23141 ; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0
23142 ; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
23143 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
23144 ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
23145 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
23146 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
23147 ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
23148 ; GCN2-NEXT: s_cbranch_execnz .LBB150_3
23149 ; GCN2-NEXT: ; %bb.1: ; %Flow
23150 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23151 ; GCN2-NEXT: s_cbranch_execnz .LBB150_4
23152 ; GCN2-NEXT: .LBB150_2: ; %atomicrmw.phi
23153 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
23154 ; GCN2-NEXT: s_setpc_b64 s[30:31]
23155 ; GCN2-NEXT: .LBB150_3: ; %atomicrmw.global
23156 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
23157 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23158 ; GCN2-NEXT: buffer_wbinvl1_vol
23159 ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
23160 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
23161 ; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23162 ; GCN2-NEXT: s_cbranch_execz .LBB150_2
23163 ; GCN2-NEXT: .LBB150_4: ; %atomicrmw.private
23164 ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
23165 ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
23166 ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
23167 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
23168 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
23169 ; GCN2-NEXT: s_waitcnt vmcnt(1)
23170 ; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0
23171 ; GCN2-NEXT: s_waitcnt vmcnt(0)
23172 ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
23173 ; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
23174 ; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
23175 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
23176 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
23177 ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
23178 ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
23179 ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
23180 ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
23181 ; GCN2-NEXT: s_waitcnt vmcnt(0)
23182 ; GCN2-NEXT: s_setpc_b64 s[30:31]
23184 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
23186 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23187 ; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 32, v0
23188 ; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
23189 ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
23190 ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
23191 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
23192 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
23193 ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
23194 ; GCN3-NEXT: s_cbranch_execnz .LBB150_3
23195 ; GCN3-NEXT: ; %bb.1: ; %Flow
23196 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23197 ; GCN3-NEXT: s_cbranch_execnz .LBB150_4
23198 ; GCN3-NEXT: .LBB150_2: ; %atomicrmw.phi
23199 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
23200 ; GCN3-NEXT: s_setpc_b64 s[30:31]
23201 ; GCN3-NEXT: .LBB150_3: ; %atomicrmw.global
23202 ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
23203 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23204 ; GCN3-NEXT: buffer_wbinvl1_vol
23205 ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
23206 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
23207 ; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
23208 ; GCN3-NEXT: s_cbranch_execz .LBB150_2
23209 ; GCN3-NEXT: .LBB150_4: ; %atomicrmw.private
23210 ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
23211 ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
23212 ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
23213 ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
23214 ; GCN3-NEXT: s_waitcnt vmcnt(1)
23215 ; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0
23216 ; GCN3-NEXT: s_waitcnt vmcnt(0)
23217 ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
23218 ; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
23219 ; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7]
23220 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
23221 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
23222 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
23223 ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
23224 ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
23225 ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
23226 ; GCN3-NEXT: s_waitcnt vmcnt(0)
23227 ; GCN3-NEXT: s_setpc_b64 s[30:31]
23228 %gep = getelementptr i64, ptr %out, i64 4
23229 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0